[core] Handle `/../` sequences in HTTP URLs

* use Python's RFC implementation for embedded sequences
* hack: strip unbalanced leading `../` from path, like eg Firefox

See https://github.com/yt-dlp/yt-dlp/issues/3355
pull/29322/head
dirkf 2023-03-13 19:45:54 +00:00
parent baa6c5e95c
commit e8de54bce5
1 changed files with 24 additions and 0 deletions

View File

@ -39,6 +39,7 @@ from .compat import (
compat_str,
compat_tokenize_tokenize,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urllib_request_DataHandler,
)
@ -60,6 +61,7 @@ from .utils import (
format_bytes,
formatSeconds,
GeoRestrictedError,
HEADRequest,
int_or_none,
ISO3166Utils,
locked_file,
@ -74,6 +76,7 @@ from .utils import (
preferredencoding,
prepend_extension,
process_communicate_or_kill,
PUTRequest,
register_socks_protocols,
render_table,
replace_extension,
@ -2297,6 +2300,27 @@ class YoutubeDL(object):
""" Start an HTTP download """
if isinstance(req, compat_basestring):
req = sanitized_Request(req)
# an embedded /../ sequence is not automatically handled by urllib2
# see https://github.com/yt-dlp/yt-dlp/issues/3355
url = req.get_full_url()
parts = url.partition('/../')
if parts[1]:
url = compat_urllib_parse.urljoin(parts[0] + parts[1][:1], parts[1][1:] + parts[2])
if url:
# worse, URL path may have initial /../ against RFCs: work-around
# by stripping such prefixes, like eg Firefox
parts = compat_urllib_parse.urlsplit(url)
path = parts.path
while path.startswith('/../'):
path = path[3:]
url = parts._replace(path=path).geturl()
# get a new Request with the munged URL
if url != req.get_full_url():
req_type = {'HEAD': HEADRequest, 'PUT': PUTRequest}.get(
req.get_method(), compat_urllib_request.Request)
req = req_type(
url, data=req.data, headers=dict(req.header_items()),
origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):