From b2ba24bb026904f3503db71f65d2b1627f08edf1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 19 Jul 2023 14:14:50 +0100 Subject: [PATCH] [InfoExtractor] Add `_match_valid_url()` class method and refactor * API compatible with yt-dlp * also support Sequence of patterns in _VALID_URL * one place to compile _VALID_URL * TODO: remove existing extractor shims --- devscripts/make_lazy_extractors.py | 14 ++++++-- youtube_dl/extractor/common.py | 51 +++++++++++++++++++++------- youtube_dl/extractor/globalplayer.py | 6 ---- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index a8b6ff1b9..1a841a08b 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -4,6 +4,7 @@ from inspect import getsource import io import os from os.path import dirname as dirn +import re import sys print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) @@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor with open('devscripts/lazy_load_template.py', 'rt') as f: module_template = f.read() + +def get_source(m): + return re.sub(r'(?m)^\s*#.*\n', '', getsource(m)) + + module_contents = [ - module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', + module_template, + get_source(InfoExtractor.suitable), + get_source(InfoExtractor._match_valid_url) + '\n', 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', # needed for suitable() methods of Youtube extractor (see #28780) - 'from youtube_dl.utils import parse_qs\n', + 'from youtube_dl.utils import parse_qs, variadic\n', ] ie_template = ''' @@ -66,7 +74,7 @@ def build_lazy_ie(ie, name): valid_url=valid_url, module=ie.__module__) if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: - s += '\n' + getsource(ie.suitable) + s += '\n' + get_source(ie.suitable) if hasattr(ie, '_make_valid_url'): # search extractors s += make_valid_template.format(valid_url=ie._make_valid_url()) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 549781186..7f416d312 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -83,6 +83,7 @@ from ..utils import ( urljoin, url_basename, url_or_none, + variadic, xpath_element, xpath_text, xpath_with_ns, @@ -371,9 +372,22 @@ class InfoExtractor(object): title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. + A subclass of InfoExtractor must be defined to handle each specific site (or + several sites). Such a concrete subclass should be added to the list of + extractors. It should also: + * define its _VALID_URL attribute as a regexp, or a Sequence of alternative + regexps (but see below) + * re-define the _real_extract() method + * optionally re-define the _real_initialize() method. + + An extractor subclass may also override suitable() if necessary, but the + function signature must be preserved and the function must import everything + it needs (except other extractors), so that lazy_extractors works correctly. + If the subclass's suitable() and _real_extract() functions avoid using + _VALID_URL, the subclass need not set that class attribute. + + An abstract subclass of InfoExtractor may be used to simplify implementation + within an extractor module; it should not be added to the list of extractors. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. @@ -408,22 +422,33 @@ class InfoExtractor(object): self._x_forwarded_for_ip = None self.set_downloader(downloader) + @classmethod + def __match_valid_url(cls, url): + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for cls, whereas getattr would also + # match its superclass + if '_VALID_URL_RE' not in cls.__dict__: + # _VALID_URL can now be a list/tuple of patterns + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7 + for p in cls._VALID_URL_RE: + p = p.match(url) + if p: + return p + + # The public alias can safely be overridden, as in some back-ports + _match_valid_url = __match_valid_url + @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - - # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) is not None + # This function must import everything it needs (except other extractors), + # so that lazy_extractors works correctly + return cls.__match_valid_url(url) is not None @classmethod def _match_id(cls, url): - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - m = cls._VALID_URL_RE.match(url) + m = cls.__match_valid_url(url) assert m return compat_str(m.group('id')) diff --git a/youtube_dl/extractor/globalplayer.py b/youtube_dl/extractor/globalplayer.py index db490b141..ae75dcabf 100644 --- a/youtube_dl/extractor/globalplayer.py +++ b/youtube_dl/extractor/globalplayer.py @@ -18,12 +18,6 @@ from ..utils import ( class GlobalPlayerBaseIE(InfoExtractor): - import re - - @classmethod - def _match_valid_url(cls, url): - return cls.re.match(cls._VALID_URL, url) - def _get_page_props(self, url, video_id): webpage = self._download_webpage(url, video_id) return self._search_nextjs_data(webpage, video_id)['props']['pageProps']