From e20ca543f037bd3a8e38507b870ed3a3de3c32e7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 1 Jun 2024 13:43:37 +0100 Subject: [PATCH] [ORF] Re-factor and update `ORFFM4StoryIE` * fix getting media via DASH instead of inaccessible mp4 * also get in-page YT media --- youtube_dl/extractor/orf.py | 253 ++++++++++++++++++------------------ 1 file changed, 126 insertions(+), 127 deletions(-) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 25c16c84d..f03aa40dc 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,6 +6,7 @@ import functools import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( clean_html, determine_ext, @@ -14,10 +15,8 @@ from ..utils import ( int_or_none, merge_dicts, mimetype2ext, - orderedSet, parse_age_limit, parse_iso8601, - remove_end, strip_jsonp, txt_or_none, unified_strdate, @@ -305,11 +304,90 @@ class ORFPodcastIE(ORFRadioBase): }, self._extract_podcast_upload(data), rev=True) -class ORFIPTVIE(InfoExtractor): +class ORFIPTVBase(InfoExtractor): + _TITLE_STRIP_RE = '' + + def _extract_video(self, video_id, webpage, fatal=False): + + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + video = traverse_obj(data, ( + 'sources', ('default', 'q8c'), + T(lambda x: x if x['loadBalancerUrl'] else None), + any)) + + load_balancer_url = video['loadBalancerUrl'] + + try: + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + except ExtractorError: + rendition = None + + if not rendition: + rendition = { + 'redirect': { + 'smil': re.sub( + r'(/)jsonp(/.+\.)mp4$', r'\1dash\2smil/manifest.mpd', + load_balancer_url), + }, + } + + f = traverse_obj(video, { + 'abr': ('audioBitrate', T(int_or_none)), + 'vbr': ('bitrate', T(int_or_none)), + 'fps': ('videoFps', T(int_or_none)), + 'width': ('videoWidth', T(int_or_none)), + 'height': 
('videoHeight', T(int_or_none)), + }) + + formats = [] + for format_id, format_url in traverse_obj(rendition, ( + 'redirect', T(dict.items), Ellipsis)): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id, + entry_protocol='m3u8_native')) + elif determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id=format_id)) + + if formats or fatal: + self._sort_formats(formats) + else: + return + + return merge_dicts({ + 'id': video_id, + 'title': re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage)), + 'description': self._og_search_description(webpage), + 'upload_date': unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date', fatal=False)), + 'formats': formats, + }, traverse_obj(data, { + 'duration': ('duration', T(k_float_or_none)), + 'thumbnail': ('sources', 'default', 'preview', T(url_or_none)), + }), rev=True) + + +class ORFIPTVIE(ORFIPTVBase): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' _WORKING = False # URLs redirect to orf.at/ _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + _TITLE_STRIP_RE = r'\s+-\s+iptv\.ORF\.at\S*$' _TEST = { 'url': 'http://iptv.orf.at/stories/2275236/', @@ -334,74 +412,32 @@ class ORFIPTVIE(InfoExtractor): video_id = self._search_regex( r'data-video(?:id)?="(\d+)"', webpage, 'video id') - data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, - video_id)[0] - - duration = float_or_none(data['duration'], 1000) - - video = data['sources']['default'] - load_balancer_url = video['loadBalancerUrl'] - abr = int_or_none(video.get('audioBitrate')) - vbr 
= int_or_none(video.get('bitrate')) - fps = int_or_none(video.get('videoFps')) - width = int_or_none(video.get('videoWidth')) - height = int_or_none(video.get('videoHeight')) - thumbnail = video.get('preview') - - rendition = self._download_json( - load_balancer_url, video_id, transform_source=strip_jsonp) - - f = { - 'abr': abr, - 'vbr': vbr, - 'fps': fps, - 'width': width, - 'height': height, - } - - formats = [] - for format_id, format_url in rendition['redirect'].items(): - if format_id == 'rtmp': - ff = f.copy() - ff.update({ - 'url': format_url, - 'format_id': format_id, - }) - formats.append(ff) - elif determine_ext(format_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id)) - elif determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id)) - else: - continue - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') - description = self._og_search_description(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dc.date', webpage, 'upload date')) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - } + return self._extract_video(video_id, webpage) -class ORFFM4StoryIE(InfoExtractor): +class ORFFM4StoryIE(ORFIPTVBase): IE_NAME = 'orf:fm4:story' IE_DESC = 'fm4.orf.at stories' _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + _TITLE_STRIP_RE = r'\s+-\s+fm4\.ORF\.at\s*$' - _TEST = { + _TESTS = [{ + 'url': 'https://fm4.orf.at/stories/3041554/', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '3041554', + 'title': 'Is The EU Green Deal In Mortal Danger?', + }, + 'playlist_count': 4, + 'params': { + 'format': 'bestvideo', + }, + }, { 'url': 'http://fm4.orf.at/stories/2865738/', + 'info_dict': { + 'id': '2865738', + 'title': 'Manu Delago und Inner 
Tongue live', + }, 'playlist': [{ 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', 'info_dict': { @@ -418,86 +454,49 @@ class ORFFM4StoryIE(InfoExtractor): 'info_dict': { 'id': '547798', 'ext': 'flv', - 'title': 'Manu Delago und Inner Tongue live (2)', + 'title': 'Manu Delago und Inner Tongue live (2)', 'duration': 1504.08, 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20170913', 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', }, }], - } + 'skip': 'Videos gone', + }] def _real_extract(self, url): story_id = self._match_id(url) webpage = self._download_webpage(url, story_id) entries = [] - all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) - for idx, video_id in enumerate(all_ids): - data = self._download_json( - 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, - video_id)[0] + seen_ids = set() + for idx, video_id in enumerate(re.findall(r'data-video(?:id)?="(\d+)"', webpage)): + if video_id in seen_ids: + continue + seen_ids.add(video_id) + entry = self._extract_video(video_id, webpage, fatal=False) + if not entry: + continue - duration = float_or_none(data['duration'], 1000) - - video = data['sources']['q8c'] - load_balancer_url = video['loadBalancerUrl'] - abr = int_or_none(video.get('audioBitrate')) - vbr = int_or_none(video.get('bitrate')) - fps = int_or_none(video.get('videoFps')) - width = int_or_none(video.get('videoWidth')) - height = int_or_none(video.get('videoHeight')) - thumbnail = video.get('preview') - - rendition = self._download_json( - load_balancer_url, video_id, transform_source=strip_jsonp) - - f = { - 'abr': abr, - 'vbr': vbr, - 'fps': fps, - 'width': width, - 'height': height, - } - - formats = [] - for format_id, format_url in 
rendition['redirect'].items(): - if format_id == 'rtmp': - ff = f.copy() - ff.update({ - 'url': format_url, - 'format_id': format_id, - }) - formats.append(ff) - elif determine_ext(format_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id)) - elif determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id=format_id)) - else: - continue - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') if idx >= 1: # Titles are duplicates, make them unique - title += ' (' + str(idx + 1) + ')' - description = self._og_search_description(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dc.date', webpage, 'upload date')) + entry['title'] = '%s (%d)' % (entry['title'], idx) - entries.append({ - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - }) + entries.append(entry) - return self.playlist_result(entries) + seen_ids = set() + for yt_id in re.findall( + r'data-id\s*=\s*["\']([\w-]+)[^>]+\bclass\s*=\s*["\']youtube\b', + webpage): + if yt_id in seen_ids: + continue + seen_ids.add(yt_id) + if YoutubeIE.suitable(yt_id): + entries.append(self.url_result(yt_id, ie='Youtube', video_id=yt_id)) + + return self.playlist_result( + entries, story_id, + re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage, default='') or None)) class ORFONBase(InfoExtractor):