Back-port from yt-dlp PR #2149

Include MediasetShowIE, DRM work-around, etc; add MediasetClipIE
This commit is contained in:
dirkf 2022-01-01 19:37:30 +00:00
parent 3efdb2758d
commit b663bba81d
2 changed files with 258 additions and 45 deletions

View File

@ -644,7 +644,11 @@ from .massengeschmacktv import MassengeschmackTVIE
from .matchtv import MatchTVIE from .matchtv import MatchTVIE
from .mdr import MDRIE from .mdr import MDRIE
from .medaltv import MedalTVIE from .medaltv import MedalTVIE
from .mediaset import MediasetIE from .mediaset import (
MediasetIE,
MediasetClipIE,
MediasetShowIE,
)
from .mediasite import ( from .mediasite import (
MediasiteIE, MediasiteIE,
MediasiteCatalogIE, MediasiteCatalogIE,

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import functools
import re import re
from .theplatform import ThePlatformBaseIE from .theplatform import ThePlatformBaseIE
@ -10,7 +11,11 @@ from ..compat import (
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
GeoRestrictedError,
int_or_none, int_or_none,
OnDemandPagedList,
try_get,
urljoin,
update_url_query, update_url_query,
) )
@ -30,37 +35,110 @@ class MediasetIE(ThePlatformBaseIE):
''' '''
_TESTS = [{ _TESTS = [{
# full episode # full episode
'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824', 'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'md5': 'a7e75c6384871f322adb781d3bd72c26',
'info_dict': { 'info_dict': {
'id': 'FAFU000000661824', 'id': 'F310575103000102',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Quarta puntata', 'title': 'Episodio 1',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'description': 'md5:e8017b7d7194e9bfb75299c2b8d81e02',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1414.26, 'duration': 2682.0,
'upload_date': '20161107', 'upload_date': '20210530',
'series': 'Hello Goodbye', 'series': 'Mr Wrong - Lezioni d\'amore',
'timestamp': 1478532900, 'timestamp': 1622413946,
'uploader': 'Rete 4', 'uploader': 'Canale 5',
'uploader_id': 'R4', 'uploader_id': 'C5',
'season': 'Season 1',
'episode': 'Episode 1',
'season_number': 1,
'episode_number': 1,
'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}],
}, },
'skip': 'Geo restricted',
}, { }, {
'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501', 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
'md5': '288532f0ad18307705b01e581304cd7b', 'md5': '1276f966ac423d16ba255ce867de073e',
'info_dict': { 'info_dict': {
'id': 'F309013801000501', 'id': 'F309013801000501',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Puntata del 25 maggio', 'title': 'Puntata del 25 maggio',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'description': 'md5:ee2e456e3eb1dba5e814596655bb5296',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 6565.007, 'duration': 6565.008,
'upload_date': '20180526', 'upload_date': '20200903',
'series': 'Matrix', 'series': 'Matrix',
'timestamp': 1527326245, 'timestamp': 1599172492,
'uploader': 'Canale 5', 'uploader': 'Canale 5',
'uploader_id': 'C5', 'uploader_id': 'C5',
'season': 'Season 5',
'episode': 'Episode 5',
'season_number': 5,
'episode_number': 5,
'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
}, },
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
'md5': 'd1650ac9ff944f185556126a736df148',
'info_dict': {
'id': 'F303843101017801',
'ext': 'mp4',
'title': 'Episodio 69 - Pezzo di luna',
'description': 'md5:7c32c8ec4118b72588b9412f11353f73',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 263.008,
'upload_date': '20200902',
'series': 'Camera Café 5',
'timestamp': 1599064700,
'uploader': 'Italia 1',
'uploader_id': 'I1',
'season': 'Season 5',
'episode': 'Episode 178',
'season_number': 5,
'episode_number': 178,
'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}],
},
'skip': 'Geo restricted',
}, {
'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
'md5': '567e9ad375b7a27a0e370650f572a1e3',
'info_dict': {
'id': 'F303843107000601',
'ext': 'mp4',
'title': 'Episodio 51 - Tu chi sei?',
'description': 'md5:42ef006e56824cc31787a547590923f4',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 367.021,
'upload_date': '20200902',
'series': 'Camera Café 5',
'timestamp': 1599069817,
'uploader': 'Italia 1',
'uploader_id': 'I1',
'season': 'Season 5',
'episode': 'Episode 6',
'season_number': 5,
'episode_number': 6,
'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}],
},
'skip': 'Geo restricted',
}, {
# movie
'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
'md5': '720440187a2ae26af8148eb9e6b901ed',
'info_dict': {
'id': 'F006474501000101',
'ext': 'mp4',
'title': 'Selvaggi',
'description': 'md5:cfdedbbfdd12d4d0e5dcf1fa1b75284f',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 5233.01,
'upload_date': '20210729',
'timestamp': 1627594716,
'uploader': 'Cine34',
'uploader_id': 'B6',
'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}],
},
'skip': 'Geo restricted',
}, { }, {
# clip # clip
'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680', 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
@ -127,6 +205,22 @@ class MediasetIE(ThePlatformBaseIE):
video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src']) video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url) return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
def _check_drm_formats(self, tp_formats, video_id):
has_nondrm, drm_manifest = False, ''
for f in tp_formats:
if '_sampleaes/' in (f.get('manifest_url') or ''):
drm_manifest = drm_manifest or f['manifest_url']
f['has_drm'] = True
if not has_nondrm and not f.get('has_drm') and f.get('manifest_url'):
has_nondrm = True
nodrm_manifest = re.sub(r'_sampleaes/(\w+)_fp_', r'/\1_no_', drm_manifest)
if has_nondrm or nodrm_manifest == drm_manifest:
return
tp_formats.extend(self._extract_m3u8_formats(
nodrm_manifest, video_id, m3u8_id='hls', fatal=False) or [])
def _real_extract(self, url): def _real_extract(self, url):
guid = self._match_id(url) guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid tp_path = 'PR1GhC/media/guid/2702976343/' + guid
@ -134,49 +228,164 @@ class MediasetIE(ThePlatformBaseIE):
formats = [] formats = []
subtitles = {} subtitles = {}
first_e = None first_e = geo_e = None
for asset_type in ('SD', 'HD'): asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
# TODO: fixup ISM+none manifest URLs # TODO: fixup ISM+none manifest URLs
for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'): for f in ('MPEG4', 'M3U'):
try: try:
tp_formats, tp_subtitles = self._extract_theplatform_smil( tp_formats, tp_subtitles = self._extract_theplatform_smil(
update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), { update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
'mbr': 'true', 'mbr': 'true',
'formats': f, 'formats': f,
'assetTypes': asset_type, 'assetTypes': asset_type,
}), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type)) }), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
except ExtractorError as e: except ExtractorError as e:
if not first_e: if not first_e:
first_e = e first_e = e
break if not geo_e and isinstance(e, GeoRestrictedError):
for tp_f in tp_formats: geo_e = e
tp_f['quality'] = 1 if asset_type == 'HD' else 0 continue
formats.extend(tp_formats) self._check_drm_formats(tp_formats, guid)
subtitles = self._merge_subtitles(subtitles, tp_subtitles) formats.extend(tp_formats)
if first_e and not formats: subtitles = self._merge_subtitles(subtitles, tp_subtitles)
# check for errors and report them
if (first_e or geo_e) and not formats:
if geo_e:
raise geo_e
if 'None of the available releases match' in first_e.message:
raise ExtractorError('No non-DRM formats available', cause=first_e)
raise first_e raise first_e
self._sort_formats(formats) self._sort_formats(formats)
fields = []
for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
fields.extend(templ % repl for repl in repls)
feed_data = self._download_json( feed_data = self._download_json(
'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid, 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid,
guid, fatal=False, query={'fields': ','.join(fields)}) guid, fatal=False)
if feed_data: if feed_data:
publish_info = feed_data.get('mediasetprogram$publishInfo') or {} publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
thumbnails = feed_data.get('thumbnails') or {}
thumbnail = None
for key, value in thumbnails.items():
if key.startswith('image_keyframe_poster-'):
thumbnail = value.get('url')
break
info.update({ info.update({
'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')), 'description': info.get('description') or feed_data.get('description') or feed_data.get('longDescription'),
'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
'series': feed_data.get('mediasetprogram$brandTitle'),
'uploader': publish_info.get('description'), 'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'), 'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')), 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
'thumbnail': thumbnail,
}) })
if feed_data.get('programType') == 'episode':
info.update({
'episode_number': int_or_none(
feed_data.get('tvSeasonEpisodeNumber')),
'season_number': int_or_none(
feed_data.get('tvSeasonNumber')),
'series': feed_data.get('mediasetprogram$brandTitle'),
})
info.update({ info.update({
'id': guid, 'id': guid,
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
}) })
return info return info
class MediasetClipIE(MediasetIE):
_VALID_URL = r'https?://(?:www\.)?\w+\.mediaset\.it/video/(?:[^/]+/)*[\w-]+_(?P<id>\d+)\.s?html?'
_TESTS = [{
'url': 'https://www.grandefratello.mediaset.it/video/ventinovesima-puntata_27071.shtml',
'info_dict': {
'id': 'F310293901002901',
'ext': 'mp4',
},
'skip': 'Geo restricted, DRM content',
}]
def _real_extract(self, url):
clip_id = self._match_id(url)
webpage = self._download_webpage(url, clip_id)
guid = self._search_regex(
(r'''var\s*_onplay_guid\s*=\s*(?P<q>'|"|\b)(?P<guid>[\dA-Z]{16,})(?P=q)\s*;''',
r'\bGUID\s+(?P<guid>[\dA-Z]{16,})\b', ),
webpage, 'clip GUID', group='guid')
return self.url_result('mediaset:%s' % guid, ie='Mediaset', video_id=clip_id)
class MediasetShowIE(MediasetIE):
_VALID_URL = r'''(?x)
(?:
https?://
(?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?:
(?:fiction|programmi-tv|serie-tv)/(?:.+?/)?
(?:[a-z-]+)_SE(?P<id>\d{12})
(?:,ST(?P<st>\d{12}))?
(?:,sb(?P<sb>\d{9}))?$
)
)
'''
_TESTS = [{
# TV Show webpage (general webpage)
'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
'info_dict': {
'id': '000000000061',
'title': 'Le Iene',
},
'playlist_mincount': 7,
}, {
# TV Show webpage (specific season)
'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
'info_dict': {
'id': '000000002763',
'title': 'Le Iene',
},
'playlist_mincount': 7,
}, {
# TV Show specific playlist (with multiple pages)
'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
'info_dict': {
'id': '100013375',
'title': 'I servizi',
},
'playlist_mincount': 50,
}]
_BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d'
_PAGE_SIZE = 25
_match_valid_url = lambda s, u: re.match(s._VALID_URL, u)
def _fetch_page(self, sb, page):
lower_limit = page * self._PAGE_SIZE + 1
upper_limit = lower_limit + self._PAGE_SIZE - 1
content = self._download_json(
self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb)
for entry in content.get('entries') or []:
res = self.url_result('mediaset:' + entry['guid'])
if res:
res['playlist_title'] = entry['mediasetprogram$subBrandDescription']
yield res
def _real_extract(self, url):
playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
if not sb:
page = self._download_webpage(url, st or playlist_id)
entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
or self._og_search_title(page))
return self.playlist_result(entries, st or playlist_id, title)
entries = OnDemandPagedList(
functools.partial(self._fetch_page, sb),
self._PAGE_SIZE)
# slice explicitly, as no __getitem__ in OnDemandPagedList yet
title = try_get(entries, lambda x: x.getslice(0, 1)[0]['playlist_title'])
return self.playlist_result(entries, sb, title)