2014-01-21 23:07:02 +00:00
from __future__ import unicode_literals
2013-06-23 19:34:03 +00:00
import re
2015-02-18 17:27:45 +00:00
from . common import InfoExtractor
2014-12-13 11:24:42 +00:00
from . . compat import (
2013-07-14 11:41:46 +00:00
compat_urllib_parse ,
2015-01-30 15:57:59 +00:00
compat_str ,
2014-12-13 11:24:42 +00:00
)
from . . utils import (
2013-06-23 19:34:03 +00:00
ExtractorError ,
2014-01-21 23:17:33 +00:00
find_xpath_attr ,
2014-01-20 21:11:34 +00:00
fix_xml_ampersands ,
2014-03-08 21:06:28 +00:00
HEADRequest ,
2015-11-21 16:18:17 +00:00
sanitized_Request ,
2014-03-08 19:06:20 +00:00
unescapeHTML ,
2014-01-21 19:54:47 +00:00
url_basename ,
RegexNotFoundError ,
2013-06-23 19:34:03 +00:00
)
2014-01-21 23:17:33 +00:00
2013-07-14 12:02:04 +00:00
def _media_xml_tag ( tag ) :
return ' { http://search.yahoo.com/mrss/} %s ' % tag
2013-06-23 19:34:03 +00:00
2013-07-14 11:41:46 +00:00
2015-02-18 17:27:45 +00:00
class MTVServicesInfoExtractor(InfoExtractor):
    """Shared extraction logic for sites served through an MRSS feed.

    Subclasses either define ``_FEED_URL`` or override ``_get_feed_url``.
    """

    # %-template for the mobile page URL, formatted with the short mtvn id.
    # When None, the mobile fallback for geo-blocked videos is disabled.
    _MOBILE_TEMPLATE = None
    # When truthy, sent to the feed as the ``lang`` query parameter.
    _LANG = None

    @staticmethod
    def _id_from_uri(uri):
        """Return the last colon-separated component of *uri*."""
        return uri.split(':')[-1]

    # This was originally implemented for ComedyCentral, but it also works here
    @staticmethod
    def _transform_rtmp_url(rtmp_video_url):
        """Map an rtmp(e) URL containing a ``gsp.`` path to its HTTP mirror.

        URLs that do not match the expected layout are returned unchanged.
        """
        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
        if not m:
            return rtmp_video_url
        base = 'http://viacommtvstrmfs.fplive.net/'
        return base + m.group('finalid')

    def _get_feed_url(self, uri):
        """Return the feed URL to query for *uri* (``_FEED_URL`` by default)."""
        return self._FEED_URL

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the ``media:group/media:thumbnail`` URL of *itemdoc*, or None."""
        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
        thumb_node = itemdoc.find(search_path)
        if thumb_node is None:
            return None
        return thumb_node.attrib['url']

    def _extract_mobile_video_formats(self, mtvn_id):
        """Return a single-entry format list resolved through the mobile page."""
        webpage_url = self._MOBILE_TEMPLATE % mtvn_id
        req = sanitized_Request(webpage_url)
        # Otherwise we get a webpage that would execute some javascript
        req.add_header('User-Agent', 'curl/7')
        webpage = self._download_webpage(
            req, mtvn_id, 'Downloading mobile page')
        metrics_url = unescapeHTML(self._search_regex(
            r'<a href="(http://metrics.+?)"', webpage, 'url'))
        # Only the final URL after redirects is needed, so a HEAD request suffices
        req = HEADRequest(metrics_url)
        response = self._request_webpage(req, mtvn_id, 'Resolving url')
        url = response.geturl()
        # Transform the url to get the best quality:
        url = re.sub(
            r'.+pxE=mp4',
            'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4',
            url, 1)
        return [{'url': url, 'ext': 'mp4'}]

    def _extract_video_formats(self, mdoc, mtvn_id):
        """Build the formats list from the mediagen document *mdoc*.

        If the first ``src`` points at a known geo-block/copyright placeholder,
        the mobile page is tried instead (when both *mtvn_id* and
        ``_MOBILE_TEMPLATE`` are set); otherwise ExtractorError is raised.
        Raises ExtractorError for renditions with missing or malformed fields.
        """
        # A document without any <src> is simply not a geo-block response
        src_node = mdoc.find('.//src')
        first_src = src_node.text if src_node is not None else None
        if first_src and re.match(
                r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$',
                first_src) is not None:
            if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
                self.to_screen('The normal version is not available from your '
                               'country, trying with the mobile version')
                return self._extract_mobile_video_formats(mtvn_id)
            raise ExtractorError('This video is not available from your country.',
                                 expected=True)

        formats = []
        for rendition in mdoc.findall('.//rendition'):
            rendition_src = rendition.find('./src')
            if ('type' not in rendition.attrib or
                    rendition_src is None or rendition_src.text is None):
                raise ExtractorError('Invalid rendition field.')
            _, _, ext = rendition.attrib['type'].partition('/')
            rtmp_video_url = rendition_src.text
            # Placeholder image served instead of a video; not a real format
            if rtmp_video_url.endswith('siteunavail.png'):
                continue
            try:
                width = int(rendition.get('width'))
                height = int(rendition.get('height'))
            except (TypeError, ValueError):
                # TypeError: attribute missing; ValueError: not an integer
                raise ExtractorError('Invalid rendition field.')
            formats.append({
                'ext': ext,
                'url': self._transform_rtmp_url(rtmp_video_url),
                'format_id': rendition.get('bitrate'),
                'width': width,
                'height': height,
            })
        self._sort_formats(formats)
        return formats

    def _extract_subtitles(self, mdoc, mtvn_id):
        """Return a dict mapping ``srclang`` to subtitle entries.

        Only ``transcript`` elements whose ``kind`` is ``captions`` are used.
        """
        subtitles = {}
        for transcript in mdoc.findall('.//transcript'):
            if transcript.get('kind') != 'captions':
                continue
            lang = transcript.get('srclang')
            subtitles[lang] = [{
                'url': compat_str(typographic.get('src')),
                'ext': typographic.get('format'),
            } for typographic in transcript.findall('./typographic')]
        return subtitles

    def _get_video_info(self, itemdoc):
        """Return the info dict for one feed ``item`` element.

        Downloads the item's mediagen document to obtain formats and subtitles.
        Raises ExtractorError when the service reports an error or when no
        title can be found.
        """
        uri = itemdoc.find('guid').text
        video_id = self._id_from_uri(uri)
        self.report_extraction(video_id)
        mediagen_url = itemdoc.find('%s/%s' % (
            _media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
        # Remove the templates, like &device={device}
        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
        if 'acceptMethods' not in mediagen_url:
            mediagen_url += '&' if '?' in mediagen_url else '?'
            mediagen_url += 'acceptMethods=fms'

        mediagen_doc = self._download_xml(
            mediagen_url, video_id, 'Downloading video urls')

        item = mediagen_doc.find('./video/item')
        if item is not None and item.get('type') == 'text':
            message = '%s returned error: ' % self.IE_NAME
            if item.get('code') is not None:
                message += '%s - ' % item.get('code')
            message += item.text or ''
            raise ExtractorError(message, expected=True)

        description_node = itemdoc.find('description')
        if description_node is not None and description_node.text is not None:
            description = description_node.text.strip()
        else:
            description = None

        # Candidates are tried in order of preference. Elements must be
        # compared against None explicitly: an Element without children is
        # falsy, so ``a or b`` would discard a perfectly valid match.
        title_el = find_xpath_attr(
            itemdoc, './/{http://search.yahoo.com/mrss/}category',
            'scheme', 'urn:mtvn:video_title')
        if title_el is None:
            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
        if title_el is None:
            title_el = itemdoc.find('.//title')
        if title_el is None:
            title_el = itemdoc.find('./title')

        title = title_el.text if title_el is not None else None
        if title is None:
            raise ExtractorError('Could not find video title')
        title = title.strip()

        # This a short id that's used in the webpage urls
        mtvn_id = None
        mtvn_id_node = find_xpath_attr(
            itemdoc, './/{http://search.yahoo.com/mrss/}category',
            'scheme', 'urn:mtvn:id')
        if mtvn_id_node is not None:
            mtvn_id = mtvn_id_node.text

        return {
            'title': title,
            'formats': self._extract_video_formats(mediagen_doc, mtvn_id),
            'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id),
            'id': video_id,
            'thumbnail': self._get_thumbnail_url(uri, itemdoc),
            'description': description,
        }

    def _get_videos_info(self, uri):
        """Query the feed for *uri* and return the resulting playlist."""
        video_id = self._id_from_uri(uri)
        feed_url = self._get_feed_url(uri)
        data = compat_urllib_parse.urlencode({'uri': uri})
        info_url = feed_url + '?'
        if self._LANG:
            info_url += 'lang=%s&' % self._LANG
        info_url += data
        return self._get_videos_info_from_url(info_url, video_id)

    def _get_videos_info_from_url(self, url, video_id):
        """Download the feed at *url* and return a playlist of its items."""
        idoc = self._download_xml(
            url, video_id,
            'Downloading info', transform_source=fix_xml_ampersands)
        return self.playlist_result(
            [self._get_video_info(item) for item in idoc.findall('.//item')])

    def _real_extract(self, url):
        """Locate the mgid in the page at *url* and extract its videos.

        The mgid is looked up, in order, in the og:video URL, in
        ``data-mgid`` / ``swfobject.embedSWF`` markup and finally in the
        ``sm4:video:embed`` meta tag.
        """
        title = url_basename(url)
        webpage = self._download_webpage(url, title)
        try:
            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
            # or http://media.mtvnservices.com/{mgid}
            og_url = self._og_search_video_url(webpage)
            mgid = url_basename(og_url)
            if mgid.endswith('.swf'):
                mgid = mgid[:-4]
        except RegexNotFoundError:
            mgid = None

        if mgid is None or ':' not in mgid:
            mgid = self._search_regex(
                [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
                webpage, 'mgid', default=None)

        if not mgid:
            sm4_embed = self._html_search_meta(
                'sm4:video:embed', webpage, 'sm4 embed', default='')
            mgid = self._search_regex(
                r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid')

        videos_info = self._get_videos_info(mgid)
        return videos_info
2014-01-21 19:54:47 +00:00
2013-12-03 13:58:24 +00:00
2014-06-22 17:19:26 +00:00
class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
    """Extractor for ``media.mtvnservices.com/embed/<mgid>`` player URLs."""

    IE_NAME = 'mtvservices:embedded'
    _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'

    _TEST = {
        # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
        'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906',
        'md5': 'cb349b21a7897164cede95bd7bf3fbb9',
        'info_dict': {
            'id': '1043906',
            'ext': 'mp4',
            'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds',
            'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.',
        },
    }

    @staticmethod
    def _extract_url(webpage):
        """Return the first embedded player iframe URL in *webpage*, or None."""
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
        if mobj:
            return mobj.group('url')
        return None

    def _get_feed_url(self, uri):
        """Return the feed URL declared in the player config for *uri*.

        The config is looked up by *uri* with the video id removed; any query
        string on the configured feed URL is dropped.
        """
        video_id = self._id_from_uri(uri)
        site_id = uri.replace(video_id, '')
        config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/'
                      'context4/context5/config.xml'.format(site_id))
        config_doc = self._download_xml(config_url, video_id)
        feed_node = config_doc.find('.//feed')
        feed_url = feed_node.text.strip().split('?')[0]
        return feed_url

    def _real_extract(self, url):
        """Extract the videos for the mgid embedded in *url*."""
        mobj = re.match(self._VALID_URL, url)
        mgid = mobj.group('mgid')
        return self._get_videos_info(mgid)
2013-12-03 13:58:24 +00:00
class MTVIE(MTVServicesInfoExtractor):
    """Extractor for mtv.com video pages and m.mtv.com mobile video URLs."""

    # Desktop URLs carry a numeric ``videoid``; mobile URLs carry the ``mgid``
    # directly in the ``id`` query parameter.
    _VALID_URL = r'''(?x)^https?://
        (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
           m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''

    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'

    _TESTS = [
        {
            'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
            'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
            'info_dict': {
                'id': '853555',
                'ext': 'mp4',
                'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
                'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
            },
        },
    ]

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the thumbnail URL built from *uri*; *itemdoc* is not used."""
        return 'http://mtv.mtvnimages.com/uri/' + uri

    def _real_extract(self, url):
        """Extract the videos for *url*.

        When the URL does not carry the mgid, the page is downloaded to find
        it; pages flagged as Vevo videos are delegated to the Vevo extractor.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoid')
        uri = mobj.groupdict().get('mgid')
        if uri is None:
            webpage = self._download_webpage(url, video_id)

            # Some videos come from Vevo.com
            m_vevo = re.search(
                r'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";', webpage)
            if m_vevo:
                vevo_id = m_vevo.group(1)
                self.to_screen('Vevo video detected: %s' % vevo_id)
                return self.url_result('vevo:%s' % vevo_id, ie='Vevo')

            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
        return self._get_videos_info(uri)
2014-01-21 19:59:31 +00:00
class MTVIggyIE(MTVServicesInfoExtractor):
    """Extractor for www.mtviggy.com video pages.

    Only configuration is defined here; extraction is inherited unchanged.
    """

    IE_NAME = 'mtviggy.com'
    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'
    _TEST = {
        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
        'info_dict': {
            'id': '984696',
            'ext': 'mp4',
            'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet',
        }
    }
    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
2015-08-25 22:06:44 +00:00
2015-08-28 15:24:54 +00:00
2015-08-25 22:06:44 +00:00
class MTVDEIE(MTVServicesInfoExtractor):
    """Extractor for mtv.de artist, show and news pages."""

    IE_NAME = 'mtv.de'
    _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
    _TESTS = [{
        'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
        'info_dict': {
            'id': 'music_video-a50bc5f0b3aa4b3190aa',
            'ext': 'mp4',
            'title': 'MusicVideo_cro-traum',
            'description': 'Cro - Traum',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
        'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
        'info_dict': {
            'id': 'local_playlist-f5ae778b9832cc837189',
            'ext': 'mp4',
            'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }, {
        # single video in pagePlaylist with different id
        'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
        'info_dict': {
            'id': 'local_playlist-4e760566473c4c8c5344',
            'ext': 'mp4',
            'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1',
            'description': 'MTV Movies Supercut',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract the video whose id appears in *url* from the page playlist.

        Raises ExtractorError when the playlist has several entries and none
        of them carries the requested id.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        playlist = self._parse_json(
            self._search_regex(
                r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'),
            video_id)

        # news pages contain single video in playlist with different id
        if len(playlist) == 1:
            return self._get_videos_info_from_url(playlist[0]['mrss'], video_id)

        for item in playlist:
            item_id = item.get('id')
            if item_id and compat_str(item_id) == video_id:
                return self._get_videos_info_from_url(item['mrss'], video_id)

        # Falling off the end would return None and hide the failure
        raise ExtractorError(
            'Unable to find video %s in page playlist' % video_id)