[closertotruth] Add new extractor

Removed the print statement from the code. Replaced two regex searches with the correct ones. Removed some unnecessary semicolons, fixed title extraction, and refactored everything to use _search_regex.
2024-11-19 20:10:25 +00:00 · 2016-02-26 13:31:52 +01:00 · 2016-02-26 13:31:52 +01:00 · 5650b0d582
commit 5650b0d582
parent bf4b3b6bd9
2 changed files with 62 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -115,6 +115,7 @@ from .cinemassacre import CinemassacreIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
 from .clubic import ClubicIE
 from .clyp import ClypIE
--- a/youtube_dl/extractor/closertotruth.py
+++ b/youtube_dl/extractor/closertotruth.py
@@ -0,0 +1,61 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 import itertools
 import hashlib
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
    unified_strdate,
 )
 class CloserToTruthIE(InfoExtractor):
    """Extractor for individual videos on closertotruth.com series pages.

    A supported URL points at a series page and selects one video through
    its ``#video-<id>`` fragment. The page is expected to contain an anchor
    with ``id="video-<id>"`` whose ``data-kaltura`` attribute carries the
    Kaltura entry id; the media URL is then read from the Kaltura
    playManifest response for that entry.
    """

    # ``https?`` accepts both http and https; the fragment carries the video id.
    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/series/\S+#video-(?P<id>\w+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'md5': '2aa5b8971633d86fe32152827846a5b4',
        'info_dict': {
            'id': '3688',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem? -  Dean W.Zimmerman ',
        },
    }, {
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-4048',
        'md5': 'a3882bb6e453720d8a7a3983f58abd04',
        'info_dict': {
            'id': '4048',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem? -  John Searle ',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict (``url``, ``id``, ``title``) for ``url``.

        Downloads the series page and the Kaltura playManifest. Each
        ``_search_regex`` call is made without a default, so a page that
        lacks any of the expected markup is reported by that helper.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The page <title> (text before the last " |") is the series part of
        # the final title.
        video_title = self._search_regex(
            r'<title>(.+) \|.+</title>', webpage, 'video title')

        # Both lookups below target the same anchor element, so they share
        # the prefix up to the data-kaltura attribute value. Every fragment
        # is a raw string so that ``\S``, ``\w`` and ``\/`` reach the regex
        # engine untouched instead of being (invalid) string escapes.
        anchor_prefix = (
            r'<a href="\S+" id="video-%s" data-kaltura="'
            % re.escape(video_id))
        entry_id = self._search_regex(
            anchor_prefix + r'(\w+)">.+<span.+<\/a>',
            webpage, "video entry_id")
        interviewee_html = self._search_regex(
            anchor_prefix + r'\w+">(.+)<span.+<\/a>',
            webpage, "video interviewee_name")
        # Drop any tags nested inside the anchor text; surrounding whitespace
        # is intentionally left as-is (the expected test titles include it).
        interviewee_name = re.sub(r'(<[^>]+>)', '', interviewee_html)
        video_title = video_title + ' - ' + interviewee_name

        # Extract the partner id for kaltura.com from the embedded script tag.
        p_id = self._search_regex(
            r'<script src="http://cdnapi\.kaltura\.com/p/(?P<p>\w+)/sp/\w+/\S+/partner_id/\w+"></script>',
            webpage, "kaltura partner_id")

        # Request the video URL from the Kaltura API, see
        # http://knowledge.kaltura.com/faq/how-retrieve-download-or-streaming-url-using-api-calls
        api_request_url = (
            'http://www.kaltura.com/p/%s/sp/0/playManifest/entryId/%s'
            '/protocol/HTTPS/flavorParamId/0/video.mp4' % (p_id, entry_id))
        api_response = self._download_webpage(api_request_url, video_id)
        video_url = self._search_regex(
            r'<media url="(\S+)"', api_response, "video url")

        return {
            'url': video_url,
            'id': video_id,
            'title': video_title,
        }