From 25a35cb38a472731d9f487309f6bcff94ee4918c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 19 Nov 2020 20:01:24 +0100 Subject: [PATCH] [googledrive] fix format extraction(closes #26979) --- youtube_dl/extractor/googledrive.py | 58 +++++++++++------------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index f2cc57e44..de8c80e36 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, int_or_none, lowercase_escape, + try_get, update_url_query, ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor): # video can't be watched anonymously due to view count limit reached, # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', - 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', - 'info_dict': { - 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', - 'ext': 'mp4', - 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', - } + 'only_matching': True, }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'info_dict': { - 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', - 'ext': 'mp4', - 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', - 'duration': 189, - }, 'only_matching': True, }, { 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id) + video_info = compat_parse_qs(self._download_webpage( + 'https://drive.google.com/get_video_info', + video_id, query={'docid': video_id})) - title = self._search_regex( - r'"title"\s*,\s*"([^"]+)', webpage, 'title', - default=None) or self._og_search_title(webpage) - duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', - default=None)) + def get_value(key): + return try_get(video_info, lambda x: x[key][0]) + + reason = get_value('reason') + title = get_value('title') + if not title and reason: + raise ExtractorError(reason, expected=True) formats = [] - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map', default='').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, - 'fmt_list', default='').split(',') + fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') + fmt_list = (get_value('fmt_list') or '').split(',') if fmt_stream_map and fmt_list: resolutions = {} for fmt in fmt_list: @@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor): if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) - if not formats: - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason, expected=True) + if not formats and reason: + raise ExtractorError(reason, expected=True) self._sort_formats(formats) - hl = self._search_regex( - r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + hl = get_value('hl') subtitles_id = None - ttsurl = self._search_regex( - r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + ttsurl = get_value('ttsurl') if ttsurl: # the video Id for subtitles will be the last value in the ttsurl # query string @@ -279,8 +263,8 @@ class GoogleDriveIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, + 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, + 'duration': int_or_none(get_value('length_seconds')), 'formats': formats, 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'automatic_captions': self.extract_automatic_captions(