From 58f6c2112d55cdd77ad76b323760bb934d7e7576 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jan 2021 14:07:56 +0100 Subject: [PATCH] [minds] improve extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/minds.py | 288 ++++++++++++++++------------- 2 files changed, 161 insertions(+), 129 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 90012fc4f..29b0e615e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -653,8 +653,8 @@ from .microsoftvirtualacademy import ( ) from .minds import ( MindsIE, - MindsActivityIE, MindsChannelIE, + MindsGroupIE, ) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 4523d0938..8e9f0f825 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -1,164 +1,196 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import (int_or_none, sanitized_Request, str_or_none, - unified_strdate) +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) -class MindsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P<id>[0-9]+)' - _TEST = { +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 
'https://www.minds.com/media/100000000000086822', 'md5': '215a658184a419764852239d4970b045', 'info_dict': { 'id': '100000000000086822', 'ext': 'mp4', 'title': 'Minds intro sequence', - 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', - 'uploader_id': '100000000000000341', - 'description': '', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', 'upload_date': '20130524', 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', }, - 'params': { - 'skip_download': True, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', }, - } + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(video_api_url, video_id, headers=headers, - query={'children': 'false'}) + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + 
assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + formats = [] - owner = data.get('ownerObj', {}) + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) - transcodes = data.get('transcodes', {}) - # These keys are the width so keep the highest width last - keys = sorted(transcodes.keys()) + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') - for format_id in keys: - is_numeric = re.match('^[0-9]+\.mp4', format_id) - video_url = transcodes[format_id] - info = { - 'url': video_url, - 'format_id': format_id, - 'http_headers': headers, - } - if is_numeric: - info['width'] = int(format_id.split('.')[0]) - formats.append(info) + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] - uploader_id = str_or_none(owner.get('guid') or - data.get('owner_guid') or - owner.get('legacy_guid') or - owner.get('owner_guid')) - description = str_or_none(data.get('description')) - if description: - description = description.strip() - uploader_url = age_limit = thumbnail = None - - if owner.get('username'): - uploader_url = 'https://www.minds.com/%s' % owner.get('username') - if data.get('mature') is True: - age_limit = 18 - - thumbnail_api_url = data.get('thumbnail_src') - if thumbnail_api_url: - req = sanitized_Request(thumbnail_api_url) - req.get_method = lambda: 'HEAD' - res = self._request_webpage(req, video_id) - if res.headers.get('content-type', '').startswith('image/'): - thumbnail = getattr(res, 'url', None) - tags = data.get('tags', '').strip() - if isinstance(tags, compat_str) and tags: - tags = [x.strip() for x in tags.split(',')] - else: - 
tags = None - category = data.get('category') - if isinstance(category, compat_str) and category: - category = [category] - else: - category = None + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() return { 'id': video_id, - 'title': data['title'], + 'title': entity.get('title') or video_id, 'formats': formats, - 'description': description, - 'license': str_or_none(data.get('license')), - 'creator': str_or_none(owner.get('name') or owner.get('username')), - 'release_date': unified_strdate(data.get('time_created')), - 'timestamp': int_or_none(data.get('time_created')), + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': int_or_none(data.get('play:count')), - 'like_count': int_or_none(data.get('thumbs:up:count')), - 'dislike_count': int_or_none(data.get('thumbs:down:count')), - 'average_rating': int_or_none(data.get('rating')), - 'age_limit': age_limit, - 'categories': [str_or_none(data.get('category'))], + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), 'tags': tags, - # As of 20181020 the API is returning `false` for this value both - # at top level and within the entity.comments:count path. The only - # other way to get this is to fetch all comments and count. 
- 'comment_count': int_or_none(data.get('comments:count')), + 'comment_count': int_or_none(entity.get('comments:count')), 'thumbnail': thumbnail, } -class MindsActivityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P<id>[0-9]+)' +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 - def _real_extract(self, url): - guid = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, guid, headers=headers) - return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) - - -class MindsChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P<id>[^/]+)' - - def _real_extract(self, url): - channel_name = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, channel_name, headers=headers) - channel = data.get('channel', {}) - params = {'limit': 12, 'offset': ''} - api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] - entries = [] + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 while True: - data = self._download_json(api_url, channel['guid'], - headers=headers, query=params) - activity = data.get('activity', []) - if len(activity) == 0 or not data.get('load-next'): - break - for info in activity: - if info.get('custom_type') != 'video': + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: 
continue - entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) - params['offset'] = data['load-next'] - return self.playlist_result(entries, - playlist_title='%s activity' % channel_name) + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P<id>[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P<id>[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + }