[pornhub] Improve title extraction (Closes #9777)

This commit is contained in:
Sergey M․ 2016-06-14 04:57:59 +07:00
parent 4cef70db6c
commit 6c3760292c
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools import itertools
@ -39,7 +40,25 @@ class PornHubIE(InfoExtractor):
'dislike_count': int, 'dislike_count': int,
'comment_count': int, 'comment_count': int,
'age_limit': 18, 'age_limit': 18,
} },
}, {
# non-ASCII title
'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
'info_dict': {
'id': '1331683002',
'ext': 'mp4',
'title': '重庆婷婷女王足交',
'uploader': 'cj397186295',
'duration': 1753,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'age_limit': 18,
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True, 'only_matching': True,
@ -76,19 +95,25 @@ class PornHubIE(InfoExtractor):
'PornHub said: %s' % error_msg, 'PornHub said: %s' % error_msg,
expected=True, video_id=video_id) expected=True, video_id=video_id)
# The video_title value in flashvars has its non-ASCII characters replaced
# by whitespace (see http://www.pornhub.com/view_video.php?viewkey=1331683002),
# so the title is no longer taken from flashvars; it is extracted from the
# page markup instead.
title = self._html_search_meta(
'twitter:title', webpage, default=None) or self._search_regex(
(r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
webpage, 'title', group='title')
flashvars = self._parse_json( flashvars = self._parse_json(
self._search_regex( self._search_regex(
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id) video_id)
if flashvars: if flashvars:
video_title = flashvars.get('video_title')
thumbnail = flashvars.get('image_url') thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration')) duration = int_or_none(flashvars.get('video_duration'))
else: else:
video_title, thumbnail, duration = [None] * 3 title, thumbnail, duration = [None] * 3
if not video_title:
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex( video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
@ -137,7 +162,7 @@ class PornHubIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'uploader': video_uploader, 'uploader': video_uploader,
'title': video_title, 'title': title,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'duration': duration, 'duration': duration,
'view_count': view_count, 'view_count': view_count,