This repository has been archived on 2024-07-02. You can view files and clone it, but cannot push or open issues or pull requests.
Netflix-videos-downloader/utils/modules/pycaption/webvtt.py
2021-09-01 02:57:54 +05:00

471 lines
16 KiB
Python

import re
import six
import sys
import datetime
from copy import deepcopy
from .base import (
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode
)
from .geometry import Layout
from .exceptions import (
CaptionReadError, CaptionReadSyntaxError, CaptionReadNoCaptions,
InvalidInputError
)
# A WebVTT timing line has both start/end times and layout related settings
# (referred to as 'cue settings' in the documentation)
# The following pattern captures [start], [end] and [cue settings] if existent
from pycaption.geometry import HorizontalAlignmentEnum
# All patterns are raw strings so that regex escapes such as ``\S`` and ``\d``
# are not interpreted (and warned about) as string-literal escapes.

# Captures [start], [end] and the optional [cue settings] of a timing line.
TIMING_LINE_PATTERN = re.compile(r'^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$')
# Captures the numeric fields of a timestamp. The third group is optional and,
# when present, still carries its leading ':'.
TIMESTAMP_PATTERN = re.compile(r'^(\d+):(\d{2})(:\d{2})?\.(\d{3})')
# Matches an opening voice tag such as ``<v.loud Bob>``; group 2 is the text
# that follows the space (``Bob``).
VOICE_SPAN_PATTERN = re.compile(r'<v(\.\w+)* ([^>]*)>')
OTHER_SPAN_PATTERN = (
    re.compile(
        r'</?([cibuv]|ruby|rt|lang|(\d+):(\d{2})(:\d{2})?\.(\d{3})).*?>'
    )
)  # These WebVTT tags are stripped off the cues on conversion
# Maps pycaption horizontal-alignment enum members to the keyword that
# WebVTTWriter._cue_settings_from writes after "align:" in a cue-settings
# string.
WEBVTT_VERSION_OF = {
HorizontalAlignmentEnum.LEFT: 'left',
HorizontalAlignmentEnum.CENTER: 'middle',
HorizontalAlignmentEnum.RIGHT: 'right',
HorizontalAlignmentEnum.START: 'start',
HorizontalAlignmentEnum.END: 'end'
}
# Alignment keyword that the writer leaves out of the cue settings: an
# "align:" setting is only emitted for values other than 'middle'.
DEFAULT_ALIGNMENT = 'middle'
def microseconds(h, m, s, f):
    """
    Convert an hours/minutes/seconds/milliseconds split into microseconds.

    Every argument is passed through ``int()``, so numeric strings (as
    captured by a regex) are accepted as well as integers.

    :param h: hours
    :param m: minutes
    :param s: seconds
    :param f: milliseconds
    :rtype: int
    """
    whole_seconds = int(h) * 3600 + int(m) * 60 + int(s)
    fraction_us = int(f) * 1000
    return whole_seconds * 1000000 + fraction_us
class WebVTTReader(BaseReader):
    """
    Reader that parses WebVTT text into a ``CaptionSet``.

    Cue text is reduced to plain unicode: voice tags become a ``name: ``
    prefix, other recognised tags are stripped and the WebVTT character
    escapes are decoded.
    """

    def __init__(self, ignore_timing_errors=True, *args, **kwargs):
        """
        :param ignore_timing_errors: Whether to ignore timing checks
        """
        self.ignore_timing_errors = ignore_timing_errors

    def detect(self, content):
        """
        Tell whether ``content`` looks like WebVTT.

        :returns: True when the substring 'WEBVTT' occurs anywhere in
            ``content``
        :rtype: bool
        """
        return 'WEBVTT' in content

    def read(self, content, lang='en-US'):
        """
        Parse a WebVTT document.

        :param content: the whole document as a unicode string
        :param lang: language key under which the captions are stored
        :rtype: CaptionSet
        :raises InvalidInputError: if ``content`` is not a unicode string
        :raises CaptionReadNoCaptions: if no caption was found
        """
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the text type.
        if not isinstance(content, six.text_type):
            raise InvalidInputError('The content is not a unicode string.')

        caption_set = CaptionSet({lang: self._parse(content.splitlines())})

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    def _parse(self, lines):
        """
        Build a ``CaptionList`` from the lines of a WebVTT document.

        A line containing '-->' opens a cue, an empty line closes the cue in
        progress, and any other line is cue text while a cue is open
        (otherwise it is skipped as a comment or metadata).

        :param lines: the document split into lines
        :rtype: CaptionList
        :raises CaptionReadSyntaxError: if a cue is closed by an empty line
            before any text was seen, or if its timing line is malformed
        :raises CaptionReadError: re-raised from timing parsing with the
            index of the offending line appended to the message
        """
        captions = CaptionList()
        start = None
        end = None
        nodes = []
        layout_info = None
        found_timing = False
        # Index into ``lines`` (zero-based, as produced by enumerate) of the
        # most recent timing line; only used in error messages.
        timing_line = None

        for i, line in enumerate(lines):
            if '-->' in line:
                found_timing = True
                timing_line = i
                last_start_time = captions[-1].start if captions else 0
                try:
                    start, end, layout_info = self._parse_timing_line(
                        line, last_start_time)
                except CaptionReadError as e:
                    # Re-raise the same exception type with the line index
                    # appended, keeping the original traceback.
                    new_message = '%s (line %d)' % (e.args[0], timing_line)
                    six.reraise(
                        type(e), type(e)(new_message), sys.exc_info()[2])

            elif '' == line:
                if found_timing:
                    if not nodes:
                        raise CaptionReadSyntaxError(
                            'Cue without content. (line %d)' % timing_line)
                    else:
                        found_timing = False
                        caption = Caption(
                            start, end, nodes, layout_info=layout_info)
                        captions.append(caption)
                        nodes = []
            else:
                if found_timing:
                    # Consecutive text lines of one cue are separated by a
                    # break node.
                    if nodes:
                        nodes.append(CaptionNode.create_break())
                    nodes.append(CaptionNode.create_text(
                        self._decode(line)))
                else:
                    # it's a comment or some metadata; ignore it
                    pass

        # Add a last caption if there are remaining nodes
        if nodes:
            caption = Caption(start, end, nodes, layout_info=layout_info)
            captions.append(caption)

        return captions

    def _remove_styles(self, line):
        """
        Replace voice tags with a ``name: `` prefix and strip every tag
        matched by ``OTHER_SPAN_PATTERN``.

        :type line: unicode
        :rtype: unicode
        """
        partial_result = VOICE_SPAN_PATTERN.sub('\\2: ', line)
        return OTHER_SPAN_PATTERN.sub('', partial_result)

    def _validate_timings(self, start, end, last_start_time):
        """
        Check the timestamps of a cue against each other and against the
        previous cue.

        :param start: cue start, or None if it could not be parsed
        :param end: cue end, or None if it could not be parsed
        :param last_start_time: start of the previous cue
        :raises CaptionReadSyntaxError: if ``start`` or ``end`` is None
        :raises CaptionReadError: if ``start`` is after ``end`` or before
            ``last_start_time``
        """
        if start is None:
            raise CaptionReadSyntaxError(
                'Invalid cue start timestamp.')
        if end is None:
            raise CaptionReadSyntaxError('Invalid cue end timestamp.')
        if start > end:
            raise CaptionReadError(
                'End timestamp is not greater than start timestamp.')
        if start < last_start_time:
            # The trailing space keeps the two literals from being fused
            # into "equalto".
            raise CaptionReadError(
                'Start timestamp is not greater than or equal '
                'to start timestamp of previous cue.')

    def _parse_timing_line(self, line, last_start_time):
        """
        Parse a ``start --> end [cue settings]`` line.

        Timings are only validated when ``ignore_timing_errors`` is false.

        :param line: the timing line
        :param last_start_time: start of the previous cue, used for
            validation
        :returns: Tuple (int, int, Layout); the layout is None when the line
            carries no cue settings
        :raises CaptionReadSyntaxError: if the line or one of its timestamps
            does not match the expected format
        """
        m = TIMING_LINE_PATTERN.search(line)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        start = self._parse_timestamp(m.group(1))
        end = self._parse_timestamp(m.group(2))
        cue_settings = m.group(3)

        if not self.ignore_timing_errors:
            self._validate_timings(start, end, last_start_time)

        layout_info = None
        if cue_settings:
            layout_info = Layout(webvtt_positioning=cue_settings)

        return start, end, layout_info

    def _parse_timestamp(self, timestamp):
        """Returns an integer representing a number of microseconds

        :param timestamp: text matching ``TIMESTAMP_PATTERN``
        :rtype: int
        :raises CaptionReadSyntaxError: if ``timestamp`` does not match
        """
        m = TIMESTAMP_PATTERN.search(timestamp)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        m = m.groups()

        if m[2]:
            # Timestamp takes the form of
            # [hours]:[minutes]:[seconds].[milliseconds]; the third group
            # still carries its leading ':'.
            return microseconds(m[0], m[1], m[2].replace(":", ""), m[3])
        else:
            # Timestamp takes the form of [minutes]:[seconds].[milliseconds]
            return microseconds(0, m[0], m[1], m[3])

    def _decode(self, s):
        """
        Convert cue text from WebVTT XML-like format to plain unicode.

        :type s: unicode
        :rtype: unicode
        """
        s = s.strip()
        # Convert voice spans and strip the other WebVTT tags.
        # TODO: Add support for other WebVTT tags. For now just strip them
        # off the text.
        s = self._remove_styles(s)
        # Replace WebVTT special XML codes with plain unicode values
        s = s.replace('&lt;', '<')
        s = s.replace('&gt;', '>')
        s = s.replace('&lrm;', '\u200e')
        s = s.replace('&rlm;', '\u200f')
        s = s.replace('&nbsp;', '\u00a0')
        # Must do ampersand last
        s = s.replace('&amp;', '&')
        return s
class WebVTTWriter(BaseWriter):
    """
    Writer that serialises a ``CaptionSet`` as WebVTT text.

    NOTE(review): ``self.relativize`` and ``self.fit_to_screen`` are read by
    ``_cue_settings_from`` but are not defined in this class; presumably the
    base writer sets them -- confirm.
    """

    # Text every output document starts with.
    HEADER = 'WEBVTT\n\n'
    # Layout of the language being written; set by write() and used as the
    # fallback for cues that have no layout of their own.
    global_layout = None
    # Passed to Layout.as_percentage_of() when positioning is relativized.
    video_width = None
    video_height = None

    def write(self, caption_set):
        """
        Serialise the first language of ``caption_set``.

        The caption set is deep-copied before use. As a side effect
        ``self.global_layout`` is set to the layout of the language written.

        :type caption_set: CaptionSet
        :returns: the WebVTT document; only the header when the set is empty
        :rtype: unicode
        """
        output = self.HEADER

        if caption_set.is_empty():
            return output

        caption_set = deepcopy(caption_set)

        # TODO: styles. These go into a separate CSS file, which doesn't really
        # fit the API here. Figure that out. Though some style stuff can be
        # done in-line. This format is a little bit crazy.

        # WebVTT's language support seems to be a bit crazy, so let's just
        # support a single one for now.
        lang = list(caption_set.get_languages())[0]

        self.global_layout = caption_set.get_layout_info(lang)

        captions = caption_set.get_captions(lang)

        return output + '\n'.join(
            [self._write_caption(caption_set, caption)
             for caption in captions])

    def _timestamp(self, ts):
        """
        Format a number of microseconds as a WebVTT timestamp.

        The result is ``mm:ss.ttt``, prefixed with the hours when they are
        non-zero. Hours are written with at least two digits, as the WebVTT
        timestamp syntax requires.

        :param ts: microseconds
        :rtype: unicode
        """
        td = datetime.timedelta(microseconds=ts)
        # timedelta.seconds alone excludes whole days, which would silently
        # drop the hours of any timestamp at or beyond 24 hours.
        total_seconds = td.days * 86400 + td.seconds
        mm, ss = divmod(total_seconds, 60)
        hh, mm = divmod(mm, 60)
        # Integer division keeps the millisecond field an int.
        s = "%02d:%02d.%03d" % (mm, ss, td.microseconds // 1000)
        if hh:
            s = "%02d:%s" % (hh, s)
        return s

    def _tags_for_style(self, style):
        """
        Return the ``[opening, closing]`` WebVTT tags for a style name.

        :param style: 'italics', 'underline' or 'bold'; any other value
            yields two empty strings
        :rtype: list
        """
        if style == 'italics':
            return ['<i>', '</i>']
        elif style == 'underline':
            return ['<u>', '</u>']
        elif style == 'bold':
            return ['<b>', '</b>']
        else:
            return ['', '']

    def _calculate_resulting_style(self, style, caption_set):
        """
        Flatten a style by merging in the styles of the classes it names.

        Classes are taken from ``style['classes']`` or, failing that, from
        ``style['class']``. Values set directly on ``style`` override the
        ones coming from its classes.

        :type style: dict
        :type caption_set: CaptionSet
        :rtype: dict
        """
        resulting_style = {}

        style_classes = []
        if 'classes' in style:
            style_classes = style['classes']
        elif 'class' in style:
            style_classes = [style['class']]

        for style_class in style_classes:
            sub_style = caption_set.get_style(style_class).copy()
            # Recursively resolve class attributes and calculate style
            resulting_style.update(
                self._calculate_resulting_style(sub_style, caption_set))

        resulting_style.update(style)

        return resulting_style

    def _write_caption(self, caption_set, caption):
        """
        Serialise one caption as one or more WebVTT cues.

        One cue is produced per layout group of the caption. All of them
        share the caption's timing, and consecutive cues are separated by a
        blank line.

        :type caption_set: CaptionSet
        :type caption: Caption
        :returns: the cue text, or an empty string if the caption has no
            layout groups
        :rtype: unicode
        """
        layout_groups = self._layout_groups(caption.nodes, caption_set)

        start = self._timestamp(caption.start)
        end = self._timestamp(caption.end)
        timespan = "{} --> {}".format(start, end)

        # Tags implied by the caption-level style wrap every cue; closing
        # tags are prepended so that they close in reverse opening order.
        cue_style_tags = ['', '']

        style = self._calculate_resulting_style(caption.style, caption_set)
        for key, value in sorted(style.items()):
            if value:
                tags = self._tags_for_style(key)
                cue_style_tags[0] += tags[0]
                cue_style_tags[1] = tags[1] + cue_style_tags[1]

        cues = []
        for cue_text, layout in layout_groups:
            if not layout:
                layout = caption.layout_info or self.global_layout
            cue_settings = self._cue_settings_from(layout)
            cues.append(
                timespan + cue_settings + '\n'
                + cue_style_tags[0] + cue_text + cue_style_tags[1] + '\n')

        # Cue blocks must be separated by a blank line; each cue already
        # ends with a newline, so joining on '\n' yields exactly one.
        return '\n'.join(cues)

    def _cue_settings_from(self, layout):
        """
        Return WebVTT cue settings string based on layout info

        :type layout: Layout
        :returns: the settings, each preceded by a space, or an empty string
            when there is nothing to write
        :rtype: unicode
        """
        if not layout:
            return ''

        # If it's converting from WebVTT to WebVTT, keep positioning info
        # unchanged
        if layout.webvtt_positioning:
            return ' {}'.format(layout.webvtt_positioning)

        left_offset = None
        top_offset = None
        cue_width = None
        alignment = None

        already_relative = False
        if not self.relativize:
            if layout.is_relative():
                already_relative = True
            else:
                # There are absolute positioning values for this cue but the
                # Writer is explicitly configured not to do any relativization.
                # Ignore all positioning for this cue.
                return ''

        # Ensure that all positioning values are measured using percentage.
        # This may raise an exception if layout.is_relative() == False
        # If you want to avoid it, you have to turn off relativization by
        # initializing this Writer with relativize=False.
        if not already_relative:
            layout = layout.as_percentage_of(
                self.video_width, self.video_height)

        # Ensure that when there's a left offset the caption is not pushed out
        # of the screen. If the execution got this far it means origin and
        # extent are already relative by now.
        if self.fit_to_screen:
            layout = layout.fit_to_screen()

        if layout.origin:
            left_offset = layout.origin.x
            top_offset = layout.origin.y

        if layout.extent:
            cue_width = layout.extent.horizontal

        if layout.padding:
            if layout.padding.start and left_offset:
                # Since there is no padding in WebVTT, the left padding is
                # added to the total left offset (if it is defined and not
                # relative),
                left_offset += layout.padding.start
                # and removed from the total cue width
                if cue_width:
                    cue_width -= layout.padding.start
            # the right padding is cut out of the total cue width,
            if layout.padding.end and cue_width:
                cue_width -= layout.padding.end
            # the top padding is added to the top offset
            # (if it is defined and not relative)
            if layout.padding.before and top_offset:
                top_offset += layout.padding.before
            # and the bottom padding is ignored because the cue box is only as
            # long vertically as the text it contains and nothing can be cut
            # out

        # A layout without alignment, or with an alignment that has no
        # WebVTT equivalent, simply gets no "align" setting.
        try:
            alignment = WEBVTT_VERSION_OF[layout.alignment.horizontal]
        except (AttributeError, KeyError):
            pass

        cue_settings = ''

        if alignment and alignment != DEFAULT_ALIGNMENT:
            cue_settings += " align:" + alignment
        if left_offset:
            cue_settings += " position:{},start".format(
                six.text_type(left_offset))
        if top_offset:
            cue_settings += " line:" + six.text_type(top_offset)
        if cue_width:
            cue_settings += " size:" + six.text_type(cue_width)

        return cue_settings

    def _layout_groups(self, nodes, caption_set):
        """
        Convert a Caption's nodes to WebVTT cue or cues (depending on
        whether they have the same positioning or not).

        :param nodes: the caption's nodes
        :type caption_set: CaptionSet
        :returns: list of ``(encoded cue text, layout)`` tuples
        :rtype: list
        """
        if not nodes:
            return []

        current_layout = None

        # A list with layout groups. Since WebVTT only support positioning
        # for different cues, each layout group has to be represented in a
        # new cue with the same timing but different positioning settings.
        layout_groups = []

        # A properly encoded WebVTT string (plain unicode must be properly
        # escaped before being appended to this string)
        s = ''

        for i, node in enumerate(nodes):
            if node.type_ == CaptionNode.TEXT:
                if s and current_layout and node.layout_info != current_layout:
                    # If the positioning changes from one text node to
                    # another, a new WebVTT cue has to be created.
                    layout_groups.append((s, current_layout))
                    s = ''
                # ATTENTION: This is where the plain unicode node content is
                # finally encoded as WebVTT.
                s += self._encode(node.content) or '&nbsp;'
                current_layout = node.layout_info
            elif node.type_ == CaptionNode.STYLE:
                resulting_style = self._calculate_resulting_style(
                    node.content, caption_set)

                # Closing tags are written in the reverse of the opening
                # order so that they nest properly.
                styles = ['italics', 'underline', 'bold']
                if not node.start:
                    styles.reverse()

                for style in styles:
                    if style in resulting_style and resulting_style[style]:
                        tags = self._tags_for_style(style)
                        if node.start:
                            s += tags[0]
                        else:
                            s += tags[1]

            # TODO: Refactor pycaption and eliminate the concept of a
            # "Style node"
            elif node.type_ == CaptionNode.BREAK:
                # A break that does not directly follow text gets an
                # '&nbsp;' placeholder so that the line is not empty.
                if i > 0 and nodes[i - 1].type_ != CaptionNode.TEXT:
                    s += '&nbsp;'
                if i == 0:  # cue text starts with a break
                    s += '&nbsp;'
                s += '\n'

        if s:
            layout_groups.append((s, current_layout))
        return layout_groups

    def _encode(self, s):
        """
        Convert cue text from plain unicode to WebVTT XML-like format
        escaping illegal characters. For a list of illegal characters see:
            - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-text-span

        :type s: unicode
        :rtype: unicode
        """
        s = s.replace('&', '&amp;')
        s = s.replace('<', '&lt;')

        # The substring "-->" is also not allowed according to this:
        #   - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-block
        s = s.replace('-->', '--&gt;')

        # '>', U+200E, U+200F and U+00A0 also have escape codes, but they are
        # not illegal in cue text, so they are deliberately left unescaped
        # to stay as close as possible to the specification.

        return s