Netflix-videos-downloader/utils/modules/pycaption/webvtt.py

import re
import six
import sys
import datetime
from copy import deepcopy


from .base import (
    BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode
)

from .geometry import Layout

from .exceptions import (
    CaptionReadError, CaptionReadSyntaxError, CaptionReadNoCaptions,
    InvalidInputError
)

# A WebVTT timing line has both start/end times and layout related settings
# (referred to as 'cue settings' in the documentation)
# The following pattern captures [start], [end] and [cue settings] if existent
from pycaption.geometry import HorizontalAlignmentEnum

TIMING_LINE_PATTERN = re.compile('^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$')
TIMESTAMP_PATTERN = re.compile('^(\d+):(\d{2})(:\d{2})?\.(\d{3})')
VOICE_SPAN_PATTERN = re.compile('<v(\\.\\w+)* ([^>]*)>')
OTHER_SPAN_PATTERN = (
    re.compile(
        '</?([cibuv]|ruby|rt|lang|(\d+):(\d{2})(:\d{2})?\.(\d{3})).*?>'
    )
)  # These WebVTT tags are stripped off the cues on conversion

WEBVTT_VERSION_OF = {
    HorizontalAlignmentEnum.LEFT: 'left',
    HorizontalAlignmentEnum.CENTER: 'middle',
    HorizontalAlignmentEnum.RIGHT: 'right',
    HorizontalAlignmentEnum.START: 'start',
    HorizontalAlignmentEnum.END: 'end'
}

DEFAULT_ALIGNMENT = 'middle'


def microseconds(h, m, s, f):
    """
    Returns an integer representing a number of microseconds
    :rtype: int
    """
    return (int(h) * 3600 + int(m) * 60 + int(s)) * 1000000 + int(f) * 1000


class WebVTTReader(BaseReader):
    def __init__(self, ignore_timing_errors=True, *args, **kwargs):
        """
        :param ignore_timing_errors: Whether to ignore timing checks
        """
        self.ignore_timing_errors = ignore_timing_errors

    def detect(self, content):
        return 'WEBVTT' in content

    def read(self, content, lang='en-US'):
        if type(content) != six.text_type:
            raise InvalidInputError('The content is not a unicode string.')

        caption_set = CaptionSet({lang: self._parse(content.splitlines())})

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    def _parse(self, lines):
        captions = CaptionList()
        start = None
        end = None
        nodes = []
        layout_info = None
        found_timing = False

        for i, line in enumerate(lines):

            if '-->' in line:
                found_timing = True
                timing_line = i
                last_start_time = captions[-1].start if captions else 0
                try:
                    start, end, layout_info = self._parse_timing_line(
                        line, last_start_time)
                except CaptionReadError as e:
                    new_message = '%s (line %d)' % (e.args[0], timing_line)
                    six.reraise(type(e), type(e)(new_message), sys.exc_info()[2])

            elif '' == line:
                if found_timing:
                    if not nodes:
                        raise CaptionReadSyntaxError(
                            'Cue without content. (line %d)' % timing_line)
                    else:
                        found_timing = False
                        caption = Caption(
                            start, end, nodes, layout_info=layout_info)
                        captions.append(caption)
                        nodes = []
            else:
                if found_timing:
                    if nodes:
                        nodes.append(CaptionNode.create_break())
                    nodes.append(CaptionNode.create_text(
                        self._decode(line)))
                else:
                    # it's a comment or some metadata; ignore it
                    pass

        # Add a last caption if there are remaining nodes
        if nodes:
            caption = Caption(start, end, nodes, layout_info=layout_info)
            captions.append(caption)

        return captions

    def _remove_styles(self, line):
        partial_result = VOICE_SPAN_PATTERN.sub('\\2: ', line)
        return OTHER_SPAN_PATTERN.sub('', partial_result)

    def _validate_timings(self, start, end, last_start_time):
        if start is None:
            raise CaptionReadSyntaxError(
                'Invalid cue start timestamp.')
        if end is None:
            raise CaptionReadSyntaxError('Invalid cue end timestamp.')
        if start > end:
            raise CaptionReadError(
                'End timestamp is not greater than start timestamp.')
        if start < last_start_time:
            raise CaptionReadError(
                'Start timestamp is not greater than or equal'
                'to start timestamp of previous cue.')

    def _parse_timing_line(self, line, last_start_time):
        """
        :returns: Tuple (int, int, Layout)
        """
        m = TIMING_LINE_PATTERN.search(line)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        start = self._parse_timestamp(m.group(1))
        end = self._parse_timestamp(m.group(2))

        cue_settings = m.group(3)

        if not self.ignore_timing_errors:
            self._validate_timings(start, end, last_start_time)

        layout_info = None
        if cue_settings:
            layout_info = Layout(webvtt_positioning=cue_settings)

        return start, end, layout_info

    def _parse_timestamp(self, timestamp):
        """Returns an integer representing a number of microseconds
        :rtype: int
        """
        m = TIMESTAMP_PATTERN.search(timestamp)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        m = m.groups()

        if m[2]:
            # Timestamp takes the form of [hours]:[minutes]:[seconds].[milliseconds]
            return microseconds(m[0], m[1], m[2].replace(":", ""), m[3])
        else:
            # Timestamp takes the form of [minutes]:[seconds].[milliseconds]
            return microseconds(0, m[0], m[1], m[3])

    def _decode(self, s):
        """
        Convert cue text from WebVTT XML-like format to plain unicode.
        :type s: unicode
        """
        s = s.strip()
        # Covert voice span
        s = VOICE_SPAN_PATTERN.sub('\\2: ', s)
        # TODO: Add support for other WebVTT tags. For now just strip them
        # off the text.
        s = OTHER_SPAN_PATTERN.sub('', s)
        # Replace WebVTT special XML codes with plain unicode values
        s = s.replace('&lt;', '<')
        s = s.replace('&gt;', '>')
        s = s.replace('&lrm;', '\u200e')
        s = s.replace('&rlm;', '\u200f')
        s = s.replace('&nbsp;', '\u00a0')
        # Must do ampersand last
        s = s.replace('&amp;', '&')
        return s


class WebVTTWriter(BaseWriter):
    HEADER = 'WEBVTT\n\n'
    global_layout = None
    video_width = None
    video_height = None

    def write(self, caption_set):
        """
        :type caption_set: CaptionSet
        """
        output = self.HEADER

        if caption_set.is_empty():
            return output

        caption_set = deepcopy(caption_set)

        # TODO: styles. These go into a separate CSS file, which doesn't really
        # fit the API here. Figure that out.  Though some style stuff can be
        # done in-line.  This format is a little bit crazy.

        # WebVTT's language support seems to be a bit crazy, so let's just
        # support a single one for now.
        lang = list(caption_set.get_languages())[0]

        self.global_layout = caption_set.get_layout_info(lang)

        captions = caption_set.get_captions(lang)

        return output + '\n'.join(
            [self._write_caption(caption_set, caption) for caption in captions])

    def _timestamp(self, ts):
        td = datetime.timedelta(microseconds=ts)
        mm, ss = divmod(td.seconds, 60)
        hh, mm = divmod(mm, 60)
        s = "%02d:%02d.%03d" % (mm, ss, td.microseconds/1000)
        if hh:
            s = "%d:%s" % (hh, s)
        return s

    def _tags_for_style(self, style):
        if style == 'italics':
            return ['<i>', '</i>']
        elif style == 'underline':
            return ['<u>', '</u>']
        elif style == 'bold':
            return ['<b>', '</b>']
        else:
            return ['', '']

    def _calculate_resulting_style(self, style, caption_set):
        resulting_style = {}

        style_classes = []
        if 'classes' in style:
            style_classes = style['classes']
        elif 'class' in style:
            style_classes = [style['class']]

        for style_class in style_classes:
            sub_style = caption_set.get_style(style_class).copy()
            # Recursively resolve class attributes and calculate style
            resulting_style.update(self._calculate_resulting_style(sub_style, caption_set))

        resulting_style.update(style)

        return resulting_style

    def _write_caption(self, caption_set, caption):
        """
        :type caption: Caption
        """
        layout_groups = self._layout_groups(caption.nodes, caption_set)

        start = self._timestamp(caption.start)
        end = self._timestamp(caption.end)
        timespan = "{} --> {}".format(start, end)

        output = ''

        cue_style_tags = ['', '']

        style = self._calculate_resulting_style(caption.style, caption_set)
        for key, value in sorted(style.items()):
            if value:
                tags = self._tags_for_style(key)
#                    print "tags: " + str(tags) + "\n"
                cue_style_tags[0] += tags[0]
                cue_style_tags[1] = tags[1] + cue_style_tags[1]

        for cue_text, layout in layout_groups:
            if not layout:
                layout = caption.layout_info or self.global_layout
            cue_settings = self._cue_settings_from(layout)
            output += timespan + cue_settings + '\n'
            output += cue_style_tags[0] + cue_text + cue_style_tags[1] + '\n'

        return output

    def _cue_settings_from(self, layout):
        """
        Return WebVTT cue settings string based on layout info
        :type layout: Layout
        :rtype: unicode
        """
        if not layout:
            return ''

        # If it's converting from WebVTT to WebVTT, keep positioning info
        # unchanged
        if layout.webvtt_positioning:
            return ' {}'.format(layout.webvtt_positioning)

        left_offset = None
        top_offset = None
        cue_width = None
        alignment = None

        already_relative = False
        if not self.relativize:
            if layout.is_relative():
                already_relative = True
            else:
                # There are absolute positioning values for this cue but the
                # Writer is explicitly configured not to do any relativization.
                # Ignore all positioning for this cue.
                return ''

        # Ensure that all positioning values are measured using percentage.
        # This may raise an exception if layout.is_relative() == False
        # If you want to avoid it, you have to turn off relativization by
        # initializing this Writer with relativize=False.
        if not already_relative:
            layout = layout.as_percentage_of(
                self.video_width, self.video_height)

        # Ensure that when there's a left offset the caption is not pushed out
        # of the screen. If the execution got this far it means origin and
        # extent are already relative by now.
        if self.fit_to_screen:
            layout = layout.fit_to_screen()

        if layout.origin:
            left_offset = layout.origin.x
            top_offset = layout.origin.y

        if layout.extent:
            cue_width = layout.extent.horizontal

        if layout.padding:
            if layout.padding.start and left_offset:
                # Since there is no padding in WebVTT, the left padding is
                # added to the total left offset (if it is defined and not
                # relative),
                if left_offset:
                    left_offset += layout.padding.start
                # and removed from the total cue width
                if cue_width:
                    cue_width -= layout.padding.start
            # the right padding is cut out of the total cue width,
            if layout.padding.end and cue_width:
                cue_width -= layout.padding.end
            # the top padding is added to the top offset
            # (if it is defined and not relative)
            if layout.padding.before and top_offset:
                top_offset += layout.padding.before
            # and the bottom padding is ignored because the cue box is only as
            # long vertically as the text it contains and nothing can be cut
            # out

        try:
            alignment = WEBVTT_VERSION_OF[layout.alignment.horizontal]
        except (AttributeError, KeyError):
            pass

        cue_settings = ''

        if alignment and alignment != 'middle':
            cue_settings += " align:" + alignment
        if left_offset:
            cue_settings += " position:{},start".format(six.text_type(left_offset))
        if top_offset:
            cue_settings += " line:" + six.text_type(top_offset)
        if cue_width:
            cue_settings += " size:" + six.text_type(cue_width)

        return cue_settings

    def _layout_groups(self, nodes, caption_set):
        """
        Convert a Caption's nodes to WebVTT cue or cues (depending on
        whether they have the same positioning or not).
        """
        if not nodes:
            return []

        current_layout = None

        # A list with layout groups. Since WebVTT only support positioning
        # for different cues, each layout group has to be represented in a
        # new cue with the same timing but different positioning settings.
        layout_groups = []
        # A properly encoded WebVTT string (plain unicode must be properly
        # escaped before being appended to this string)
        s = ''
        for i, node in enumerate(nodes):
            if node.type_ == CaptionNode.TEXT:
                if s and current_layout and node.layout_info != current_layout:
                    # If the positioning changes from one text node to
                    # another, a new WebVTT cue has to be created.
                    layout_groups.append((s, current_layout))
                    s = ''
                # ATTENTION: This is where the plain unicode node content is
                # finally encoded as WebVTT.
                s += self._encode(node.content) or '&nbsp;'
                current_layout = node.layout_info
            elif node.type_ == CaptionNode.STYLE:
                resulting_style = self._calculate_resulting_style(node.content, caption_set)

                styles = ['italics', 'underline', 'bold']
                if not node.start:
                    styles.reverse()

                for style in styles:
                    if style in resulting_style and resulting_style[style]:
                        tags = self._tags_for_style(style)
                        if node.start:
                            s += tags[0]
                        else:
                            s += tags[1]

                # TODO: Refactor pycaption and eliminate the concept of a
                # "Style node"
            elif node.type_ == CaptionNode.BREAK:
                if i > 0 and nodes[i - 1].type_ != CaptionNode.TEXT:
                    s += '&nbsp;'
                if i == 0:  # cue text starts with a break
                    s += '&nbsp;'
                s += '\n'

        if s:
            layout_groups.append((s, current_layout))
        return layout_groups

    def _encode(self, s):
        """
        Convert cue text from plain unicode to WebVTT XML-like format
        escaping illegal characters. For a list of illegal characters see:
            - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-text-span
        :type s: unicode
        """
        s = s.replace('&', '&amp;')
        s = s.replace('<', '&lt;')

        # The substring "-->" is also not allowed according to this:
        #   - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-block
        s = s.replace('-->', '--&gt;')

        # The following characters have escaping codes for some reason, but
        # they're not illegal, so for now I'll leave this commented out so that
        # we stay as close as possible to the specification and avoid doing
        # extra stuff "just to be safe".
        # s = s.replace(u'>', u'&gt;')
        # s = s.replace(u'\u200e', u'&lrm;')
        # s = s.replace(u'\u200f', u'&rlm;')
        # s = s.replace(u'\u00a0', u'&nbsp;')
        return s