upload

2021-09-01 02:57:54 +05:00
parent 9df940f1fd
commit bf3c3712dd
222 changed files with 1007430 additions and 0 deletions
--- a/utils/modules/pycaption/base.py
+++ b/utils/modules/pycaption/base.py
@@ -0,0 +1,409 @@
+from datetime import timedelta
+from numbers import Number
+from six import text_type
+
+from .exceptions import CaptionReadError, CaptionReadTimingError
+
+DEFAULT_LANGUAGE_CODE = 'en-US'
+
+
+def force_byte_string(content):
+    try:
+        return content.encode('UTF-8')
+    except UnicodeEncodeError:
+        raise RuntimeError('Invalid content encoding')
+    except UnicodeDecodeError:
+        return content
+
+
+class CaptionConverter(object):
+    def __init__(self, captions=None):
+        self.captions = captions if captions else []
+
+    def read(self, content, caption_reader):
+        try:
+            self.captions = caption_reader.read(content)
+        except AttributeError as e:
+            raise Exception(e)
+        return self
+
+    def write(self, caption_writer):
+        try:
+            return caption_writer.write(self.captions)
+        except AttributeError as e:
+            raise Exception(e)
+
+
+class BaseReader(object):
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def detect(self, content):
+        if content:
+            return True
+        else:
+            return False
+
+    def read(self, content):
+        return CaptionSet()
+
+
+class BaseWriter(object):
+    def __init__(self, relativize=True, video_width=None, video_height=None,
+                 fit_to_screen=True):
+        """
+        Initialize writer with the given parameters.
+
+        :param relativize: If True (default), converts absolute positioning
+            values (e.g. px) to percentage. ATTENTION: WebVTT does not support
+            absolute positioning. If relativize is set to False and it finds
+            an absolute positioning parameter for a given caption, it will
+            ignore all positioning for that cue and show it in the default
+            position.
+        :param video_width: The width of the video for which the captions being
+            converted were made. This is necessary for relativization.
+        :param video_height: The height of the video for which the captions
+            being converted were made. This is necessary for relativization.
+        :param fit_to_screen: If extent is not set or
+            if origin + extent > 100%, (re)calculate it based on origin.
+            It is a pycaption fix for caption files that are technically valid
+            but contains inconsistent settings that may cause long captions to
+            be cut out of the screen.
+        """
+        self.relativize = relativize
+        self.video_width = video_width
+        self.video_height = video_height
+        self.fit_to_screen = fit_to_screen
+
+    def _relativize_and_fit_to_screen(self, layout_info):
+        if layout_info:
+            if self.relativize:
+                # Transform absolute values (e.g. px) into percentages
+                layout_info = layout_info.as_percentage_of(
+                    self.video_width, self.video_height)
+            if self.fit_to_screen:
+                # Make sure origin + extent <= 100%
+                layout_info = layout_info.fit_to_screen()
+        return layout_info
+
+    def write(self, content):
+        return content
+
+
+class Style(object):
+    def __init__(self):
+        pass
+
+
+class CaptionNode(object):
+    """
+    A single node within a caption, representing either
+    text, a style, or a linebreak.
+
+    Rules:
+        1. All nodes should have the property layout_info set.
+        The value None means specifically that no positioning information
+        should be specified. Each reader is to supply its own default
+        values (if necessary) when reading their respective formats.
+    """
+
+    TEXT = 1
+    # When and if this is extended, it might be better to turn it into a
+    # property of the node, not a type of node itself.
+    STYLE = 2
+    BREAK = 3
+
+    def __init__(self, type_, layout_info=None):
+        """
+        :type type_: int
+        :type layout_info: Layout
+        """
+        self.type_ = type_
+        self.content = None
+
+        # Boolean. Marks the beginning/ end of a Style node.
+        self.start = None
+        self.layout_info = layout_info
+
+    def __repr__(self):
+        t = self.type_
+
+        if t == CaptionNode.TEXT:
+            return repr(self.content)
+        elif t == CaptionNode.BREAK:
+            return repr('BREAK')
+        elif t == CaptionNode.STYLE:
+            return repr('STYLE: %s %s' % (self.start, self.content))
+        else:
+            raise RuntimeError('Unknown node type: ' + str(t))
+
+    @staticmethod
+    def create_text(text, layout_info=None):
+        data = CaptionNode(CaptionNode.TEXT, layout_info=layout_info)
+        data.content = text
+        return data
+
+    @staticmethod
+    def create_style(start, content, layout_info=None):
+        data = CaptionNode(CaptionNode.STYLE, layout_info=layout_info)
+        data.content = content
+        data.start = start
+        return data
+
+    @staticmethod
+    def create_break(layout_info=None):
+        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
+
+
+class Caption(object):
+    """
+    A single caption, including the time and styling information
+    for its display.
+    """
+    def __init__(self, start, end, nodes, style={}, layout_info=None):
+        """
+        Initialize the Caption object
+        :param start: The start time in microseconds
+        :type start: Number
+        :param end: The end time in microseconds
+        :type end: Number
+        :param nodes: A list of CaptionNodes
+        :type nodes: list
+        :param style: A dictionary with CSS-like styling rules
+        :type style: dict
+        :param layout_info: A Layout object with the necessary positioning
+            information
+        :type layout_info: Layout
+        """
+        if not isinstance(start, Number):
+            raise CaptionReadTimingError("Captions must be initialized with a"
+                                         " valid start time")
+        if not isinstance(end, Number):
+            raise CaptionReadTimingError("Captions must be initialized with a"
+                                         " valid end time")
+        if not nodes:
+            raise CaptionReadError("Node list cannot be empty")
+        self.start = start
+        self.end = end
+        self.nodes = nodes
+        self.style = style
+        self.layout_info = layout_info
+
+    def is_empty(self):
+        return len(self.nodes) == 0
+
+    def format_start(self, msec_separator=None):
+        """
+        Format the start time value in milliseconds into a string
+        value suitable for some of the supported output formats (ex.
+        SRT, DFXP).
+        """
+        return self._format_timestamp(self.start, msec_separator)
+
+    def format_end(self, msec_separator=None):
+        """
+        Format the end time value in milliseconds into a string value suitable
+        for some of the supported output formats (ex. SRT, DFXP).
+        """
+        return self._format_timestamp(self.end, msec_separator)
+
+    def __repr__(self):
+        return repr(
+            '{start} --> {end}\n{text}'.format(
+                start=self.format_start(),
+                end=self.format_end(),
+                text=self.get_text()
+            )
+        )
+
+    def get_text(self):
+        """
+        Get the text of the caption.
+        """
+        def get_text_for_node(node):
+            if node.type_ == CaptionNode.TEXT:
+                return node.content
+            if node.type_ == CaptionNode.BREAK:
+                return '\n'
+            return ''
+        text_nodes = [get_text_for_node(node) for node in self.nodes]
+        return ''.join(text_nodes).strip()
+
+    def _format_timestamp(self, value, msec_separator=None):
+        datetime_value = timedelta(milliseconds=(int(value / 1000)))
+
+        str_value = text_type(datetime_value)[:11]
+        if not datetime_value.microseconds:
+            str_value += '.000'
+
+        if msec_separator is not None:
+            str_value = str_value.replace(".", msec_separator)
+
+        return '0' + str_value
+
+
+class CaptionList(list):
+    """ A list of captions with a layout object attached to it """
+    def __init__(self, iterable=None, layout_info=None):
+        """
+        :param iterable: An iterator used to populate the caption list
+        :param Layout layout_info: A Layout object with the positioning info
+        """
+        self.layout_info = layout_info
+        args = [iterable] if iterable else []
+        super(CaptionList, self).__init__(*args)
+
+    def __getslice__(self, i, j):
+        return CaptionList(
+            list.__getslice__(self, i, j), layout_info=self.layout_info)
+
+    def __getitem__(self, y):
+        item = list.__getitem__(self, y)
+        if isinstance(item, Caption):
+            return item
+        return CaptionList(item, layout_info=self.layout_info)
+
+    def __add__(self, other):
+        add_is_safe = (
+            not hasattr(other, 'layout_info') or
+            not other.layout_info or
+            self.layout_info == other.layout_info
+        )
+        if add_is_safe:
+            return CaptionList(
+                list.__add__(self, other), layout_info=self.layout_info)
+        else:
+            raise ValueError(
+                "Cannot add CaptionList objects with different layout_info")
+
+    def __mul__(self, other):
+        return CaptionList(
+            list.__mul__(self, other), layout_info=self.layout_info)
+
+    __rmul__ = __mul__
+
+
+class CaptionSet(object):
+    """
+    A set of captions in potentially multiple languages,
+    all representing the same underlying content.
+
+    The .layout_info attribute, keeps information that should be inherited
+    by all the children.
+    """
+    def __init__(self, captions, styles={}, layout_info=None):
+        """
+        :param captions: A dictionary of the format {'language': CaptionList}
+        :param styles: A dictionary with CSS-like styling rules
+        :param Layout layout_info: A Layout object with the positioning info
+        """
+        self._captions = captions
+        self._styles = styles
+        self.layout_info = layout_info
+
+    def set_captions(self, lang, captions):
+        self._captions[lang] = captions
+
+    def get_languages(self):
+        return list(self._captions.keys())
+
+    def get_captions(self, lang):
+        return self._captions.get(lang, [])
+
+    def add_style(self, selector, rules):
+        """
+        :param selector: The selector indicating the elements to which the
+            rules should be applied.
+        :param rules: A dictionary with CSS-like styling rules.
+        """
+        self._styles[selector] = rules
+
+    def get_style(self, selector):
+        """
+        Returns a dictionary with CSS-like styling rules for a given selector.
+        :param selector: The selector whose rules should be returned (e.g. an
+            element or class name).
+        """
+        return self._styles.get(selector, {})
+
+    def get_styles(self):
+        return sorted(self._styles.items())
+
+    def set_styles(self, styles):
+        self._styles = styles
+
+    def is_empty(self):
+        return all(
+            [len(captions) == 0 for captions in list(self._captions.values())]
+        )
+
+    def set_layout_info(self, lang, layout_info):
+        self._captions[lang].layout_info = layout_info
+
+    def get_layout_info(self, lang):
+        caption_list = self._captions.get(lang)
+        if caption_list:
+            return caption_list.layout_info
+        return None
+
+    def adjust_caption_timing(self, offset=0, rate_skew=1.0):
+        """
+        Adjust the timing according to offset and rate_skew.
+        Skew is applied first, then offset.
+
+        e.g. if skew == 1.1, and offset is 5, a caption originally
+        displayed from 10-11 seconds would instead be at 16-17.1
+        """
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                caption.start = caption.start * rate_skew + offset
+                caption.end = caption.end * rate_skew + offset
+                if caption.start >= 0:
+                    out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
+
+# Functions
+def merge_concurrent_captions(caption_set):
+    """Merge captions that have the same start and end times"""
+    for lang in caption_set.get_languages():
+        captions = caption_set.get_captions(lang)
+        last_caption = None
+        concurrent_captions = CaptionList()
+        merged_captions = CaptionList()
+        for caption in captions:
+            if last_caption:
+                last_timespan = last_caption.start, last_caption.end
+                current_timespan = caption.start, caption.end
+                if current_timespan == last_timespan:
+                    concurrent_captions.append(caption)
+                    last_caption = caption
+                    continue
+                else:
+                    merged_captions.append(merge(concurrent_captions))
+            concurrent_captions = [caption]
+            last_caption = caption
+
+        if concurrent_captions:
+            merged_captions.append(merge(concurrent_captions))
+        if merged_captions:
+            caption_set.set_captions(lang, merged_captions)
+    return caption_set
+
+
+def merge(captions):
+    """
+    Merge list of captions into one caption. The start/end times from the first
+    caption are kept.
+    """
+    new_nodes = []
+    for caption in captions:
+        if new_nodes:
+            new_nodes.append(CaptionNode.create_break())
+        for node in caption.nodes:
+            new_nodes.append(node)
+    caption = Caption(
+        captions[0].start, captions[0].end, new_nodes, captions[0].style)
+    return caption