upload

34   utils/modules/pycaption/__init__.py   Normal file
@@ -0,0 +1,34 @@
from .base import (
    CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet)
from .dfxp import DFXPWriter, DFXPReader
from .sami import SAMIReader, SAMIWriter
from .srt import SRTReader, SRTWriter
from .scc import SCCReader, SCCWriter
from .webvtt import WebVTTReader, WebVTTWriter
from .exceptions import (
    CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError)


__all__ = [
    'CaptionConverter', 'DFXPReader', 'DFXPWriter',
    'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter',
    'SCCReader', 'SCCWriter', 'WebVTTReader', 'WebVTTWriter',
    'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError',
    'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet'
]

SUPPORTED_READERS = (
    DFXPReader, WebVTTReader, SAMIReader, SRTReader, SCCReader)


def detect_format(caps):
    """
    Detect the format of the provided caption string.

    :returns: the reader class for the detected format.
    """
    for reader in SUPPORTED_READERS:
        if reader().detect(caps):
            return reader

    return None
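
Illustrative usage (not part of the commit): a minimal sketch of how this public API is typically driven, assuming the package is importable as pycaption and that content already holds the caption text as a unicode string; to_dfxp is a made-up helper name.

from pycaption import detect_format, DFXPWriter

def to_dfxp(content):
    reader_cls = detect_format(content)  # e.g. SRTReader or WebVTTReader; None if unrecognized
    if reader_cls is None:
        raise ValueError('No supported caption format detected')
    caption_set = reader_cls().read(content)
    return DFXPWriter().write(caption_set)
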
BIN   utils/modules/pycaption/__pycache__/__init__.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/base.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/exceptions.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/geometry.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/sami.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/srt.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/utils.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/__pycache__/webvtt.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
409   utils/modules/pycaption/base.py   Normal file
@@ -0,0 +1,409 @@
from datetime import timedelta
from numbers import Number
from six import text_type

from .exceptions import CaptionReadError, CaptionReadTimingError

DEFAULT_LANGUAGE_CODE = 'en-US'


def force_byte_string(content):
    try:
        return content.encode('UTF-8')
    except UnicodeEncodeError:
        raise RuntimeError('Invalid content encoding')
    except UnicodeDecodeError:
        return content


class CaptionConverter(object):
    def __init__(self, captions=None):
        self.captions = captions if captions else []

    def read(self, content, caption_reader):
        try:
            self.captions = caption_reader.read(content)
        except AttributeError as e:
            raise Exception(e)
        return self

    def write(self, caption_writer):
        try:
            return caption_writer.write(self.captions)
        except AttributeError as e:
            raise Exception(e)


class BaseReader(object):
    def __init__(self, *args, **kwargs):
        pass

    def detect(self, content):
        if content:
            return True
        else:
            return False

    def read(self, content):
        return CaptionSet()


class BaseWriter(object):
    def __init__(self, relativize=True, video_width=None, video_height=None,
                 fit_to_screen=True):
        """
        Initialize writer with the given parameters.

        :param relativize: If True (default), converts absolute positioning
            values (e.g. px) to percentage. ATTENTION: WebVTT does not support
            absolute positioning. If relativize is set to False and it finds
            an absolute positioning parameter for a given caption, it will
            ignore all positioning for that cue and show it in the default
            position.
        :param video_width: The width of the video for which the captions being
            converted were made. This is necessary for relativization.
        :param video_height: The height of the video for which the captions
            being converted were made. This is necessary for relativization.
        :param fit_to_screen: If extent is not set or
            if origin + extent > 100%, (re)calculate it based on origin.
            It is a pycaption fix for caption files that are technically valid
            but contains inconsistent settings that may cause long captions to
            be cut out of the screen.
        """
        self.relativize = relativize
        self.video_width = video_width
        self.video_height = video_height
        self.fit_to_screen = fit_to_screen

    def _relativize_and_fit_to_screen(self, layout_info):
        if layout_info:
            if self.relativize:
                # Transform absolute values (e.g. px) into percentages
                layout_info = layout_info.as_percentage_of(
                    self.video_width, self.video_height)
            if self.fit_to_screen:
                # Make sure origin + extent <= 100%
                layout_info = layout_info.fit_to_screen()
        return layout_info

    def write(self, content):
        return content


class Style(object):
    def __init__(self):
        pass


class CaptionNode(object):
    """
    A single node within a caption, representing either
    text, a style, or a linebreak.

    Rules:
        1. All nodes should have the property layout_info set.
        The value None means specifically that no positioning information
        should be specified. Each reader is to supply its own default
        values (if necessary) when reading their respective formats.
    """

    TEXT = 1
    # When and if this is extended, it might be better to turn it into a
    # property of the node, not a type of node itself.
    STYLE = 2
    BREAK = 3

    def __init__(self, type_, layout_info=None):
        """
        :type type_: int
        :type layout_info: Layout
        """
        self.type_ = type_
        self.content = None

        # Boolean. Marks the beginning/ end of a Style node.
        self.start = None
        self.layout_info = layout_info

    def __repr__(self):
        t = self.type_

        if t == CaptionNode.TEXT:
            return repr(self.content)
        elif t == CaptionNode.BREAK:
            return repr('BREAK')
        elif t == CaptionNode.STYLE:
            return repr('STYLE: %s %s' % (self.start, self.content))
        else:
            raise RuntimeError('Unknown node type: ' + str(t))

    @staticmethod
    def create_text(text, layout_info=None):
        data = CaptionNode(CaptionNode.TEXT, layout_info=layout_info)
        data.content = text
        return data

    @staticmethod
    def create_style(start, content, layout_info=None):
        data = CaptionNode(CaptionNode.STYLE, layout_info=layout_info)
        data.content = content
        data.start = start
        return data

    @staticmethod
    def create_break(layout_info=None):
        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)


class Caption(object):
    """
    A single caption, including the time and styling information
    for its display.
    """
    def __init__(self, start, end, nodes, style={}, layout_info=None):
        """
        Initialize the Caption object
        :param start: The start time in microseconds
        :type start: Number
        :param end: The end time in microseconds
        :type end: Number
        :param nodes: A list of CaptionNodes
        :type nodes: list
        :param style: A dictionary with CSS-like styling rules
        :type style: dict
        :param layout_info: A Layout object with the necessary positioning
            information
        :type layout_info: Layout
        """
        if not isinstance(start, Number):
            raise CaptionReadTimingError("Captions must be initialized with a"
                                         " valid start time")
        if not isinstance(end, Number):
            raise CaptionReadTimingError("Captions must be initialized with a"
                                         " valid end time")
        if not nodes:
            raise CaptionReadError("Node list cannot be empty")
        self.start = start
        self.end = end
        self.nodes = nodes
        self.style = style
        self.layout_info = layout_info

    def is_empty(self):
        return len(self.nodes) == 0

    def format_start(self, msec_separator=None):
        """
        Format the start time value in milliseconds into a string
        value suitable for some of the supported output formats (ex.
        SRT, DFXP).
        """
        return self._format_timestamp(self.start, msec_separator)

    def format_end(self, msec_separator=None):
        """
        Format the end time value in milliseconds into a string value suitable
        for some of the supported output formats (ex. SRT, DFXP).
        """
        return self._format_timestamp(self.end, msec_separator)

    def __repr__(self):
        return repr(
            '{start} --> {end}\n{text}'.format(
                start=self.format_start(),
                end=self.format_end(),
                text=self.get_text()
            )
        )

    def get_text(self):
        """
        Get the text of the caption.
        """
        def get_text_for_node(node):
            if node.type_ == CaptionNode.TEXT:
                return node.content
            if node.type_ == CaptionNode.BREAK:
                return '\n'
            return ''
        text_nodes = [get_text_for_node(node) for node in self.nodes]
        return ''.join(text_nodes).strip()

    def _format_timestamp(self, value, msec_separator=None):
        datetime_value = timedelta(milliseconds=(int(value / 1000)))

        str_value = text_type(datetime_value)[:11]
        if not datetime_value.microseconds:
            str_value += '.000'

        if msec_separator is not None:
            str_value = str_value.replace(".", msec_separator)

        return '0' + str_value


class CaptionList(list):
    """ A list of captions with a layout object attached to it """
    def __init__(self, iterable=None, layout_info=None):
        """
        :param iterable: An iterator used to populate the caption list
        :param Layout layout_info: A Layout object with the positioning info
        """
        self.layout_info = layout_info
        args = [iterable] if iterable else []
        super(CaptionList, self).__init__(*args)

    def __getslice__(self, i, j):
        return CaptionList(
            list.__getslice__(self, i, j), layout_info=self.layout_info)

    def __getitem__(self, y):
        item = list.__getitem__(self, y)
        if isinstance(item, Caption):
            return item
        return CaptionList(item, layout_info=self.layout_info)

    def __add__(self, other):
        add_is_safe = (
            not hasattr(other, 'layout_info') or
            not other.layout_info or
            self.layout_info == other.layout_info
        )
        if add_is_safe:
            return CaptionList(
                list.__add__(self, other), layout_info=self.layout_info)
        else:
            raise ValueError(
                "Cannot add CaptionList objects with different layout_info")

    def __mul__(self, other):
        return CaptionList(
            list.__mul__(self, other), layout_info=self.layout_info)

    __rmul__ = __mul__


class CaptionSet(object):
    """
    A set of captions in potentially multiple languages,
    all representing the same underlying content.

    The .layout_info attribute, keeps information that should be inherited
    by all the children.
    """
    def __init__(self, captions, styles={}, layout_info=None):
        """
        :param captions: A dictionary of the format {'language': CaptionList}
        :param styles: A dictionary with CSS-like styling rules
        :param Layout layout_info: A Layout object with the positioning info
        """
        self._captions = captions
        self._styles = styles
        self.layout_info = layout_info

    def set_captions(self, lang, captions):
        self._captions[lang] = captions

    def get_languages(self):
        return list(self._captions.keys())

    def get_captions(self, lang):
        return self._captions.get(lang, [])

    def add_style(self, selector, rules):
        """
        :param selector: The selector indicating the elements to which the
            rules should be applied.
        :param rules: A dictionary with CSS-like styling rules.
        """
        self._styles[selector] = rules

    def get_style(self, selector):
        """
        Returns a dictionary with CSS-like styling rules for a given selector.
        :param selector: The selector whose rules should be returned (e.g. an
            element or class name).
        """
        return self._styles.get(selector, {})

    def get_styles(self):
        return sorted(self._styles.items())

    def set_styles(self, styles):
        self._styles = styles

    def is_empty(self):
        return all(
            [len(captions) == 0 for captions in list(self._captions.values())]
        )

    def set_layout_info(self, lang, layout_info):
        self._captions[lang].layout_info = layout_info

    def get_layout_info(self, lang):
        caption_list = self._captions.get(lang)
        if caption_list:
            return caption_list.layout_info
        return None

    def adjust_caption_timing(self, offset=0, rate_skew=1.0):
        """
        Adjust the timing according to offset and rate_skew.
        Skew is applied first, then offset.

        e.g. if skew == 1.1, and offset is 5, a caption originally
        displayed from 10-11 seconds would instead be at 16-17.1
        """
        for lang in self.get_languages():
            captions = self.get_captions(lang)
            out_captions = CaptionList()
            for caption in captions:
                caption.start = caption.start * rate_skew + offset
                caption.end = caption.end * rate_skew + offset
                if caption.start >= 0:
                    out_captions.append(caption)
            self.set_captions(lang, out_captions)


# Functions
def merge_concurrent_captions(caption_set):
    """Merge captions that have the same start and end times"""
    for lang in caption_set.get_languages():
        captions = caption_set.get_captions(lang)
        last_caption = None
        concurrent_captions = CaptionList()
        merged_captions = CaptionList()
        for caption in captions:
            if last_caption:
                last_timespan = last_caption.start, last_caption.end
                current_timespan = caption.start, caption.end
                if current_timespan == last_timespan:
                    concurrent_captions.append(caption)
                    last_caption = caption
                    continue
                else:
                    merged_captions.append(merge(concurrent_captions))
            concurrent_captions = [caption]
            last_caption = caption

        if concurrent_captions:
            merged_captions.append(merge(concurrent_captions))
        if merged_captions:
            caption_set.set_captions(lang, merged_captions)
    return caption_set


def merge(captions):
    """
    Merge list of captions into one caption. The start/end times from the first
    caption are kept.
    """
    new_nodes = []
    for caption in captions:
        if new_nodes:
            new_nodes.append(CaptionNode.create_break())
        for node in caption.nodes:
            new_nodes.append(node)
    caption = Caption(
        captions[0].start, captions[0].end, new_nodes, captions[0].style)
    return caption
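
Illustrative usage (not part of the commit): a minimal sketch of how the base classes above fit together, with made-up text and an en-US CaptionList; times are in microseconds, so 1000000 is one second.

from pycaption.base import Caption, CaptionList, CaptionNode, CaptionSet

nodes = [
    CaptionNode.create_text('Hello'),
    CaptionNode.create_break(),
    CaptionNode.create_text('world'),
]
caption = Caption(start=1000000, end=3000000, nodes=nodes)
caption_set = CaptionSet({'en-US': CaptionList([caption])})

print(caption.get_text())      # 'Hello\nworld' (break nodes become newlines)
print(caption.format_start())  # 00:00:01.000

# Delay every cue by half a second; the offset is also in microseconds.
caption_set.adjust_caption_timing(offset=500000)
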
2   utils/modules/pycaption/dfxp/__init__.py   Normal file
@@ -0,0 +1,2 @@
from .base import *
from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter
BIN   utils/modules/pycaption/dfxp/__pycache__/__init__.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/dfxp/__pycache__/base.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
BIN   utils/modules/pycaption/dfxp/__pycache__/extras.cpython-{36,37,38,39}.pyc   Normal file (binary files not shown)
1306   utils/modules/pycaption/dfxp/base.py   Normal file
File diff suppressed because it is too large.
248   utils/modules/pycaption/dfxp/extras.py   Normal file
@@ -0,0 +1,248 @@
# We thought about making pycaption.base objects immutable. This would be nice
# in a lot of cases, but since the transformations on them could be quite
# complex, the deepcopy method is good enough sometimes.
from copy import deepcopy

from .base import DFXPWriter, DFXP_DEFAULT_REGION
from ..base import BaseWriter, CaptionNode, merge_concurrent_captions

from xml.sax.saxutils import escape
from bs4 import BeautifulSoup

LEGACY_DFXP_BASE_MARKUP = '''
<tt xmlns="http://www.w3.org/ns/ttml"
    xmlns:tts="http://www.w3.org/ns/ttml#styling">
    <head>
        <styling/>
        <layout/>
    </head>
    <body/>
</tt>
'''

LEGACY_DFXP_DEFAULT_STYLE = {
    'color': 'white',
    'font-family': 'monospace',
    'font-size': '1c',
}

LEGACY_DFXP_DEFAULT_STYLE_ID = 'default'
LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom'

LEGACY_DFXP_DEFAULT_REGION = {
    'text-align': 'center',
    'display-align': 'after'
}


class SinglePositioningDFXPWriter(DFXPWriter):
    """A dfxp writer, that ignores all positioning, using a single provided value
    """
    def __init__(self, default_positioning=DFXP_DEFAULT_REGION,
                 *args, **kwargs):
        super(SinglePositioningDFXPWriter, self).__init__(*args, **kwargs)
        self.default_positioning = default_positioning

    def write(self, captions_set, force=''):
        """Writes a DFXP file using the positioning provided in the initializer

        :type captions_set: pycaption.base.CaptionSet
        :param force: only write this language, if available in the CaptionSet
        :rtype: unicode
        """
        captions_set = self._create_single_positioning_caption_set(
            captions_set, self.default_positioning)

        return super(SinglePositioningDFXPWriter, self).write(captions_set, force)  # noqa

    @staticmethod
    def _create_single_positioning_caption_set(caption_set, positioning):
        """Return a caption where all the positioning information was
        replaced from positioning

        :type caption_set: pycaption.base.CaptionSet
        :rtype: pycaption.base.CaptionSet
        """
        # If SinglePositioningDFXPWriter would modify the state of the caption
        # set, any writer using the same caption_set thereafter would be
        # affected. At the moment we know we don't use any other writers, but
        # this is important and mustn't be neglected
        caption_set = deepcopy(caption_set)
        caption_set = merge_concurrent_captions(caption_set)
        caption_set.layout_info = positioning

        for lang in caption_set.get_languages():
            caption_set.set_layout_info(lang, positioning)

            caption_list = caption_set.get_captions(lang)
            for caption in caption_list:
                caption.layout_info = positioning

                for node in caption.nodes:
                    if hasattr(node, 'layout_info'):
                        node.layout_info = positioning

        for _, style in caption_set.get_styles():
            if 'text-align' in style:
                style.pop('text-align')

        return caption_set


class LegacyDFXPWriter(BaseWriter):
    """Ported the legacy DFXPWriter from 0.4.5"""
    def __init__(self, *args, **kw):
        self.p_style = False
        self.open_span = False

    def write(self, caption_set, force=''):
        caption_set = deepcopy(caption_set)
        caption_set = merge_concurrent_captions(caption_set)

        dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml')
        dfxp.find('tt')['xml:lang'] = "en"

        for style_id, style in caption_set.get_styles():
            if style != {}:
                dfxp = self._recreate_styling_tag(style_id, style, dfxp)
        if not caption_set.get_styles():
            dfxp = self._recreate_styling_tag(
                LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp)

        # XXX For now we will always use this default region. In the future if
        # regions are provided, they will be kept
        dfxp = self._recreate_region_tag(
            LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp)

        body = dfxp.find('body')

        if force:
            langs = [self._force_language(force, caption_set.get_languages())]
        else:
            langs = caption_set.get_languages()

        for lang in langs:
            div = dfxp.new_tag('div')
            div['xml:lang'] = '%s' % lang

            for caption in caption_set.get_captions(lang):
                if caption.style:
                    caption_style = caption.style
                    caption_style.update({'region': LEGACY_DFXP_DEFAULT_REGION_ID})
                else:
                    caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID,
                                     'region': LEGACY_DFXP_DEFAULT_REGION_ID}
                p = self._recreate_p_tag(caption, caption_style, dfxp)
                div.append(p)

            body.append(div)

        caption_content = dfxp.prettify(formatter=None)
        return caption_content

    # force the DFXP to only have one language, trying to match on "force"
    def _force_language(self, force, langs):
        for lang in langs:
            if force == lang:
                return lang

        return langs[-1]

    def _recreate_region_tag(self, region_id, styling, dfxp):
        dfxp_region = dfxp.new_tag('region')
        dfxp_region.attrs.update({'xml:id': region_id})

        attributes = self._recreate_style(styling, dfxp)
        dfxp_region.attrs.update(attributes)

        new_tag = dfxp.new_tag('region')
        new_tag.attrs.update({'xml:id': region_id})
        if dfxp_region != new_tag:
            dfxp.find('layout').append(dfxp_region)
        return dfxp

    def _recreate_styling_tag(self, style, content, dfxp):
        dfxp_style = dfxp.new_tag('style')
        dfxp_style.attrs.update({'xml:id': style})

        attributes = self._recreate_style(content, dfxp)
        dfxp_style.attrs.update(attributes)

        new_tag = dfxp.new_tag('style')
        new_tag.attrs.update({'xml:id': style})
        if dfxp_style != new_tag:
            dfxp.find('styling').append(dfxp_style)

        return dfxp

    def _recreate_p_tag(self, caption, caption_style, dfxp):
        start = caption.format_start()
        end = caption.format_end()
        p = dfxp.new_tag("p", begin=start, end=end)
        p.string = self._recreate_text(caption, dfxp)

        if dfxp.find("style", {"xml:id": "p"}):
            p['style'] = 'p'

        p.attrs.update(self._recreate_style(caption_style, dfxp))

        return p

    def _recreate_text(self, caption, dfxp):
        line = ''

        for node in caption.nodes:
            if node.type_ == CaptionNode.TEXT:
                line += escape(node.content) + ' '

            elif node.type_ == CaptionNode.BREAK:
                line = line.rstrip() + '<br/>\n '

            elif node.type_ == CaptionNode.STYLE:
                line = self._recreate_span(line, node, dfxp)

        return line.rstrip()

    def _recreate_span(self, line, node, dfxp):
        if node.start:
            styles = ''

            content_with_style = self._recreate_style(node.content, dfxp)
            for style, value in list(content_with_style.items()):
                styles += ' %s="%s"' % (style, value)

            if styles:
                if self.open_span:
                    line = line.rstrip() + '</span> '
                line += '<span%s>' % styles
                self.open_span = True

        elif self.open_span:
            line = line.rstrip() + '</span> '
            self.open_span = False

        return line

    def _recreate_style(self, content, dfxp):
        dfxp_style = {}

        if 'region' in content:
            if dfxp.find('region', {'xml:id': content['region']}):
                dfxp_style['region'] = content['region']
        if 'class' in content:
            if dfxp.find("style", {"xml:id": content['class']}):
                dfxp_style['style'] = content['class']
        if 'text-align' in content:
            dfxp_style['tts:textAlign'] = content['text-align']
        if 'italics' in content:
            dfxp_style['tts:fontStyle'] = 'italic'
        if 'font-family' in content:
            dfxp_style['tts:fontFamily'] = content['font-family']
        if 'font-size' in content:
            dfxp_style['tts:fontSize'] = content['font-size']
        if 'color' in content:
            dfxp_style['tts:color'] = content['color']
        if 'display-align' in content:
            dfxp_style['tts:displayAlign'] = content['display-align']

        return dfxp_style
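
Illustrative usage (not part of the commit): both extra writers consume the same CaptionSet that any of the readers produce; caption_set below is assumed to come from such a reader.

from pycaption.dfxp import SinglePositioningDFXPWriter, LegacyDFXPWriter

single_region_dfxp = SinglePositioningDFXPWriter().write(caption_set)  # every cue in the default region
legacy_dfxp = LegacyDFXPWriter().write(caption_set)                    # 0.4.5-style markup
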
61702   utils/modules/pycaption/english.pickle   Normal file
File diff suppressed because it is too large.
40   utils/modules/pycaption/exceptions.py   Normal file
@@ -0,0 +1,40 @@
class CaptionReadError(Exception):
    """
    Generic error raised when the reading of the caption file failed.
    """
    def __str__(self):
        return "%s(%s)" % (self.__class__.__name__, self.args)


class CaptionReadNoCaptions(CaptionReadError):
    """
    Error raised when the provided caption file was not containing any
    actual captions.
    """


class CaptionReadSyntaxError(CaptionReadError):
    """
    Error raised when the provided caption file has syntax errors and could
    not be parsed.
    """


class CaptionReadTimingError(CaptionReadError):
    """
    Error raised when a Caption is initialized with invalid timings.
    """


class RelativizationError(Exception):
    """
    Error raised when absolute positioning cannot be converted to
    percentage
    """


class InvalidInputError(RuntimeError):
    """ Error raised when the input is invalid (i.e. a unicode string)
    """
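
Illustrative usage (not part of the commit): a sketch of how a caller typically reacts to this hierarchy; read_srt and content are made-up names, and SRTReader is just one of the readers that raise these errors.

from pycaption import SRTReader, CaptionReadNoCaptions, CaptionReadSyntaxError

def read_srt(content):
    try:
        return SRTReader().read(content)
    except CaptionReadNoCaptions:
        return None  # parsed fine, but the file contained no cues
    except CaptionReadSyntaxError:
        raise        # malformed input; let the caller see the original error
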
916   utils/modules/pycaption/geometry.py   Normal file
@@ -0,0 +1,916 @@
|
||||
"""
|
||||
This module implements the classes used to represent positioning information.
|
||||
|
||||
CONVENTIONS:
|
||||
* None of the methods should modify the state of the objects on which they're
|
||||
called. If the values of an object need to be recalculated, the method
|
||||
responsible for the recalculation should return a new object with the
|
||||
necessary modifications.
|
||||
"""
|
||||
import six
|
||||
|
||||
from enum import Enum
|
||||
from .exceptions import RelativizationError
|
||||
|
||||
|
||||
class UnitEnum(Enum):
|
||||
"""Enumeration-like object, specifying the units of measure for length
|
||||
|
||||
Usage:
|
||||
unit = UnitEnum.PIXEL
|
||||
unit = UnitEnum.EM
|
||||
if unit == UnitEnum.CELL :
|
||||
...
|
||||
"""
|
||||
PIXEL = 'px'
|
||||
EM = 'em'
|
||||
PERCENT = '%'
|
||||
CELL = 'c'
|
||||
PT = 'pt'
|
||||
|
||||
|
||||
class VerticalAlignmentEnum(Enum):
|
||||
"""Enumeration object, specifying the allowed vertical alignment options
|
||||
|
||||
Usage:
|
||||
alignment = VerticalAlignmentEnum.TOP
|
||||
if alignment == VerticalAlignmentEnum.BOTTOM:
|
||||
...
|
||||
"""
|
||||
TOP = 'top'
|
||||
CENTER = 'center'
|
||||
BOTTOM = 'bottom'
|
||||
|
||||
|
||||
class HorizontalAlignmentEnum(Enum):
|
||||
"""Enumeration object specifying the horizontal alignment preferences
|
||||
"""
|
||||
LEFT = 'left'
|
||||
CENTER = 'center'
|
||||
RIGHT = 'right'
|
||||
START = 'start'
|
||||
END = 'end'
|
||||
|
||||
|
||||
class Alignment(object):
|
||||
def __init__(self, horizontal, vertical):
|
||||
"""
|
||||
:type horizontal: HorizontalAlignmentEnum
|
||||
:param horizontal: HorizontalAlignmentEnum member
|
||||
:type vertical: VerticalAlignmentEnum
|
||||
:param vertical: VerticalAlignmentEnum member
|
||||
"""
|
||||
self.horizontal = horizontal
|
||||
self.vertical = vertical
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.horizontal) * 83 +
|
||||
hash(self.vertical) * 89 +
|
||||
97
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.horizontal == other.horizontal and
|
||||
self.vertical == other.vertical
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return "<Alignment ({horizontal} {vertical})>".format(
|
||||
horizontal=self.horizontal, vertical=self.vertical
|
||||
)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns a tuple of the useful information regarding this object
|
||||
"""
|
||||
return self.horizontal, self.vertical
|
||||
|
||||
@classmethod
|
||||
def from_horizontal_and_vertical_align(cls, text_align=None,
|
||||
display_align=None):
|
||||
horizontal_obj = None
|
||||
vertical_obj = None
|
||||
|
||||
if text_align == 'left':
|
||||
horizontal_obj = HorizontalAlignmentEnum.LEFT
|
||||
if text_align == 'start':
|
||||
horizontal_obj = HorizontalAlignmentEnum.START
|
||||
if text_align == 'center':
|
||||
horizontal_obj = HorizontalAlignmentEnum.CENTER
|
||||
if text_align == 'right':
|
||||
horizontal_obj = HorizontalAlignmentEnum.RIGHT
|
||||
if text_align == 'end':
|
||||
horizontal_obj = HorizontalAlignmentEnum.END
|
||||
|
||||
if display_align == 'before':
|
||||
vertical_obj = VerticalAlignmentEnum.TOP
|
||||
if display_align == 'center':
|
||||
vertical_obj = VerticalAlignmentEnum.CENTER
|
||||
if display_align == 'after':
|
||||
vertical_obj = VerticalAlignmentEnum.BOTTOM
|
||||
|
||||
if not any([horizontal_obj, vertical_obj]):
|
||||
return None
|
||||
return cls(horizontal_obj, vertical_obj)
|
||||
|
||||
|
||||
class TwoDimensionalObject(object):
|
||||
"""Adds a couple useful methods to its subclasses, nothing fancy.
|
||||
"""
|
||||
@classmethod
|
||||
# TODO - highly cachable. Should use WeakValueDictionary here to return
|
||||
# flyweights, not new objects.
|
||||
def from_xml_attribute(cls, attribute):
|
||||
"""Instantiate the class from a value of the type "4px" or "5%"
|
||||
or any number concatenated with a measuring unit (member of UnitEnum)
|
||||
|
||||
:type attribute: unicode
|
||||
"""
|
||||
horizontal, vertical = six.text_type(attribute).split(' ')
|
||||
horizontal = Size.from_string(horizontal)
|
||||
vertical = Size.from_string(vertical)
|
||||
|
||||
return cls(horizontal, vertical)
|
||||
|
||||
|
||||
class Stretch(TwoDimensionalObject):
|
||||
"""Used for specifying the extent of a rectangle (how much it stretches),
|
||||
or the padding in a rectangle (how much space should be left empty until
|
||||
text can be displayed)
|
||||
"""
|
||||
def __init__(self, horizontal, vertical):
|
||||
"""Use the .from_xxx methods. They know what's best for you.
|
||||
|
||||
:type horizontal: Size
|
||||
:type vertical: Size
|
||||
"""
|
||||
for parameter in [horizontal, vertical]:
|
||||
if not isinstance(parameter, Size):
|
||||
raise ValueError("Stretch must be initialized with two valid "
|
||||
"Size objects.")
|
||||
self.horizontal = horizontal
|
||||
self.vertical = vertical
|
||||
|
||||
def is_measured_in(self, measure_unit):
|
||||
"""Whether the stretch is only measured in the provided units
|
||||
|
||||
:param measure_unit: a UnitEnum member
|
||||
:return: True/False
|
||||
"""
|
||||
return (
|
||||
self.horizontal.unit == measure_unit and
|
||||
self.vertical.unit == measure_unit
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return '<Stretch ({horizontal}, {vertical})>'.format(
|
||||
horizontal=self.horizontal, vertical=self.vertical
|
||||
)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns a tuple of the useful attributes of this object"""
|
||||
return (
|
||||
None if not self.horizontal else self.horizontal.serialized(),
|
||||
None if not self.vertical else self.vertical.serialized()
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.horizontal == other.horizontal and
|
||||
self.vertical == other.vertical
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.horizontal) * 59 +
|
||||
hash(self.vertical) * 61 +
|
||||
67
|
||||
)
|
||||
|
||||
def __bool__(self):
|
||||
return True if self.horizontal or self.vertical else False
|
||||
|
||||
def to_xml_attribute(self, **kwargs):
|
||||
"""Returns a unicode representation of this object as an xml attribute
|
||||
"""
|
||||
return '{horizontal} {vertical}'.format(
|
||||
horizontal=self.horizontal.to_xml_attribute(),
|
||||
vertical=self.vertical.to_xml_attribute()
|
||||
)
|
||||
|
||||
def is_relative(self):
|
||||
"""
|
||||
Returns True if all dimensions are expressed as percentages,
|
||||
False otherwise.
|
||||
"""
|
||||
is_relative = True
|
||||
if self.horizontal:
|
||||
is_relative &= self.horizontal.is_relative()
|
||||
if self.vertical:
|
||||
is_relative &= self.vertical.is_relative()
|
||||
return is_relative
|
||||
|
||||
def as_percentage_of(self, video_width, video_height):
|
||||
"""
|
||||
Converts absolute units (e.g. px, pt etc) to percentage
|
||||
"""
|
||||
return Stretch(
|
||||
self.horizontal.as_percentage_of(video_width=video_width),
|
||||
self.vertical.as_percentage_of(video_height=video_height)
|
||||
)
|
||||
|
||||
|
||||
class Region(object):
|
||||
"""Represents the spatial coordinates of a rectangle
|
||||
|
||||
Don't instantiate by hand. use Region.from_points or Region.from_extent
|
||||
"""
|
||||
@classmethod
|
||||
def from_points(cls, p1, p2):
|
||||
"""Create a rectangle, knowing 2 points on the plane.
|
||||
We assume that p1 is in the upper left (closer to the origin)
|
||||
|
||||
:param p1: Point instance
|
||||
:param p2: Point instance
|
||||
:return: a Point instance
|
||||
"""
|
||||
inst = cls()
|
||||
inst._p1 = p1
|
||||
inst._p2 = p2
|
||||
return inst
|
||||
|
||||
@classmethod
|
||||
def from_extent(cls, extent, origin):
|
||||
"""Create a rectangle, knowing its upper left origin, and
|
||||
spatial extension
|
||||
|
||||
:type extent: Stretch
|
||||
:type origin: Point
|
||||
:return: a Point instance
|
||||
"""
|
||||
inst = cls()
|
||||
inst._extent = extent
|
||||
inst._origin = origin
|
||||
return inst
|
||||
|
||||
@property
|
||||
def extent(self):
|
||||
"""How wide this rectangle stretches (horizontally and vertically)
|
||||
"""
|
||||
if hasattr(self, '_extent'):
|
||||
return self._extent
|
||||
else:
|
||||
return self._p1 - self._p2
|
||||
|
||||
@property
|
||||
def origin(self):
|
||||
"""Out of its 4 points, returns the one closest to the origin
|
||||
"""
|
||||
if hasattr(self, '_origin'):
|
||||
return self._origin
|
||||
else:
|
||||
return Point.align_from_origin(self._p1, self._p2)[0]
|
||||
|
||||
upper_left_point = origin
|
||||
|
||||
@property
|
||||
def lower_right_point(self):
|
||||
"""The point furthest from the origin from the rectangle's 4 points
|
||||
"""
|
||||
if hasattr(self, '_p2'):
|
||||
return Point.align_from_origin(self._p1, self._p2)[1]
|
||||
else:
|
||||
return self.origin.add_extent(self.extent)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.extent == other.extent and
|
||||
self.origin == other.origin
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.origin) * 71 +
|
||||
hash(self.extent) * 73 +
|
||||
79
|
||||
)
|
||||
|
||||
|
||||
class Point(TwoDimensionalObject):
|
||||
"""Represent a point in 2d space.
|
||||
"""
|
||||
def __init__(self, x, y):
|
||||
"""
|
||||
:type x: Size
|
||||
:type y: Size
|
||||
"""
|
||||
for parameter in [x, y]:
|
||||
if not isinstance(parameter, Size):
|
||||
raise ValueError("Point must be initialized with two valid "
|
||||
"Size objects.")
|
||||
self.x = x
|
||||
self.y = y
|
||||
|
||||
def __sub__(self, other):
|
||||
"""Returns an Stretch object, if the other point's units are compatible
|
||||
"""
|
||||
return Stretch(abs(self.x - other.x), abs(self.y - other.y))
|
||||
|
||||
def add_stretch(self, stretch):
|
||||
"""Returns another Point instance, whose coordinates are the sum of the
|
||||
current Point's, and the Stretch instance's.
|
||||
"""
|
||||
return Point(self.x + stretch.horizontal, self.y + stretch.vertical)
|
||||
|
||||
def is_relative(self):
|
||||
"""
|
||||
Returns True if all dimensions are expressed as percentages,
|
||||
False otherwise.
|
||||
"""
|
||||
is_relative = True
|
||||
if self.x:
|
||||
is_relative &= self.x.is_relative()
|
||||
if self.y:
|
||||
is_relative &= self.y.is_relative()
|
||||
return is_relative
|
||||
|
||||
def as_percentage_of(self, video_width, video_height):
|
||||
"""
|
||||
Converts absolute units (e.g. px, pt etc) to percentage
|
||||
"""
|
||||
return Point(
|
||||
self.x.as_percentage_of(video_width=video_width),
|
||||
self.y.as_percentage_of(video_height=video_height)
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def align_from_origin(cls, p1, p2):
|
||||
"""Returns a tuple of 2 points. The first is closest to the origin
|
||||
on both axes than the second.
|
||||
|
||||
If the 2 points fulfill this condition, returns them (ordered), if not,
|
||||
creates 2 new points.
|
||||
"""
|
||||
if p1.x <= p2.x and p1.y <= p2.y:
|
||||
return p1
|
||||
if p1.x >= p2.x and p1.y >= p2.y:
|
||||
return p2
|
||||
else:
|
||||
return (Point(min(p1.x, p2.x), min(p1.y, p2.y)),
|
||||
Point(max(p1.x, p2.x), max(p1.y, p2.y)))
|
||||
|
||||
def __repr__(self):
|
||||
return '<Point ({x}, {y})>'.format(
|
||||
x=self.x, y=self.y
|
||||
)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns the "useful" values of this object.
|
||||
"""
|
||||
return (
|
||||
None if not self.x else self.x.serialized(),
|
||||
None if not self.y else self.y.serialized()
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.x == other.x and
|
||||
self.y == other.y
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.x) * 51 +
|
||||
hash(self.y) * 53 +
|
||||
57
|
||||
)
|
||||
|
||||
def __bool__(self):
|
||||
return True if self.x or self.y else False
|
||||
|
||||
def to_xml_attribute(self, **kwargs):
|
||||
"""Returns a unicode representation of this object as an xml attribute
|
||||
"""
|
||||
return '{x} {y}'.format(
|
||||
x=self.x.to_xml_attribute(), y=self.y.to_xml_attribute())
|
||||
|
||||
|
||||
@six.python_2_unicode_compatible
|
||||
class Size(object):
|
||||
"""Ties together a number with a unit, to represent a size.
|
||||
|
||||
Use as value objects! (don't change after creation)
|
||||
"""
|
||||
def __init__(self, value, unit):
|
||||
"""
|
||||
:param value: A number (float or int will do)
|
||||
:param unit: A UnitEnum member
|
||||
"""
|
||||
if value is None:
|
||||
raise ValueError("Size must be initialized with a value.")
|
||||
if not isinstance(unit,UnitEnum):
|
||||
raise ValueError("Size must be initialized with a valid unit.")
|
||||
|
||||
self.value = float(value)
|
||||
self.unit = unit
|
||||
|
||||
def __sub__(self, other):
|
||||
if self.unit == other.unit:
|
||||
return Size(self.value - other.value, self.unit)
|
||||
else:
|
||||
raise ValueError("The sizes should have the same measure units.")
|
||||
|
||||
def __abs__(self):
|
||||
return Size(abs(self.value), self.unit)
|
||||
|
||||
def __cmp__(self, other):
|
||||
if self.unit == other.unit:
|
||||
# python3 does not have cmp
|
||||
return (self.value > other.value) - (self.value < other.value)
|
||||
else:
|
||||
raise ValueError("The sizes should have the same measure units.")
|
||||
|
||||
def __lt__(self, other):
|
||||
return self.value < other.value
|
||||
|
||||
|
||||
def __add__(self, other):
|
||||
if self.unit == other.unit:
|
||||
return Size(self.value + other.value, self.unit)
|
||||
else:
|
||||
raise ValueError("The sizes should have the same measure units.")
|
||||
|
||||
def is_relative(self):
|
||||
"""
|
||||
Returns True if value is expressed as percentage, False otherwise.
|
||||
"""
|
||||
return self.unit == UnitEnum.PERCENT
|
||||
|
||||
def as_percentage_of(self, video_width=None, video_height=None):
|
||||
"""
|
||||
:param video_width: An integer representing a width in pixels
|
||||
:param video_height: An integer representing a height in pixels
|
||||
"""
|
||||
value = self.value
|
||||
unit = self.unit
|
||||
|
||||
if unit == UnitEnum.PERCENT:
|
||||
return self # Nothing to do here
|
||||
|
||||
# The input must be valid so that any conversion can be done
|
||||
if not (video_width or video_height):
|
||||
raise RelativizationError(
|
||||
"Either video width or height must be given as a reference")
|
||||
elif video_width and video_height:
|
||||
raise RelativizationError(
|
||||
"Only video width or height can be given as reference")
|
||||
|
||||
if unit == UnitEnum.EM:
|
||||
# TODO: Implement proper conversion of em in function of font-size
|
||||
# The em unit is relative to the font-size, to which we currently
|
||||
# have no access. As a workaround, we presume the font-size is 16px,
|
||||
# which is a common default value but not guaranteed.
|
||||
value *= 16
|
||||
unit = UnitEnum.PIXEL
|
||||
|
||||
if unit == UnitEnum.PT:
|
||||
# XXX: we will convert first to "px" and from "px" this will be
|
||||
# converted to percent. we don't take into consideration the
|
||||
# font-size
|
||||
value = value / 72.0 * 96.0
|
||||
unit = UnitEnum.PIXEL
|
||||
|
||||
if unit == UnitEnum.PIXEL:
|
||||
value = value * 100.0 / (video_width or video_height)
|
||||
unit = UnitEnum.PERCENT
|
||||
|
||||
if unit == UnitEnum.CELL:
|
||||
# TODO: Implement proper cell resolution
|
||||
# (w3.org/TR/ttaf1-dfxp/#parameter-attribute-cellResolution)
|
||||
# For now we will use the default values (32 columns and 15 rows)
|
||||
cell_reference = 32 if video_width else 15
|
||||
value = value * 100.0 / cell_reference
|
||||
unit = UnitEnum.PERCENT
|
||||
|
||||
return Size(value, unit)
|
||||
|
||||
@classmethod
|
||||
# TODO - this also looks highly cachable. Should use a WeakValueDict here
|
||||
# to return flyweights
|
||||
def from_string(cls, string):
|
||||
"""Given a string of the form "46px" or "5%" etc., returns the proper
|
||||
size object
|
||||
|
||||
:param string: a number concatenated to any of the UnitEnum members.
|
||||
:type string: unicode
|
||||
:rtype: Size
|
||||
"""
|
||||
|
||||
raw_number = string
|
||||
for unit in list(UnitEnum):
|
||||
if raw_number.endswith(unit.value):
|
||||
raw_number = raw_number.rstrip(unit.value)
|
||||
break
|
||||
else:
|
||||
unit = None
|
||||
|
||||
if unit is not None:
|
||||
value = None
|
||||
try:
|
||||
value = float(raw_number)
|
||||
value = int(raw_number)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if value is None:
|
||||
raise ValueError(
|
||||
"""Couldn't recognize the value "{value}" as a number"""
|
||||
.format(value=raw_number)
|
||||
)
|
||||
instance = cls(value, unit)
|
||||
return instance
|
||||
else:
|
||||
raise ValueError(
|
||||
"The specified value is not valid because its unit "
|
||||
"is not recognized: {value}. "
|
||||
"The only supported units are: {supported}"
|
||||
.format(value=raw_number, supported=', '.join(UnitEnum._member_map_))
|
||||
)
|
||||
|
||||
def __repr__(self):
|
||||
return '<Size ({value} {unit})>'.format(
|
||||
value=self.value, unit=self.unit.value
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
value = round(self.value, 2)
|
||||
if value.is_integer():
|
||||
s = "{}".format(int(value))
|
||||
else:
|
||||
s = "{:.2f}".format(value).rstrip('0').rstrip('.')
|
||||
return "{}{}".format(s, self.unit.value)
|
||||
|
||||
def to_xml_attribute(self, **kwargs):
|
||||
"""Returns a unicode representation of this object, as an xml attribute
|
||||
"""
|
||||
return six.text_type(self)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns the "useful" values of this object"""
|
||||
return self.value, self.unit
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.value == other.value and
|
||||
self.unit == other.unit
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.value) * 41 +
|
||||
hash(self.unit) * 43 +
|
||||
47
|
||||
)
|
||||
|
||||
def __bool__(self):
|
||||
return self.unit in UnitEnum and self.value is not None
|
||||
|
||||
|
||||
class Padding(object):
|
||||
"""Represents padding information. Consists of 4 Size objects, representing
|
||||
padding from (in this order): before (up), after (down), start (left) and
|
||||
end (right).
|
||||
|
||||
A valid Padding object must always have all paddings set and different from
|
||||
None. If this is not true Writers may fail for they rely on this assumption.
|
||||
"""
|
||||
def __init__(self, before=None, after=None, start=None, end=None):
|
||||
"""
|
||||
:type before: Size
|
||||
:type after: Size
|
||||
:type start: Size
|
||||
:type end: Size
|
||||
"""
|
||||
self.before = before # top
|
||||
self.after = after # bottom
|
||||
self.start = start # left
|
||||
self.end = end # right
|
||||
|
||||
for attr in ['before', 'after', 'start', 'end']:
|
||||
# Ensure that a Padding object always explicitly defines all
|
||||
# four possible paddings
|
||||
if not isinstance(getattr(self, attr), Size):
|
||||
# Sets default padding (0%)
|
||||
setattr(self, attr, Size(0, UnitEnum.PERCENT))
|
||||
|
||||
@classmethod
|
||||
def from_xml_attribute(cls, attribute):
|
||||
"""As per the docs, the style attribute can contain 1,2,3 or 4 values.
|
||||
|
||||
If 1 value: apply to all edges
|
||||
If 2: first applies to before and after, second to start and end
|
||||
If 3: first applies to before, second to start and end, third to after
|
||||
If 4: before, end, after, start;
|
||||
|
||||
http://www.w3.org/TR/ttaf1-dfxp/#style-attribute-padding
|
||||
|
||||
:param attribute: a string like object, representing a dfxp attr. value
|
||||
:return: a Padding object
|
||||
"""
|
||||
values_list = six.text_type(attribute).split(' ')
|
||||
sizes = []
|
||||
|
||||
for value in values_list:
|
||||
sizes.append(Size.from_string(value))
|
||||
|
||||
if len(sizes) == 1:
|
||||
return cls(sizes[0], sizes[0], sizes[0], sizes[0])
|
||||
elif len(sizes) == 2:
|
||||
return cls(sizes[0], sizes[0], sizes[1], sizes[1])
|
||||
elif len(sizes) == 3:
|
||||
return cls(sizes[0], sizes[2], sizes[1], sizes[1])
|
||||
elif len(sizes) == 4:
|
||||
return cls(sizes[0], sizes[2], sizes[3], sizes[1])
|
||||
else:
|
||||
raise ValueError('The provided value "{value}" could not be '
|
||||
"parsed into the a padding. Check out "
|
||||
"http://www.w3.org/TR/ttaf1-dfxp/"
|
||||
"#style-attribute-padding for the definition "
|
||||
"and examples".format(value=attribute))
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
"<Padding (before: {before}, after: {after}, start: {start}, "
|
||||
"end: {end})>".format(
|
||||
before=self.before, after=self.after, start=self.start,
|
||||
end=self.end
|
||||
)
|
||||
)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns a tuple containing the useful values of this object
|
||||
"""
|
||||
return (
|
||||
None if not self.before else self.before.serialized(),
|
||||
None if not self.after else self.after.serialized(),
|
||||
None if not self.start else self.start.serialized(),
|
||||
None if not self.end else self.end.serialized()
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
other and
|
||||
type(self) == type(other) and
|
||||
self.before == other.before and
|
||||
self.after == other.after and
|
||||
self.start == other.start and
|
||||
self.end == other.end
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.before) * 19 +
|
||||
hash(self.after) * 23 +
|
||||
hash(self.start) * 29 +
|
||||
hash(self.end) * 31 +
|
||||
37
|
||||
)
|
||||
|
||||
def to_xml_attribute(
|
||||
self, attribute_order=('before', 'end', 'after', 'start'),
|
||||
**kwargs):
|
||||
"""Returns a unicode representation of this object as an xml attribute
|
||||
|
||||
TODO - should extend the attribute_order tuple to contain 4 tuples,
|
||||
so we can reduce the output length to 3, 2 or 1 element.
|
||||
|
||||
:type attribute_order: tuple
|
||||
:param attribute_order: the order that the attributes should be
|
||||
serialized
|
||||
"""
|
||||
try:
|
||||
string_list = []
|
||||
for attrib in attribute_order:
|
||||
if hasattr(self, attrib):
|
||||
string_list.append(
|
||||
getattr(self, attrib).to_xml_attribute())
|
||||
except AttributeError:
|
||||
# A Padding object with attributes set to None is considered
|
||||
# invalid. All four possible paddings must be set. If one of them
|
||||
# is not, this error is raised.
|
||||
raise ValueError("The attribute order specified is invalid.")
|
||||
|
||||
return ' '.join(string_list)
|
||||
|
||||
def as_percentage_of(self, video_width, video_height):
|
||||
return Padding(
|
||||
self.before.as_percentage_of(video_height=video_height),
|
||||
self.after.as_percentage_of(video_height=video_height),
|
||||
self.start.as_percentage_of(video_width=video_width),
|
||||
self.end.as_percentage_of(video_width=video_width)
|
||||
)
|
||||
|
||||
def is_relative(self):
|
||||
is_relative = True
|
||||
if self.before:
|
||||
is_relative &= self.before.is_relative()
|
||||
if self.after:
|
||||
is_relative &= self.after.is_relative()
|
||||
if self.start:
|
||||
is_relative &= self.start.is_relative()
|
||||
if self.end:
|
||||
is_relative &= self.end.is_relative()
|
||||
return is_relative
|
||||
|
||||
|
||||
class Layout(object):
|
||||
"""Should encapsulate all the information needed to determine (as correctly
|
||||
as possible) the layout (positioning) of elements on the screen.
|
||||
|
||||
    Inheritance of this property from the CaptionSet to its children is
    specific to each caption type.
|
||||
"""
|
||||
def __init__(self, origin=None, extent=None, padding=None, alignment=None,
|
||||
webvtt_positioning=None, inherit_from=None):
|
||||
"""
|
||||
:type origin: Point
|
||||
:param origin: The point on the screen which is the top left vertex
|
||||
of a rectangular region where the captions should be placed
|
||||
|
||||
:type extent: Stretch
|
||||
:param extent: The width and height of the rectangle where the caption
|
||||
should be placed on the screen.
|
||||
|
||||
:type padding: Padding
|
||||
:param padding: The padding of the text inside the region described
|
||||
by the origin and the extent
|
||||
|
||||
:type alignment: Alignment
|
||||
|
||||
:type webvtt_positioning: unicode
|
||||
:param webvtt_positioning: A string with the raw WebVTT cue settings.
|
||||
This is used so that WebVTT positioning isn't lost on conversion
|
||||
from WebVTT to WebVTT. It is needed only because pycaption
|
||||
currently doesn't support reading positioning from WebVTT.
|
||||
|
||||
:type inherit_from: Layout
|
||||
:param inherit_from: A Layout with the positioning parameters to be
|
||||
            used if not specified by the positioning arguments.
|
||||
"""
|
||||
|
||||
self.origin = origin
|
||||
self.extent = extent
|
||||
self.padding = padding
|
||||
self.alignment = alignment
|
||||
self.webvtt_positioning = webvtt_positioning
|
||||
|
||||
if inherit_from:
|
||||
for attr_name in ['origin', 'extent', 'padding', 'alignment']:
|
||||
attr = getattr(self, attr_name)
|
||||
if not attr:
|
||||
setattr(self, attr_name, getattr(inherit_from, attr_name))
|
||||
|
||||
def __bool__(self):
|
||||
return any([
|
||||
self.origin, self.extent, self.padding, self.alignment,
|
||||
self.webvtt_positioning
|
||||
])
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
"<Layout (origin: {origin}, extent: {extent}, "
|
||||
"padding: {padding}, alignment: {alignment})>".format(
|
||||
origin=self.origin, extent=self.extent, padding=self.padding,
|
||||
alignment=self.alignment
|
||||
)
|
||||
)
|
||||
|
||||
def serialized(self):
|
||||
"""Returns nested tuple containing the "useful" values of this object
|
||||
"""
|
||||
return (
|
||||
None if not self.origin else self.origin.serialized(),
|
||||
None if not self.extent else self.extent.serialized(),
|
||||
None if not self.padding else self.padding.serialized(),
|
||||
None if not self.alignment else self.alignment.serialized()
|
||||
)
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
type(self) == type(other) and
|
||||
self.origin == other.origin and
|
||||
self.extent == other.extent and
|
||||
self.padding == other.padding and
|
||||
self.alignment == other.alignment
|
||||
)
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __hash__(self):
|
||||
return hash(
|
||||
hash(self.origin) * 7
|
||||
+ hash(self.extent) * 11
|
||||
+ hash(self.padding) * 13
|
||||
+ hash(self.alignment) * 5
|
||||
+ 17
|
||||
)
|
||||
|
||||
def is_relative(self):
|
||||
"""
|
||||
Returns True if all positioning values are expressed as percentages,
|
||||
False otherwise.
|
||||
"""
|
||||
is_relative = True
|
||||
if self.origin:
|
||||
is_relative &= self.origin.is_relative()
|
||||
if self.extent:
|
||||
is_relative &= self.extent.is_relative()
|
||||
if self.padding:
|
||||
is_relative &= self.padding.is_relative()
|
||||
return is_relative
|
||||
|
||||
def as_percentage_of(self, video_width, video_height):
|
||||
params = {'alignment': self.alignment}
|
||||
# We don't need to preserve webvtt_positioning on Layout
|
||||
# transformations because, if it is set, the WebVTT writer
|
||||
# returns as soon as it's found and the transformations are
|
||||
# never triggered.
|
||||
for attr_name in ['origin', 'extent', 'padding']:
|
||||
attr = getattr(self, attr_name)
|
||||
if attr:
|
||||
params[attr_name] = attr.as_percentage_of(video_width,
|
||||
video_height)
|
||||
return Layout(**params)
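
    # Illustrative example (assumed semantics, not part of the original
    # module): for a 640x480 video, an absolute origin of (64px, 48px)
    # would become a relative origin of (10%, 10%); padding values are
    # converted against the width (start/end) or the height (before/after),
    # as shown by Padding.as_percentage_of above.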
|
||||
|
||||
def fit_to_screen(self):
|
||||
"""
|
||||
If extent is not set or if origin + extent > 100%, (re)calculate it
|
||||
based on origin. It is a pycaption fix for caption files that are
|
||||
technically valid but contain inconsistent settings that may cause
|
||||
long captions to be cut out of the screen.
|
||||
|
||||
ATTENTION: This must be called on relativized objects (such as the one
|
||||
returned by as_percentage_of). All units are presumed to be percentages.
|
||||
"""
|
||||
|
||||
if self.origin:
|
||||
# Calculated values to be used if replacement is needed
|
||||
diff_horizontal = Size(100 - self.origin.x.value, UnitEnum.PERCENT)
|
||||
diff_vertical = Size(100 - self.origin.y.value, UnitEnum.PERCENT)
|
||||
if not self.extent:
|
||||
# Extent is not set, use the calculated values
|
||||
new_extent = Stretch(diff_horizontal, diff_vertical)
|
||||
else:
|
||||
# Extent is set but may have inconsistent values,
|
||||
# e.g. origin="35% 25%" extent="80% 80%", which would cause
|
||||
# captions to end horizontally at 115% and vertically at 105%,
|
||||
# which would result in them being cut out of the screen.
|
||||
# In this case, the horizontal and vertical values are
|
||||
# corrected so that origin + extent = 100%.
|
||||
bottom_right = self.origin.add_stretch(self.extent)
|
||||
|
||||
found_absolute_unit = False
|
||||
if bottom_right.x.unit != UnitEnum.PERCENT:
|
||||
found_absolute_unit = True
|
||||
                elif bottom_right.y.unit != UnitEnum.PERCENT:
                    found_absolute_unit = True
|
||||
|
||||
if found_absolute_unit:
|
||||
raise ValueError("Units must be relativized before extent "
|
||||
"can be calculated based on origin.")
|
||||
|
||||
new_horizontal = self.extent.horizontal
|
||||
new_vertical = self.extent.vertical
|
||||
# If extent is set but it's inconsistent, replace with
|
||||
# calculated values
|
||||
if bottom_right.x.value > 100:
|
||||
new_horizontal = diff_horizontal
|
||||
if bottom_right.y.value > 100:
|
||||
new_vertical = diff_vertical
|
||||
|
||||
new_extent = Stretch(new_horizontal, new_vertical)
|
||||
|
||||
return Layout(
|
||||
origin=self.origin,
|
||||
extent=new_extent,
|
||||
padding=self.padding,
|
||||
alignment=self.alignment
|
||||
# We don't need to preserve webvtt_positioning on Layout
|
||||
# transformations because, if it is set, the WebVTT writer
|
||||
# returns as soon as it's found and the transformations are
|
||||
# never triggered.
|
||||
)
|
||||
|
||||
return self
|
||||
805
utils/modules/pycaption/sami.py
Normal file
@@ -0,0 +1,805 @@
|
||||
"""
|
||||
This module handles SAMI reading and writing. It supports several CSS
attributes, some of which are handled as positioning settings (and applied
to Layout objects) and others as simple styling (applied to legacy style nodes).
|
||||
|
||||
The following attributes are handled as positioning:
|
||||
|
||||
'text-align' # Converted to Alignment
|
||||
'margin-top'
|
||||
'margin-right'
|
||||
'margin-bottom'
|
||||
'margin-left'
|
||||
|
||||
OBS:
|
||||
* Margins are converted to Padding
|
||||
* Margins defined inline are not supported
|
||||
TODO: Add support for inline margins
|
||||
|
||||
Any other CSS the BeautifulSoup library manages to parse is handled as simple
|
||||
styling and applied to style nodes. However, apparently only these are actually
|
||||
used by writers on conversion:
|
||||
|
||||
'font-family'
|
||||
'font-size'
|
||||
'font-style'
|
||||
'color'
|
||||
OBS:
|
||||
* Other parameters are preserved, but not if they're specified inline.
|
||||
TODO:
    Make this less confusing. Confirm whether these really are the only
    supported styling attributes and make that clearer, perhaps by listing
    them as constants at the beginning of the file and using them to filter
    out unneeded attributes, either everywhere in the code or not at all,
    and regardless of whether they're defined inline, because that
    distinction is irrelevant.
|
||||
|
||||
"""
|
||||
import re
|
||||
import six
|
||||
from logging import FATAL
|
||||
from collections import deque
|
||||
from copy import deepcopy
|
||||
from future.backports.html.parser import HTMLParseError
|
||||
|
||||
from html.parser import HTMLParser
|
||||
from html.entities import name2codepoint
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from cssutils import parseString, log, css as cssutils_css
|
||||
|
||||
from .base import (
|
||||
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
|
||||
DEFAULT_LANGUAGE_CODE)
|
||||
from .exceptions import (
|
||||
CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError)
|
||||
from .geometry import Layout, Alignment, Padding, Size
|
||||
|
||||
|
||||
# change cssutils default logging
|
||||
log.setLevel(FATAL)
|
||||
|
||||
|
||||
SAMI_BASE_MARKUP = '''
|
||||
<sami>
|
||||
<head>
|
||||
<style type="text/css"/>
|
||||
</head>
|
||||
<body/>
|
||||
</sami>'''
|
||||
|
||||
|
||||
class SAMIReader(BaseReader):
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super(SAMIReader, self).__init__(*args, **kw)
|
||||
self.line = []
|
||||
self.first_alignment = None
|
||||
|
||||
def detect(self, content):
|
||||
if '<sami' in content.lower():
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def read(self, content):
|
||||
if type(content) != six.text_type:
|
||||
raise InvalidInputError('The content is not a unicode string.')
|
||||
|
||||
content, doc_styles, doc_langs = (
|
||||
self._get_sami_parser_class()().feed(content))
|
||||
sami_soup = self._get_xml_parser_class()(content)
|
||||
|
||||
# Get the global layout that applies to all <p> tags
|
||||
global_layout = self._build_layout(doc_styles.get('p', {}))
|
||||
|
||||
caption_dict = {}
|
||||
for language in doc_langs:
|
||||
lang_layout = None
|
||||
for target, styling in list(doc_styles.items()):
|
||||
if target not in ['p', 'sync', 'span']:
|
||||
if styling.get('lang', None) == language:
|
||||
lang_layout = self._build_layout(
|
||||
doc_styles.get(target, {}),
|
||||
inherit_from=global_layout
|
||||
)
|
||||
break
|
||||
lang_layout = lang_layout or global_layout
|
||||
lang_captions = self._translate_lang(
|
||||
language, sami_soup, lang_layout)
|
||||
|
||||
caption_dict[language] = lang_captions
|
||||
|
||||
caption_set = CaptionSet(
|
||||
caption_dict,
|
||||
layout_info=global_layout
|
||||
)
|
||||
|
||||
        # Convert styles from CSS to the internal representation.
        # _translate_parsed_style() mutates each style dict in place, so the
        # values stored in doc_styles are updated by the calls below.
        for styling in list(doc_styles.values()):
            self._translate_parsed_style(styling)

        caption_set.set_styles(doc_styles)
|
||||
|
||||
if caption_set.is_empty():
|
||||
raise CaptionReadNoCaptions("empty caption file")
|
||||
|
||||
return caption_set
|
||||
|
||||
@staticmethod
|
||||
def _get_sami_parser_class():
|
||||
"""Hook method for providing custom SAMIParser classes"""
|
||||
return SAMIParser
|
||||
|
||||
@staticmethod
|
||||
def _get_xml_parser_class():
|
||||
"""Hook method for providing a custom XML parser class"""
|
||||
return BeautifulSoup
|
||||
|
||||
def _build_layout(self, styles, inherit_from=None):
|
||||
"""
|
||||
:type styles: dict
|
||||
:param styles: a dictionary with CSS-like styling rules
|
||||
|
||||
:type inherit_from: Layout
|
||||
:param inherit_from: The Layout with values to be used in case the
|
||||
positioning settings in the styles parameter don't specify
|
||||
something.
|
||||
"""
|
||||
alignment = Alignment.from_horizontal_and_vertical_align(
|
||||
text_align=styles.get('text-align')
|
||||
)
|
||||
return self._get_layout_class()(
|
||||
origin=None,
|
||||
extent=None,
|
||||
padding=self._get_padding(styles),
|
||||
alignment=alignment,
|
||||
inherit_from=inherit_from
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_layout_class():
|
||||
"""Hook method for providing a custom Layout class"""
|
||||
return Layout
|
||||
|
||||
def _get_padding(self, styles):
|
||||
margin_before = self._get_size(styles, 'margin-top')
|
||||
margin_after = self._get_size(styles, 'margin-bottom')
|
||||
margin_start = self._get_size(styles, 'margin-left')
|
||||
margin_end = self._get_size(styles, 'margin-right')
|
||||
if not any([margin_before, margin_after, margin_start, margin_end]):
|
||||
return None
|
||||
return Padding(
|
||||
before=margin_before, # top
|
||||
after=margin_after, # bottom
|
||||
start=margin_start, # left
|
||||
end=margin_end # right
|
||||
)
|
||||
|
||||
def _get_size(self, styles, style_label):
|
||||
value_from_style = styles.get(style_label, None)
|
||||
if not value_from_style:
|
||||
return None
|
||||
return Size.from_string(value_from_style)
|
||||
|
||||
def _translate_lang(self, language, sami_soup, parent_layout):
|
||||
"""
|
||||
For a given language, translate the SAMI XML to internal list of
|
||||
captions.
|
||||
|
||||
:rtype: list
|
||||
"""
|
||||
captions = CaptionList(layout_info=parent_layout)
|
||||
milliseconds = 0
|
||||
|
||||
for p in sami_soup.select('p[lang|=%s]' % language):
|
||||
milliseconds = int(float(p.parent['start']))
|
||||
start = milliseconds * 1000
|
||||
end = 0
|
||||
|
||||
if captions != [] and captions[-1].end == 0:
|
||||
captions[-1].end = milliseconds * 1000
|
||||
|
||||
if p.get_text().strip():
|
||||
self.first_alignment = None
|
||||
styles = self._translate_attrs(p)
|
||||
layout_info = self._build_layout(styles,
|
||||
inherit_from=parent_layout)
|
||||
self.line = []
|
||||
|
||||
self._translate_tag(p, layout_info)
|
||||
caption_layout = self._get_layout_class()(
|
||||
alignment=self.first_alignment,
|
||||
inherit_from=layout_info
|
||||
)
|
||||
for node in self.line:
|
||||
node.layout_info = Layout(
|
||||
alignment=self.first_alignment,
|
||||
inherit_from=node.layout_info
|
||||
)
|
||||
self.first_alignment = None
|
||||
|
||||
caption = Caption(start, end, self.line, styles, caption_layout)
|
||||
captions.append(caption)
|
||||
|
||||
if captions and captions[-1].end == 0:
|
||||
# Arbitrarily make this last 4 seconds. Not ideal...
|
||||
captions[-1].end = (milliseconds + 4000) * 1000
|
||||
|
||||
return captions
|
||||
|
||||
def _get_style_name_from_tag(self, tag):
|
||||
if tag == 'i':
|
||||
return 'italics'
|
||||
elif tag == 'b':
|
||||
return 'bold'
|
||||
elif tag == 'u':
|
||||
return 'underline'
|
||||
else:
|
||||
raise RuntimeError("Unknown style tag")
|
||||
|
||||
def _translate_tag(self, tag, inherit_from=None):
|
||||
"""
|
||||
:param inherit_from: A Layout object extracted from an ancestor tag
|
||||
to be attached to leaf nodes
|
||||
"""
|
||||
# convert text
|
||||
if isinstance(tag, NavigableString):
|
||||
# BeautifulSoup apparently handles unescaping character codes
|
||||
# (e.g. &) automatically. The following variable, therefore,
|
||||
# should contain a plain unicode string.
|
||||
# strips indentation whitespace only
|
||||
pattern = re.compile("^(?:[\n\r]+\s*)?(.+)")
|
||||
result = pattern.search(tag)
|
||||
if not result:
|
||||
return
|
||||
tag_text = result.groups()[0]
|
||||
self.line.append(CaptionNode.create_text(tag_text, inherit_from))
|
||||
# convert line breaks
|
||||
elif tag.name == 'br':
|
||||
self.line.append(CaptionNode.create_break(inherit_from))
|
||||
# convert italics, bold, and underline
|
||||
elif tag.name == 'i' or tag.name == 'b' or tag.name == 'u':
|
||||
style_name = self._get_style_name_from_tag(tag.name)
|
||||
self.line.append(
|
||||
CaptionNode.create_style(True, {style_name: True})
|
||||
)
|
||||
# recursively call function for any children elements
|
||||
for a in tag.contents:
|
||||
self._translate_tag(a, inherit_from)
|
||||
self.line.append(
|
||||
CaptionNode.create_style(False, {style_name: True}))
|
||||
elif tag.name == 'span':
|
||||
self._translate_span(tag, inherit_from)
|
||||
else:
|
||||
# recursively call function for any children elements
|
||||
for a in tag.contents:
|
||||
self._translate_tag(a, inherit_from)
|
||||
|
||||
def _translate_span(self, tag, inherit_from=None):
|
||||
# convert tag attributes
|
||||
args = self._translate_attrs(tag)
|
||||
# only include span tag if attributes returned
|
||||
if args:
|
||||
layout_info = self._build_layout(args, inherit_from)
|
||||
# OLD: Create legacy style node
|
||||
# NEW: But pass new layout object
|
||||
node = CaptionNode.create_style(True, args, layout_info)
|
||||
self.line.append(node)
|
||||
# recursively call function for any children elements
|
||||
for a in tag.contents:
|
||||
# NEW: Pass the layout along so that it's eventually attached
|
||||
# to leaf nodes (e.g. text or break)
|
||||
self._translate_tag(a, layout_info)
|
||||
node = CaptionNode.create_style(False, args, layout_info)
|
||||
self.line.append(node)
|
||||
else:
|
||||
for a in tag.contents:
|
||||
self._translate_tag(a, inherit_from)
|
||||
|
||||
def _translate_attrs(self, tag):
|
||||
attrs = {}
|
||||
css_attrs = tag.attrs
|
||||
|
||||
if 'class' in css_attrs:
|
||||
attrs['class'] = css_attrs['class'][0].lower()
|
||||
if 'id' in css_attrs:
|
||||
attrs['class'] = css_attrs['id'].lower()
|
||||
if 'style' in css_attrs:
|
||||
styles = css_attrs['style'].split(';')
|
||||
attrs.update(self._translate_style(attrs, styles))
|
||||
|
||||
return attrs
|
||||
|
||||
# convert attributes from inline CSS
|
||||
def _translate_style(self, attrs, styles):
|
||||
for style in styles:
|
||||
style = style.split(':')
|
||||
if len(style) == 2:
|
||||
css_property, value = style
|
||||
else:
|
||||
continue
|
||||
if css_property == 'text-align':
|
||||
self._save_first_alignment(value.strip())
|
||||
else:
|
||||
self._translate_css_property(attrs, css_property, value)
|
||||
|
||||
return attrs
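
    # Illustrative example (not part of the original module): an inline
    # style of "text-align:center;font-style:italic" records the 'center'
    # alignment via _save_first_alignment and adds {'italics': True} to
    # the returned attrs dict.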
|
||||
|
||||
def _translate_parsed_style(self, styles):
|
||||
# Keep unknown styles by default
|
||||
attrs = styles
|
||||
for css_property in list(styles.keys()):
|
||||
value = styles[css_property]
|
||||
self._translate_css_property(attrs, css_property, value)
|
||||
|
||||
return attrs
|
||||
|
||||
def _translate_css_property(self, attrs, css_property, value):
|
||||
if css_property == 'font-family':
|
||||
attrs['font-family'] = value.strip()
|
||||
elif css_property == 'font-size':
|
||||
attrs['font-size'] = value.strip()
|
||||
elif css_property == 'font-style' and value.strip() == 'italic':
|
||||
attrs['italics'] = True
|
||||
elif css_property == 'text-decoration' and value.strip() == 'underline':
|
||||
attrs['underline'] = True
|
||||
elif css_property == 'font-weight' and value.strip() == 'bold':
|
||||
attrs['bold'] = True
|
||||
elif css_property == 'lang':
|
||||
attrs['lang'] = value.strip()
|
||||
elif css_property == 'color':
|
||||
attrs['color'] = value.strip()
|
||||
|
||||
def _save_first_alignment(self, align):
|
||||
"""
|
||||
Unlike the other inline CSS attributes parsed in _translate_styles, the
|
||||
'text-align' setting must be applied to a Layout and not to a style
|
||||
because it affects positioning. This Layout must be assigned to the
|
||||
Caption object, and not a Node, because it doesn't make sense to have
|
||||
spans in the same caption with different alignments. Even though the
|
||||
SAMI format seems to in principle accept it, pycaption normalizes to
|
||||
something it can make sense of internally and convert to other formats.
|
||||
|
||||
If there are multiple elements (span, div, etc) in the same line with
|
||||
different alignments, only the first alignment is taken into account.
|
||||
|
||||
If the root element of the caption (sync's first child) has an inline
|
||||
text-align, it is preserved and any children alignment is ignored.
|
||||
|
||||
:param align: A unicode string representing a CSS text-align value
|
||||
"""
|
||||
if not self.first_alignment:
|
||||
self.first_alignment = Alignment.from_horizontal_and_vertical_align( # noqa
|
||||
text_align=align
|
||||
)
|
||||
|
||||
|
||||
class SAMIWriter(BaseWriter):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(SAMIWriter, self).__init__(*args, **kwargs)
|
||||
self.open_span = False
|
||||
self.last_time = None
|
||||
|
||||
def write(self, caption_set):
|
||||
caption_set = deepcopy(caption_set)
|
||||
sami = BeautifulSoup(SAMI_BASE_MARKUP, "lxml-xml")
|
||||
|
||||
caption_set.layout_info = self._relativize_and_fit_to_screen(
|
||||
caption_set.layout_info)
|
||||
|
||||
primary = None
|
||||
|
||||
for lang in caption_set.get_languages():
|
||||
self.last_time = None
|
||||
if primary is None:
|
||||
primary = lang
|
||||
|
||||
caption_set.set_layout_info(
|
||||
lang,
|
||||
self._relativize_and_fit_to_screen(
|
||||
caption_set.get_layout_info(lang))
|
||||
)
|
||||
|
||||
for caption in caption_set.get_captions(lang):
|
||||
# Loop through all captions/nodes and apply transformations to
|
||||
# layout in function of the provided or default settings
|
||||
caption.layout_info = self._relativize_and_fit_to_screen(
|
||||
caption.layout_info)
|
||||
for node in caption.nodes:
|
||||
node.layout_info = self._relativize_and_fit_to_screen(
|
||||
node.layout_info)
|
||||
sami = self._recreate_p_tag(
|
||||
caption, sami, lang, primary, caption_set)
|
||||
|
||||
stylesheet = self._recreate_stylesheet(caption_set)
|
||||
sami.find('style').append(stylesheet)
|
||||
|
||||
a = sami.prettify(formatter=None).split('\n')
|
||||
caption_content = '\n'.join(a[1:])
|
||||
return caption_content
|
||||
|
||||
def _recreate_p_tag(self, caption, sami, lang, primary, captions):
|
||||
"""
|
||||
        Creates a p tag for the given caption, attaches it to the sami
        object, and returns the updated sami object.
|
||||
|
||||
:type caption: Caption
|
||||
:type sami: BeautifulSoup
|
||||
:type lang: unicode
|
||||
:type primary: unicode
|
||||
:type captions: CaptionSet
|
||||
|
||||
:rtype: BeautifulSoup
|
||||
"""
|
||||
time = caption.start / 1000
|
||||
|
||||
if self.last_time and time != self.last_time:
|
||||
sami = self._recreate_blank_tag(
|
||||
sami, caption, lang, primary, captions)
|
||||
|
||||
self.last_time = caption.end / 1000
|
||||
|
||||
sami, sync = self._recreate_sync(sami, lang, primary, time)
|
||||
|
||||
p = sami.new_tag("p")
|
||||
|
||||
p_style = ''
|
||||
for attr, value in list(self._recreate_style(caption.style).items()):
|
||||
p_style += '%s:%s;' % (attr, value)
|
||||
if p_style:
|
||||
p['p_style'] = p_style
|
||||
|
||||
p['class'] = self._recreate_p_lang(caption, lang, captions)
|
||||
p.string = self._recreate_text(caption.nodes)
|
||||
|
||||
sync.append(p)
|
||||
|
||||
return sami
|
||||
|
||||
def _recreate_sync(self, sami, lang, primary, time):
|
||||
"""
|
||||
        Creates a sync tag for a given language and timing (if it doesn't
        already exist), attaches it to the sami body, and returns the sami
        BeautifulSoup object.
|
||||
|
||||
:type sami: BeautifulSoup
|
||||
:type lang: unicode
|
||||
:type primary: unicode
|
||||
:type time: int
|
||||
|
||||
:rtype: BeautifulSoup
|
||||
"""
|
||||
if lang == primary:
|
||||
sync = sami.new_tag("sync", start="%d" % time)
|
||||
sami.body.append(sync)
|
||||
else:
|
||||
sync = sami.find("sync", start="%d" % time)
|
||||
if sync is None:
|
||||
sami, sync = self._find_closest_sync(sami, time)
|
||||
|
||||
return sami, sync
|
||||
|
||||
def _find_closest_sync(self, sami, time):
|
||||
sync = sami.new_tag("sync", start="%d" % time)
|
||||
|
||||
earlier = sami.find_all("sync", start=lambda x: int(x) < time)
|
||||
if earlier:
|
||||
last_sync = earlier[-1]
|
||||
last_sync.insert_after(sync)
|
||||
else:
|
||||
def later_syncs(start):
|
||||
return int(start) > time
|
||||
later = sami.find_all("sync", start=later_syncs)
|
||||
if later:
|
||||
last_sync = later[0]
|
||||
last_sync.insert_before(sync)
|
||||
return sami, sync
|
||||
|
||||
def _recreate_blank_tag(self, sami, caption, lang, primary, captions):
|
||||
sami, sync = self._recreate_sync(sami, lang, primary, self.last_time)
|
||||
|
||||
p = sami.new_tag("p")
|
||||
p['class'] = self._recreate_p_lang(caption, lang, captions)
|
||||
p.string = ' '
|
||||
|
||||
sync.append(p)
|
||||
|
||||
return sami
|
||||
|
||||
def _recreate_p_lang(self, caption, lang, captions):
|
||||
try:
|
||||
if 'lang' in captions.get_style(caption.style['class']):
|
||||
return caption.style['class']
|
||||
except KeyError:
|
||||
pass
|
||||
return lang
|
||||
|
||||
def _recreate_stylesheet(self, caption_set):
|
||||
stylesheet = '<!--'
|
||||
|
||||
for attr, value in caption_set.get_styles():
|
||||
if value != {}:
|
||||
stylesheet += self._recreate_style_block(
|
||||
attr, value, caption_set.layout_info)
|
||||
|
||||
for lang in caption_set.get_languages():
|
||||
lang_string = 'lang: {}'.format(lang)
|
||||
if lang_string not in stylesheet:
|
||||
stylesheet += self._recreate_style_block(
|
||||
lang, {'lang': lang}, caption_set.get_layout_info(lang))
|
||||
|
||||
return stylesheet + ' -->'
|
||||
|
||||
def _recreate_style_block(self, target, rules, layout_info):
|
||||
"""
|
||||
:param target: A unicode string representing the target of the styling
|
||||
rules.
|
||||
:param rules: A dictionary with CSS-like styling rules.
|
||||
|
||||
:param layout_info: A Layout object providing positioning information
|
||||
to be converted to CSS
|
||||
"""
|
||||
if target not in ['p', 'sync', 'span']:
|
||||
# If it's not a valid SAMI element, then it's a custom class name
|
||||
selector = '.{}'.format(target)
|
||||
else:
|
||||
selector = target
|
||||
|
||||
sami_style = '\n {} {{\n '.format(selector)
|
||||
|
||||
if layout_info and layout_info.padding:
|
||||
rules.update({
|
||||
'margin-top': six.text_type(layout_info.padding.before),
|
||||
'margin-right': six.text_type(layout_info.padding.end),
|
||||
'margin-bottom': six.text_type(layout_info.padding.after),
|
||||
'margin-left': six.text_type(layout_info.padding.start),
|
||||
})
|
||||
|
||||
for attr, value in sorted(self._recreate_style(rules).items()):
|
||||
sami_style += ' {}: {};\n '.format(attr, value)
|
||||
|
||||
return sami_style + '}\n'
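
    # Illustrative example (not part of the original module): a target of
    # 'encc' with rules {'lang': 'en-US'} is emitted as a '.encc { ... }'
    # block containing 'lang: en-US;', while the 'p', 'sync' and 'span'
    # targets keep their bare element selectors.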
|
||||
|
||||
def _recreate_text(self, caption):
|
||||
line = ''
|
||||
|
||||
for node in caption:
|
||||
if node.type_ == CaptionNode.TEXT:
|
||||
line += self._encode(node.content) + ' '
|
||||
elif node.type_ == CaptionNode.BREAK:
|
||||
line = line.rstrip() + '<br/>\n '
|
||||
elif node.type_ == CaptionNode.STYLE:
|
||||
line = self._recreate_line_style(line, node)
|
||||
|
||||
return line.rstrip()
|
||||
|
||||
def _recreate_line_style(self, line, node):
|
||||
if node.start:
|
||||
if self.open_span:
|
||||
line = line.rstrip() + '</span> '
|
||||
line = self._recreate_span(line, node.content)
|
||||
else:
|
||||
if self.open_span:
|
||||
line = line.rstrip() + '</span> '
|
||||
self.open_span = False
|
||||
|
||||
return line
|
||||
|
||||
def _recreate_span(self, line, content):
|
||||
style = ''
|
||||
klass = ''
|
||||
if 'class' in content:
|
||||
klass += ' class="%s"' % content['class']
|
||||
|
||||
for attr, value in list(self._recreate_style(content).items()):
|
||||
style += '%s:%s;' % (attr, value)
|
||||
|
||||
if style or klass:
|
||||
if style:
|
||||
style = ' style="%s"' % style
|
||||
line += '<span%s%s>' % (klass, style)
|
||||
self.open_span = True
|
||||
|
||||
return line
|
||||
|
||||
def _recreate_style(self, rules):
|
||||
"""
|
||||
:param rules: A dictionary with CSS-like styling rules
|
||||
"""
|
||||
sami_style = {}
|
||||
|
||||
for key, value in list(rules.items()):
|
||||
# Recreate original CSS rules from internal style
|
||||
if key == 'italics' and value == True:
|
||||
sami_style['font-style'] = 'italic'
|
||||
elif key == 'bold' and value == True:
|
||||
sami_style['font-weight'] = 'bold'
|
||||
elif key == 'underline' and value == True:
|
||||
sami_style['text-decoration'] = 'underline'
|
||||
else:
|
||||
sami_style[key] = value
|
||||
|
||||
return sami_style
|
||||
|
||||
def _encode(self, s):
|
||||
"""
|
||||
        Escapes any special characters in a plain unicode string so that it
        can be written into a SAMI file.
|
||||
:type s: unicode
|
||||
"""
|
||||
return escape(s)
|
||||
|
||||
|
||||
class SAMIParser(HTMLParser):
|
||||
def __init__(self, *args, **kw):
|
||||
HTMLParser.__init__(self, *args, **kw)
|
||||
self.sami = ''
|
||||
self.line = ''
|
||||
self.styles = {}
|
||||
self.queue = deque()
|
||||
self.langs = set()
|
||||
self.last_element = ''
|
||||
self.name2codepoint = name2codepoint.copy()
|
||||
self.name2codepoint['apos'] = 0x0027
|
||||
self.convert_charrefs = False
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
"""
|
||||
Override the parser's handling of starttags
|
||||
        :param tag: unicode string indicating the tag type (e.g. "head" or "p")
        :param attrs: list of attribute tuples of type (u'name', u'value')
|
||||
"""
|
||||
self.last_element = tag
|
||||
|
||||
# treat divs as spans
|
||||
if tag == 'div':
|
||||
tag = 'span'
|
||||
|
||||
# figure out the caption language of P tags
|
||||
if tag == 'p':
|
||||
lang = self._find_lang(attrs)
|
||||
|
||||
# if no language detected, set it as the default
|
||||
lang = lang or DEFAULT_LANGUAGE_CODE
|
||||
attrs.append(('lang', lang))
|
||||
self.langs.add(lang)
|
||||
|
||||
# clean-up line breaks
|
||||
if tag == 'br':
|
||||
self.sami += "<br/>"
|
||||
# add tag to queue
|
||||
else:
|
||||
# if already in queue, first close tags off in LIFO order
|
||||
while tag in self.queue:
|
||||
closer = self.queue.pop()
|
||||
self.sami += "</%s>" % closer
|
||||
# open new tag in queue
|
||||
self.queue.append(tag)
|
||||
# add tag with attributes
|
||||
for attr, value in attrs:
|
||||
tag += ' %s="%s"' % (attr.lower(), value)
|
||||
self.sami += "<%s>" % tag
|
||||
|
||||
# override the parser's handling of endtags
|
||||
def handle_endtag(self, tag):
|
||||
# treat divs as spans
|
||||
if tag == 'div':
|
||||
tag = 'span'
|
||||
|
||||
# handle incorrectly formatted sync/p tags
|
||||
if tag in ['p', 'sync'] and tag == self.last_element:
|
||||
return
|
||||
|
||||
# close off tags in LIFO order, if matching starting tag in queue
|
||||
while tag in self.queue:
|
||||
closing_tag = self.queue.pop()
|
||||
self.sami += "</%s>" % closing_tag
|
||||
|
||||
def handle_entityref(self, name):
|
||||
if name in ['gt', 'lt']:
|
||||
self.sami += '&%s;' % name
|
||||
else:
|
||||
try:
|
||||
self.sami += chr(self.name2codepoint[name])
|
||||
except (KeyError, ValueError):
|
||||
self.sami += '&%s' % name
|
||||
|
||||
self.last_element = ''
|
||||
|
||||
def handle_charref(self, name):
|
||||
if name[0] == 'x':
|
||||
self.sami += chr(int(name[1:], 16))
|
||||
else:
|
||||
self.sami += chr(int(name))
|
||||
|
||||
# override the parser's handling of data
|
||||
def handle_data(self, data):
|
||||
self.sami += data
|
||||
self.last_element = ''
|
||||
|
||||
# override the parser's feed function
|
||||
def feed(self, data):
|
||||
"""
|
||||
:param data: Raw SAMI unicode string
|
||||
:returns: tuple (unicode, dict, set)
|
||||
"""
|
||||
no_cc = 'no closed captioning available'
|
||||
|
||||
if '<html' in data.lower():
|
||||
raise CaptionReadSyntaxError(
|
||||
'SAMI File seems to be an HTML file.')
|
||||
elif no_cc in data.lower():
|
||||
raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)
|
||||
|
||||
# try to find style tag in SAMI
|
||||
try:
|
||||
# prevent BS4 error with huge SAMI files with unclosed tags
|
||||
index = data.lower().find("</head>")
|
||||
|
||||
self.styles = self._css_parse(
|
||||
BeautifulSoup(data[:index], "lxml").find('style').get_text())
|
||||
except AttributeError:
|
||||
self.styles = {}
|
||||
|
||||
# fix erroneous italics tags
|
||||
data = data.replace('<i/>', '<i>')
|
||||
|
||||
# fix awkward tags found in some SAMIs
|
||||
data = data.replace(';>', '>')
|
||||
try:
|
||||
HTMLParser.feed(self, data)
|
||||
except HTMLParseError as e:
|
||||
raise CaptionReadSyntaxError(e)
|
||||
|
||||
# close any tags that remain in the queue
|
||||
while self.queue != deque([]):
|
||||
closing_tag = self.queue.pop()
|
||||
self.sami += "</%s>" % closing_tag
|
||||
|
||||
return self.sami, self.styles, self.langs
|
||||
|
||||
# parse the SAMI's stylesheet
|
||||
def _css_parse(self, css):
|
||||
"""
|
||||
Parse styling via cssutils modules
|
||||
:rtype: dict
|
||||
"""
|
||||
sheet = parseString(css)
|
||||
style_sheet = {}
|
||||
|
||||
for rule in sheet:
|
||||
new_style = {}
|
||||
selector = rule.selectorText.lower()
|
||||
if selector[0] in ['#', '.']:
|
||||
selector = selector[1:]
|
||||
# keep any style attributes that are needed
|
||||
for prop in rule.style:
|
||||
if prop.name == 'color':
|
||||
cv = cssutils_css.ColorValue(prop.value)
|
||||
# Code for RGB to hex conversion comes from
|
||||
# http://bit.ly/1kwfBnQ
|
||||
new_style['color'] = "#%02x%02x%02x" % (
|
||||
cv.red, cv.green, cv.blue)
|
||||
else:
|
||||
new_style[prop.name] = prop.value
|
||||
if new_style:
|
||||
style_sheet[selector] = new_style
|
||||
|
||||
return style_sheet
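
    # Illustrative example (not part of the original module): a stylesheet
    # such as ".ENCC { color: rgb(255, 0, 0); font-family: Arial; }" would
    # roughly parse to {'encc': {'color': '#ff0000', 'font-family': 'Arial'}}
    # (selectors are lowercased and their leading '.'/'#' stripped, colors
    # normalized to hex).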
|
||||
|
||||
def _find_lang(self, attrs):
|
||||
for attr, value in attrs:
|
||||
# if lang is an attribute of the tag
|
||||
if attr.lower() == 'lang':
|
||||
return value[:2]
|
||||
# if the P tag has a class, try and find the language
|
||||
if attr.lower() == 'class':
|
||||
try:
|
||||
return self.styles[value.lower()]['lang']
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
return None
|
||||
696
utils/modules/pycaption/scc/__init__.py
Normal file
@@ -0,0 +1,696 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
3 types of SCC captions:
|
||||
Roll-Up
|
||||
Paint-On
|
||||
Pop-On
|
||||
|
||||
Commands:
|
||||
94ae - [ENM] - Erase Non-displayed(buffer) Memory
|
||||
942c - [EDM] - Erase Displayed Memory
|
||||
9420 - [RCL] - Resume Caption Loading
|
||||
9429 - [RDC] - Resume Direct Captioning
|
||||
|
||||
9425, 9426, 94a7 - [RU2], [RU3], [RU4] (roll up captions 2,3 or 4 rows)
|
||||
- these commands set the number of expected lines
|
||||
|
||||
94ad - (in CEA-608-E: 142d) - [CR] carriage return.
|
||||
    - This actually rolls the captions up as many rows as specified by
      [RU2], [RU3], or [RU4]
|
||||
|
||||
80 - no-op char. Doesn't do anything, but must be used with other
|
||||
characters, to make a 2 byte word
|
||||
|
||||
97a1, 97a2, 9723 - [TO] move 1, 2 or 3 columns - Tab Over command
|
||||
- this moves the positioning 1, 2, or 3 columns to the right
|
||||
- Nothing regarding this is implemented.
|
||||
|
||||
942f - [EOC] - display the buffer on the screen - End Of Caption
|
||||
... - [PAC] - Preamble address code (can set positioning and style)
|
||||
- All the PACs are specified by the first and second byte combined
|
||||
from pycaption.scc.constants.PAC_BYTES_TO_POSITIONING_MAP
|
||||
|
||||
9429 - [RDC] - Resume Direct Captioning
|
||||
94a4 - (in CEA-608-E: 1424) - [DER] Delete to End of Row
|
||||
|
||||
|
||||
Pop-On:
|
||||
    The commands should usually appear in this order. This is not strict,
    though, and the commands don't necessarily have to be on the same row.
|
||||
|
||||
1. 94ae [ENM] (erase non displayed memory)
|
||||
2. 9420 [RCL] (resume caption loading => this command here means we're using Pop-On captions)
|
||||
    2.1? [ENM] - if step 1 was skipped?
|
||||
3. [PAC] Positioning/ styling command (can position on columns divisible by 4)
|
||||
       This control code is called the Preamble Address Code [PAC].
|
||||
4. If positioning needs to be on columns not divisible by 4, use a [TO] command
|
||||
5. text
|
||||
6. 942c [EDM] - optionally, erase the currently displayed caption
|
||||
7. 942f [EOC] display the caption
|
||||
|
||||
|
||||
Roll-Up:
|
||||
1. [RU2], [RU3] or [RU4] - sets Roll-Up style and depth
|
||||
- these set the Roll-Up style: (characteristic command)
|
||||
    2. [CR] to roll the display up 1 row
|
||||
3. [PAC] - sets the indent of the base row
|
||||
|
||||
|
||||
Paint-On:
|
||||
1. [RDC] - sets the Paint-On style (characteristic command)
|
||||
2. [PAC]
|
||||
3. text
|
||||
4. [PAC]
|
||||
5. text or [DER]
|
||||
|
||||
There are some rules regarding the parity of the commands.
|
||||
|
||||
This resource:
|
||||
http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML
|
||||
specifies that there are interpreters which only work if the commands have
odd parity. This however is not applied consistently, and we might not
handle these cases well. Odd parity of a command means that converting the
word into binary should result in an odd number of '1's. The PAC commands
obey this rule, but some others do not; some commands that do not are found
in the COMMANDS dictionary. This is legacy logic that I didn't know how to
handle, and just carried over when implementing positioning.
|
||||
"""
|
||||
|
||||
import re
|
||||
import math
|
||||
import textwrap
|
||||
from copy import deepcopy
|
||||
|
||||
import six
|
||||
|
||||
from pycaption.base import (
|
||||
BaseReader, BaseWriter, CaptionSet, CaptionNode,
|
||||
)
|
||||
from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError
|
||||
from .constants import (
|
||||
HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS,
|
||||
MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
|
||||
SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
|
||||
PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
|
||||
)
|
||||
from .specialized_collections import (
|
||||
TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
|
||||
InstructionNodeCreator)
|
||||
from .state_machines import DefaultProvidingPositionTracker
|
||||
|
||||
|
||||
class NodeCreatorFactory(object):
|
||||
"""Will return instances of the given node_creator.
|
||||
|
||||
This is used as a means of creating new InstructionNodeCreator instances,
|
||||
    because these need to share state beyond the lifetime of any single
    instance. Storing that information at the class level is not good
    either, because it must be erased after the reader's .read() operation
    completes.
|
||||
"""
|
||||
def __init__(self, position_tracker,
|
||||
node_creator=InstructionNodeCreator):
|
||||
self.position_tracker = position_tracker
|
||||
self.node_creator = node_creator
|
||||
|
||||
def new_creator(self):
|
||||
"""Returns a new instance of self.node_creator, initialized with
|
||||
the same italics_tracker, and position_tracker
|
||||
"""
|
||||
return self.node_creator(position_tracker=self.position_tracker)
|
||||
|
||||
def from_list(self, roll_rows):
|
||||
"""Wraps the node_creator's method with the same name
|
||||
|
||||
:param roll_rows: list of node_creator instances
|
||||
|
||||
:return: a node_creator instance
|
||||
"""
|
||||
return self.node_creator.from_list(
|
||||
roll_rows,
|
||||
position_tracker=self.position_tracker
|
||||
)
|
||||
|
||||
|
||||
def get_corrected_end_time(caption):
|
||||
"""If the last caption was never explicitly ended, set its end time to
|
||||
start + 4 seconds
|
||||
|
||||
:param Caption caption: the last caption
|
||||
:rtype: int
|
||||
"""
|
||||
if caption.end:
|
||||
return caption.end
|
||||
|
||||
return caption.start + 4 * 1000 * 1000
|
||||
|
||||
|
||||
class SCCReader(BaseReader):
|
||||
"""Converts a given unicode string to a CaptionSet.
|
||||
|
||||
This can be then later used for converting into any other supported formats
|
||||
"""
|
||||
def __init__(self, *args, **kw):
|
||||
self.caption_stash = CaptionCreator()
|
||||
self.time_translator = _SccTimeTranslator()
|
||||
|
||||
self.node_creator_factory = NodeCreatorFactory(
|
||||
DefaultProvidingPositionTracker()
|
||||
)
|
||||
|
||||
self.last_command = ''
|
||||
|
||||
self.buffer_dict = NotifyingDict()
|
||||
|
||||
self.buffer_dict['pop'] = self.node_creator_factory.new_creator()
|
||||
self.buffer_dict['paint'] = self.node_creator_factory.new_creator()
|
||||
self.buffer_dict['roll'] = self.node_creator_factory.new_creator()
|
||||
|
||||
# Call this method when the active key changes
|
||||
self.buffer_dict.add_change_observer(self._flush_implicit_buffers)
|
||||
self.buffer_dict.set_active('pop')
|
||||
|
||||
self.roll_rows = []
|
||||
self.roll_rows_expected = 0
|
||||
self.simulate_roll_up = False
|
||||
|
||||
self.time = 0
|
||||
|
||||
def detect(self, content):
|
||||
"""Checks whether the given content is a proper SCC file
|
||||
|
||||
:type content: unicode
|
||||
|
||||
:rtype: bool
|
||||
"""
|
||||
lines = content.splitlines()
|
||||
if lines[0] == HEADER:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
|
||||
"""Converts the unicode string into a CaptionSet
|
||||
|
||||
:type content: six.text_type
|
||||
:param content: The SCC content to be converted to a CaptionSet
|
||||
|
||||
:type lang: six.text_type
|
||||
:param lang: The language of the caption
|
||||
|
||||
:type simulate_roll_up: bool
|
||||
:param simulate_roll_up: If True, when converting to other formats,
|
||||
the resulting captions will contain all the rows that were visible
|
||||
on the screen when the captions were rolling up.
|
||||
|
||||
        :type offset: int
        :param offset: Number of seconds to subtract from every caption's
            calculated time (helpful when the captions are offset from the
            video).
|
||||
|
||||
:rtype: CaptionSet
|
||||
"""
|
||||
if type(content) != six.text_type:
|
||||
raise InvalidInputError('The content is not a unicode string.')
|
||||
|
||||
self.simulate_roll_up = simulate_roll_up
|
||||
self.time_translator.offset = offset * 1000000
|
||||
# split lines
|
||||
lines = content.splitlines()
|
||||
|
||||
# loop through each line except the first
|
||||
for line in lines[1:]:
|
||||
self._translate_line(line)
|
||||
|
||||
self._flush_implicit_buffers()
|
||||
|
||||
captions = CaptionSet({lang: self.caption_stash.get_all()})
|
||||
|
||||
# check captions for incorrect lengths
|
||||
for cap in captions.get_captions(lang):
|
||||
            # if a caption has an end time and it lasts less than .05s,
            # reject the input (such captions are likely caused by a
            # standalone EOC marker in the SCC file)
|
||||
if 0 < cap.end - cap.start < 50000:
|
||||
raise ValueError('unsupported length found in SCC input file: ' + str(cap))
|
||||
|
||||
if captions.is_empty():
|
||||
raise CaptionReadNoCaptions("empty caption file")
|
||||
else:
|
||||
last_caption = captions.get_captions(lang)[-1]
|
||||
last_caption.end = get_corrected_end_time(last_caption)
|
||||
|
||||
return captions
|
||||
|
||||
def _fix_last_timing(self, timing):
|
||||
"""HACK HACK: Certain Paint-On captions don't specify the 942f [EOC]
|
||||
(End Of Caption) command on the same line.
|
||||
If this is a 942f line, also simulate a 942c (Erase Displayed Memory)
|
||||
to properly set the timing on the last caption.
|
||||
|
||||
This method needs some serious attention, because it proves the timing
|
||||
calculation is not done well for Pop-On captions
|
||||
"""
|
||||
# Calculate the end time from the current line
|
||||
time_translator = _SccTimeTranslator()
|
||||
time_translator.start_at(timing)
|
||||
time_translator.offset = self.time_translator.offset
|
||||
|
||||
# But use the current time translator for the start time
|
||||
self.caption_stash.create_and_store(
|
||||
self.buffer, self.time_translator.get_time())
|
||||
|
||||
self.caption_stash.correct_last_timing(time_translator.get_time())
|
||||
        self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
def _flush_implicit_buffers(self, old_key=None, *args):
|
||||
"""Convert to Captions those buffers whose behavior is implicit.
|
||||
|
||||
The Paint-On buffer is explicit. New captions are created from it
|
||||
with the command 'End Of Caption' [EOC], '942f'
|
||||
|
||||
The other 2 buffers, Roll-Up and Paint-On we treat as "more" implicit,
|
||||
meaning that they can be displayed by a command on the next row.
|
||||
If they're on the last row however, or if the caption type is changing,
|
||||
we make sure to convert the buffers to text, so we don't lose any info.
|
||||
"""
|
||||
if old_key == 'pop':
|
||||
return
|
||||
|
||||
elif old_key is None or old_key == 'roll':
|
||||
if not self.buffer.is_empty():
|
||||
self._roll_up()
|
||||
|
||||
elif old_key is None or old_key == 'paint':
|
||||
# xxx - perhaps the self.buffer property is sufficient
|
||||
if not self.buffer_dict['paint'].is_empty():
|
||||
self.caption_stash.create_and_store(
|
||||
self.buffer_dict['paint'], self.time)
|
||||
|
||||
def _translate_line(self, line):
|
||||
# ignore blank lines
|
||||
if line.strip() == '':
|
||||
return
|
||||
|
||||
# split line in timestamp and words
|
||||
r = re.compile(r"([0-9:;]*)([\s\t]*)((.)*)")
|
||||
parts = r.findall(line.lower())
|
||||
|
||||
# XXX!!!!!! THESE 2 LINES ARE A HACK
|
||||
if parts[0][2].strip() == '942f':
|
||||
self._fix_last_timing(timing=parts[0][0])
|
||||
|
||||
self.time_translator.start_at(parts[0][0])
|
||||
|
||||
# loop through each word
|
||||
for word in parts[0][2].split(' '):
|
||||
# ignore empty results
|
||||
if word.strip() != '':
|
||||
self._translate_word(word)
|
||||
|
||||
def _translate_word(self, word):
|
||||
# count frames for timing
|
||||
self.time_translator.increment_frames()
|
||||
|
||||
# first check if word is a command
|
||||
# TODO - check that all the positioning commands are here, or use
|
||||
# some other strategy to determine if the word is a command.
|
||||
if word in COMMANDS or _is_pac_command(word):
|
||||
self._translate_command(word)
|
||||
|
||||
# second, check if word is a special character
|
||||
elif word in SPECIAL_CHARS:
|
||||
self._translate_special_char(word)
|
||||
|
||||
elif word in EXTENDED_CHARS:
|
||||
self._translate_extended_char(word)
|
||||
|
||||
# third, try to convert word into 2 characters
|
||||
else:
|
||||
self._translate_characters(word)
|
||||
|
||||
def _handle_double_command(self, word):
|
||||
# ensure we don't accidentally use the same command twice
|
||||
if word == self.last_command:
|
||||
self.last_command = ''
|
||||
return True
|
||||
else:
|
||||
self.last_command = word
|
||||
return False
|
||||
|
||||
def _translate_special_char(self, word):
|
||||
# XXX - this looks highly buggy. Why should special chars be ignored
|
||||
# when printed 2 times one after another?
|
||||
if self._handle_double_command(word):
|
||||
return
|
||||
|
||||
self.buffer.add_chars(SPECIAL_CHARS[word])
|
||||
|
||||
def _translate_extended_char(self, word):
|
||||
# XXX - this looks highly buggy. Why would a special char be ignored
|
||||
# if it's printed 2 times one after another?
|
||||
if self._handle_double_command(word):
|
||||
return
|
||||
|
||||
# add to buffer
|
||||
self.buffer.add_chars(EXTENDED_CHARS[word])
|
||||
|
||||
def _translate_command(self, word):
|
||||
if self._handle_double_command(word):
|
||||
return
|
||||
|
||||
# if command is pop_up
|
||||
if word == '9420':
|
||||
self.buffer_dict.set_active('pop')
|
||||
|
||||
# command is paint_on [Resume Direct Captioning]
|
||||
elif word == '9429':
|
||||
self.buffer_dict.set_active('paint')
|
||||
|
||||
self.roll_rows_expected = 1
|
||||
if not self.buffer.is_empty():
|
||||
self.caption_stash.create_and_store(
|
||||
self.buffer, self.time
|
||||
)
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
self.time = self.time_translator.get_time()
|
||||
|
||||
# if command is roll_up 2, 3 or 4 rows
|
||||
elif word in ('9425', '9426', '94a7'):
|
||||
self.buffer_dict.set_active('roll')
|
||||
|
||||
# count how many lines are expected
|
||||
if word == '9425':
|
||||
self.roll_rows_expected = 2
|
||||
elif word == '9426':
|
||||
self.roll_rows_expected = 3
|
||||
elif word == '94a7':
|
||||
self.roll_rows_expected = 4
|
||||
|
||||
# if content is in the queue, turn it into a caption
|
||||
if not self.buffer.is_empty():
|
||||
self.caption_stash.create_and_store(
|
||||
self.buffer, self.time)
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
# set rows to empty, configure start time for caption
|
||||
self.roll_rows = []
|
||||
self.time = self.time_translator.get_time()
|
||||
|
||||
# clear pop_on buffer
|
||||
elif word == '94ae':
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
# display pop_on buffer [End Of Caption]
|
||||
elif word == '942f':
|
||||
self.time = self.time_translator.get_time()
|
||||
self.caption_stash.create_and_store(self.buffer, self.time)
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
# roll up captions [Carriage Return]
|
||||
elif word == '94ad':
|
||||
# display roll-up buffer
|
||||
if not self.buffer.is_empty():
|
||||
self._roll_up()
|
||||
|
||||
# clear screen
|
||||
elif word == '942c':
|
||||
self.roll_rows = []
|
||||
|
||||
# XXX - The 942c command has nothing to do with paint-ons
|
||||
# This however is legacy code, and will break lots of tests if
|
||||
# the proper buffer (self.buffer) is used.
|
||||
# Most likely using `self.buffer` instead of the paint buffer
|
||||
# is the right thing to do, but this needs some further attention.
|
||||
if not self.buffer_dict['paint'].is_empty():
|
||||
self.caption_stash.create_and_store(
|
||||
self.buffer_dict['paint'], self.time)
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
# attempt to add proper end time to last caption(s)
|
||||
self.caption_stash.correct_last_timing(
|
||||
self.time_translator.get_time())
|
||||
|
||||
# if command not one of the aforementioned, add to buffer
|
||||
else:
|
||||
self.buffer.interpret_command(word)
|
||||
|
||||
def _translate_characters(self, word):
|
||||
# split word into the 2 bytes
|
||||
byte1 = word[:2]
|
||||
byte2 = word[2:]
|
||||
|
||||
        # check to see if the bytes are recognized characters
|
||||
if byte1 not in CHARACTERS or byte2 not in CHARACTERS:
|
||||
return
|
||||
|
||||
self.buffer.add_chars(CHARACTERS[byte1], CHARACTERS[byte2])
|
||||
|
||||
@property
|
||||
def buffer(self):
|
||||
"""Returns the currently active buffer
|
||||
"""
|
||||
return self.buffer_dict.get_active()
|
||||
|
||||
@buffer.setter
|
||||
def buffer(self, value):
|
||||
"""Sets a new value to the active key
|
||||
|
||||
:param value: any object
|
||||
"""
|
||||
try:
|
||||
key = self.buffer_dict.active_key
|
||||
self.buffer_dict[key] = value
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
def _roll_up(self):
|
||||
        # We expect the active buffer to be the roll buffer
|
||||
if self.simulate_roll_up:
|
||||
if self.roll_rows_expected > 1:
|
||||
if len(self.roll_rows) >= self.roll_rows_expected:
|
||||
self.roll_rows.pop(0)
|
||||
|
||||
self.roll_rows.append(self.buffer)
|
||||
self.buffer = self.node_creator_factory.from_list(
|
||||
self.roll_rows)
|
||||
|
||||
# convert buffer and empty
|
||||
self.caption_stash.create_and_store(self.buffer, self.time)
|
||||
self.buffer = self.node_creator_factory.new_creator()
|
||||
|
||||
# configure time
|
||||
self.time = self.time_translator.get_time()
|
||||
|
||||
# try to insert the proper ending time for the previous caption
|
||||
self.caption_stash.correct_last_timing(self.time, force=True)
|
||||
|
||||
|
||||
class SCCWriter(BaseWriter):
|
||||
|
||||
def __init__(self, *args, **kw):
|
||||
super(SCCWriter, self).__init__(*args, **kw)
|
||||
|
||||
def write(self, caption_set):
|
||||
output = HEADER + '\n\n'
|
||||
|
||||
if caption_set.is_empty():
|
||||
return output
|
||||
|
||||
caption_set = deepcopy(caption_set)
|
||||
|
||||
# Only support one language.
|
||||
lang = list(caption_set.get_languages())[0]
|
||||
captions = caption_set.get_captions(lang)
|
||||
|
||||
# PASS 1: compute codes for each caption
|
||||
codes = [(self._text_to_code(caption), caption.start, caption.end)
|
||||
for caption in captions]
|
||||
|
||||
# PASS 2:
|
||||
# Advance start times so as to have time to write to the pop-on
|
||||
# buffer; possibly remove the previous clear-screen command
|
||||
for index, (code, start, end) in enumerate(codes):
|
||||
code_words = len(code) / 5 + 8
|
||||
code_time_microseconds = code_words * MICROSECONDS_PER_CODEWORD
|
||||
code_start = start - code_time_microseconds
|
||||
if index == 0:
|
||||
continue
|
||||
previous_code, previous_start, previous_end = codes[index-1]
|
||||
if previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start:
|
||||
codes[index-1] = (previous_code, previous_start, None)
|
||||
codes[index] = (code, code_start, end)
|
||||
|
||||
# PASS 3:
|
||||
# Write captions.
|
||||
for (code, start, end) in codes:
|
||||
output += ('%s\t' % self._format_timestamp(start))
|
||||
output += '94ae 94ae 9420 9420 '
|
||||
output += code
|
||||
output += '942c 942c 942f 942f\n\n'
|
||||
if end is not None:
|
||||
output += '%s\t942c 942c\n\n' % self._format_timestamp(end)
|
||||
|
||||
return output
|
||||
|
||||
# Wrap lines at 32 chars
|
||||
@staticmethod
|
||||
def _layout_line(caption):
|
||||
def caption_node_to_text(caption_node):
|
||||
if caption_node.type_ == CaptionNode.TEXT:
|
||||
return six.text_type(caption_node.content)
|
||||
elif caption_node.type_ == CaptionNode.BREAK:
|
||||
return '\n'
|
||||
caption_text = ''.join(
|
||||
[caption_node_to_text(node) for node in caption.nodes])
|
||||
inner_lines = caption_text.split('\n')
|
||||
inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines]
|
||||
return '\n'.join(inner_lines_laid_out)
|
||||
|
||||
@staticmethod
|
||||
def _maybe_align(code):
|
||||
# Finish a half-word with a no-op so we can move to a full word
|
||||
if len(code) % 5 == 2:
|
||||
code += '80 '
|
||||
return code
|
||||
|
||||
@staticmethod
|
||||
def _maybe_space(code):
|
||||
if len(code) % 5 == 4:
|
||||
code += ' '
|
||||
return code
|
||||
|
||||
def _print_character(self, code, char):
|
||||
try:
|
||||
char_code = CHARACTER_TO_CODE[char]
|
||||
except KeyError:
|
||||
try:
|
||||
char_code = SPECIAL_OR_EXTENDED_CHAR_TO_CODE[char]
|
||||
except KeyError:
|
||||
char_code = '91b6' # Use £ as "unknown character" symbol
|
||||
|
||||
if len(char_code) == 2:
|
||||
return code + char_code
|
||||
elif len(char_code) == 4:
|
||||
return self._maybe_align(code) + char_code
|
||||
else:
|
||||
# This should not happen!
|
||||
return code
|
||||
|
||||
def _text_to_code(self, s):
|
||||
code = ''
|
||||
lines = self._layout_line(s).split('\n')
|
||||
for row, line in enumerate(lines):
|
||||
row += 16 - len(lines)
|
||||
# Move cursor to column 0 of the destination row
|
||||
for _ in range(2):
|
||||
code += ('%s%s ' % (PAC_HIGH_BYTE_BY_ROW[row],
|
||||
PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]))
|
||||
# Print the line using the SCC encoding
|
||||
for char in line:
|
||||
code = self._print_character(code, char)
|
||||
code = self._maybe_space(code)
|
||||
code = self._maybe_align(code)
|
||||
return code
|
||||
|
||||
@staticmethod
|
||||
def _format_timestamp(microseconds):
|
||||
seconds_float = microseconds / 1000.0 / 1000.0
|
||||
# Convert to non-drop-frame timecode
|
||||
seconds_float *= 1000.0 / 1001.0
|
||||
hours = math.floor(seconds_float / 3600)
|
||||
seconds_float -= hours * 3600
|
||||
minutes = math.floor(seconds_float / 60)
|
||||
seconds_float -= minutes * 60
|
||||
seconds = math.floor(seconds_float)
|
||||
seconds_float -= seconds
|
||||
frames = math.floor(seconds_float * 30)
|
||||
return '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frames)
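
    # Illustrative example (not part of the original module): 10,000,000
    # microseconds (10s of wall-clock time) maps to the non-drop-frame
    # timecode '00:00:09:29', because 10s * 1000/1001 ~= 9.99s, i.e.
    # 9 whole seconds plus 29 frames at 30 fps.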
|
||||
|
||||
|
||||
class _SccTimeTranslator(object):
|
||||
"""Converts SCC time to microseconds, keeping track of frames passed
|
||||
"""
|
||||
def __init__(self):
|
||||
self._time = '00:00:00;00'
|
||||
|
||||
# microseconds. The offset from which we begin the time calculation
|
||||
self.offset = 0
|
||||
self._frames = 0
|
||||
|
||||
def get_time(self):
|
||||
"""Returns the time, in microseconds. Takes into account the number of
|
||||
frames passed, and the offset
|
||||
|
||||
:rtype: int
|
||||
"""
|
||||
return self._translate_time(
|
||||
self._time[:-2] + six.text_type(int(self._time[-2:]) + self._frames),
|
||||
self.offset
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _translate_time(stamp, offset):
|
||||
"""
|
||||
        :param stamp: An SCC timestamp string, e.g. '01:02:33;12'
            (';' marks drop-frame, ':' non-drop-frame timecode)
|
||||
:type offset: int
|
||||
:param offset: Subtract this many microseconds from the calculated time
|
||||
Helpful for when the captions are off by some time interval.
|
||||
:rtype: int
|
||||
"""
|
||||
if ';' in stamp:
|
||||
# Drop-frame timebase runs at the same rate as wall clock
|
||||
seconds_per_timestamp_second = 1.0
|
||||
else:
|
||||
# Non-drop-frame timebase runs "slow"
|
||||
# 1 second of timecode is longer than an actual second (1.001s)
|
||||
seconds_per_timestamp_second = 1001.0 / 1000.0
|
||||
|
||||
time_split = stamp.replace(';', ':').split(':')
|
||||
|
||||
timestamp_seconds = (int(time_split[0]) * 3600 +
|
||||
int(time_split[1]) * 60 +
|
||||
int(time_split[2]) +
|
||||
int(time_split[3]) / 30.0)
|
||||
|
||||
seconds = timestamp_seconds * seconds_per_timestamp_second
|
||||
microseconds = seconds * 1000 * 1000 - offset
|
||||
|
||||
if microseconds < 0:
|
||||
microseconds = 0
|
||||
|
||||
return microseconds
|
||||
|
||||
def start_at(self, timespec):
|
||||
"""Reset the counter to the given time
|
||||
|
||||
:type timespec: unicode
|
||||
"""
|
||||
self._time = timespec
|
||||
self._frames = 0
|
||||
|
||||
def increment_frames(self):
|
||||
"""After a command was processed, we'd increment the number of frames
|
||||
"""
|
||||
self._frames += 1
|
||||
|
||||
|
||||
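# --- Editor's illustrative sketch (not part of the upstream module). Shows how
# _SccTimeTranslator combines the last timestamp seen with the number of frames
# processed since then; the sample timestamp is arbitrary.
def _example_time_translator():
    translator = _SccTimeTranslator()
    translator.start_at('00:00:01;15')  # drop-frame timestamp from the SCC file
    translator.increment_frames()
    translator.increment_frames()
    # 1 second plus 17 frames at 30 fps -> roughly 1,566,667 microseconds
    return translator.get_time()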
def _is_pac_command(word):
|
||||
"""Checks whether the given word is a Preamble Address Code [PAC] command
|
||||
|
||||
:type word: unicode
|
||||
:param word: 4 letter unicode command
|
||||
|
||||
:rtype: bool
|
||||
"""
|
||||
if not word or len(word) != 4:
|
||||
return False
|
||||
|
||||
byte1, byte2 = word[:2], word[2:]
|
||||
|
||||
try:
|
||||
PAC_BYTES_TO_POSITIONING_MAP[byte1][byte2]
|
||||
except KeyError:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
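# --- Editor's illustrative sketch (not part of the upstream module). Shows what
# _is_pac_command accepts and rejects, using codes taken from the constants
# module below.
def _example_is_pac_command():
    assert _is_pac_command('91d6') is True   # PAC addressing row 1, column 12
    assert _is_pac_command('9420') is False  # a control command, not a PAC
    assert _is_pac_command('91') is False    # too short to be a PAC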
BIN
utils/modules/pycaption/scc/__pycache__/__init__.cpython-36.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/__init__.cpython-38.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/__init__.cpython-39.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/constants.cpython-36.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/constants.cpython-37.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/constants.cpython-38.pyc
Normal file
Binary file not shown.
BIN
utils/modules/pycaption/scc/__pycache__/constants.cpython-39.pyc
Normal file
Binary file not shown.
984
utils/modules/pycaption/scc/constants.py
Normal file
@@ -0,0 +1,984 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from itertools import product
|
||||
from future.utils import viewitems
|
||||
|
||||
COMMANDS = {
|
||||
'9420': '',
|
||||
'9429': '',
|
||||
'9425': '',
|
||||
'9426': '',
|
||||
'94a7': '',
|
||||
'942a': '',
|
||||
'94ab': '',
|
||||
'942c': '',
|
||||
'94ae': '',
|
||||
'942f': '',
|
||||
'9779': '<$>{break}<$>',
|
||||
'9775': '<$>{break}<$>',
|
||||
'9776': '<$>{break}<$>',
|
||||
'9770': '<$>{break}<$>',
|
||||
'9773': '<$>{break}<$>',
|
||||
'10c8': '<$>{break}<$>',
|
||||
'10c2': '<$>{break}<$>',
|
||||
'166e': '<$>{break}<$>{italic}<$>',
|
||||
'166d': '<$>{break}<$>',
|
||||
'166b': '<$>{break}<$>',
|
||||
'10c4': '<$>{break}<$>',
|
||||
'9473': '<$>{break}<$>',
|
||||
'977f': '<$>{break}<$>',
|
||||
'977a': '<$>{break}<$>',
|
||||
'1668': '<$>{break}<$>',
|
||||
'1667': '<$>{break}<$>',
|
||||
'1664': '<$>{break}<$>',
|
||||
'1661': '<$>{break}<$>',
|
||||
'10ce': '<$>{break}<$>{italic}<$>',
|
||||
'94c8': '<$>{break}<$>',
|
||||
'94c7': '<$>{break}<$>',
|
||||
'94c4': '<$>{break}<$>',
|
||||
'94c2': '<$>{break}<$>',
|
||||
'94c1': '<$>{break}<$>',
|
||||
'915e': '<$>{break}<$>',
|
||||
'915d': '<$>{break}<$>',
|
||||
'915b': '<$>{break}<$>',
|
||||
'925d': '<$>{break}<$>',
|
||||
'925e': '<$>{break}<$>',
|
||||
'925b': '<$>{break}<$>',
|
||||
'97e6': '<$>{break}<$>',
|
||||
'97e5': '<$>{break}<$>',
|
||||
'97e3': '<$>{break}<$>',
|
||||
'97e0': '<$>{break}<$>',
|
||||
'97e9': '<$>{break}<$>',
|
||||
'9154': '<$>{break}<$>',
|
||||
'9157': '<$>{break}<$>',
|
||||
'9151': '<$>{break}<$>',
|
||||
'9258': '<$>{break}<$>',
|
||||
'9152': '<$>{break}<$>',
|
||||
'9257': '<$>{break}<$>',
|
||||
'9254': '<$>{break}<$>',
|
||||
'9252': '<$>{break}<$>',
|
||||
'9158': '<$>{break}<$>',
|
||||
'9251': '<$>{break}<$>',
|
||||
'94cd': '<$>{break}<$>',
|
||||
'94ce': '<$>{break}<$>{italic}<$>',
|
||||
'94cb': '<$>{break}<$>',
|
||||
'97ef': '<$>{break}<$>{italic}<$>',
|
||||
'1373': '<$>{break}<$>',
|
||||
'97ec': '<$>{break}<$>',
|
||||
'97ea': '<$>{break}<$>',
|
||||
'15c7': '<$>{break}<$>',
|
||||
'974f': '<$>{break}<$>{italic}<$>',
|
||||
'10c1': '<$>{break}<$>',
|
||||
'974a': '<$>{break}<$>',
|
||||
'974c': '<$>{break}<$>',
|
||||
'10c7': '<$>{break}<$>',
|
||||
'976d': '<$>{break}<$>',
|
||||
'15d6': '<$>{break}<$>',
|
||||
'15d5': '<$>{break}<$>',
|
||||
'15d3': '<$>{break}<$>',
|
||||
'15d0': '<$>{break}<$>',
|
||||
'15d9': '<$>{break}<$>',
|
||||
'9745': '<$>{break}<$>',
|
||||
'9746': '<$>{break}<$>',
|
||||
'9740': '<$>{break}<$>',
|
||||
'9743': '<$>{break}<$>',
|
||||
'9749': '<$>{break}<$>',
|
||||
'15df': '<$>{break}<$>',
|
||||
'15dc': '<$>{break}<$>',
|
||||
'15da': '<$>{break}<$>',
|
||||
'15f8': '<$>{break}<$>',
|
||||
'94fe': '<$>{break}<$>',
|
||||
'94fd': '<$>{break}<$>',
|
||||
'94fc': '<$>{break}<$>',
|
||||
'94fb': '<$>{break}<$>',
|
||||
'944f': '<$>{break}<$>{italic}<$>',
|
||||
'944c': '<$>{break}<$>',
|
||||
'944a': '<$>{break}<$>',
|
||||
'92fc': '<$>{break}<$>',
|
||||
'1051': '<$>{break}<$>',
|
||||
'1052': '<$>{break}<$>',
|
||||
'1054': '<$>{break}<$>',
|
||||
'92fe': '<$>{break}<$>',
|
||||
'92fd': '<$>{break}<$>',
|
||||
'1058': '<$>{break}<$>',
|
||||
'157a': '<$>{break}<$>',
|
||||
'157f': '<$>{break}<$>',
|
||||
'9279': '<$>{break}<$>',
|
||||
'94f4': '<$>{break}<$>',
|
||||
'94f7': '<$>{break}<$>',
|
||||
'94f1': '<$>{break}<$>',
|
||||
'9449': '<$>{break}<$>',
|
||||
'92fb': '<$>{break}<$>',
|
||||
'9446': '<$>{break}<$>',
|
||||
'9445': '<$>{break}<$>',
|
||||
'9443': '<$>{break}<$>',
|
||||
'94f8': '<$>{break}<$>',
|
||||
'9440': '<$>{break}<$>',
|
||||
'1057': '<$>{break}<$>',
|
||||
'9245': '<$>{break}<$>',
|
||||
'92f2': '<$>{break}<$>',
|
||||
'1579': '<$>{break}<$>',
|
||||
'92f7': '<$>{break}<$>',
|
||||
'105e': '<$>{break}<$>',
|
||||
'92f4': '<$>{break}<$>',
|
||||
'1573': '<$>{break}<$>',
|
||||
'1570': '<$>{break}<$>',
|
||||
'1576': '<$>{break}<$>',
|
||||
'1575': '<$>{break}<$>',
|
||||
'16c1': '<$>{break}<$>',
|
||||
'16c2': '<$>{break}<$>',
|
||||
'9168': '<$>{break}<$>',
|
||||
'16c7': '<$>{break}<$>',
|
||||
'9164': '<$>{break}<$>',
|
||||
'9167': '<$>{break}<$>',
|
||||
'9161': '<$>{break}<$>',
|
||||
'9162': '<$>{break}<$>',
|
||||
'947f': '<$>{break}<$>',
|
||||
'91c2': '<$>{break}<$>',
|
||||
'91c1': '<$>{break}<$>',
|
||||
'91c7': '<$>{break}<$>',
|
||||
'91c4': '<$>{break}<$>',
|
||||
'13e3': '<$>{break}<$>',
|
||||
'91c8': '<$>{break}<$>',
|
||||
'91d0': '<$>{break}<$>',
|
||||
'13e5': '<$>{break}<$>',
|
||||
'13c8': '<$>{break}<$>',
|
||||
'16cb': '<$>{break}<$>',
|
||||
'16cd': '<$>{break}<$>',
|
||||
'16ce': '<$>{break}<$>{italic}<$>',
|
||||
'916d': '<$>{break}<$>',
|
||||
'916e': '<$>{break}<$>{italic}<$>',
|
||||
'916b': '<$>{break}<$>',
|
||||
'91d5': '<$>{break}<$>',
|
||||
'137a': '<$>{break}<$>',
|
||||
'91cb': '<$>{break}<$>',
|
||||
'91ce': '<$>{break}<$>{italic}<$>',
|
||||
'91cd': '<$>{break}<$>',
|
||||
'13ec': '<$>{break}<$>',
|
||||
'13c1': '<$>{break}<$>',
|
||||
'13ea': '<$>{break}<$>',
|
||||
'13ef': '<$>{break}<$>{italic}<$>',
|
||||
'94f2': '<$>{break}<$>',
|
||||
'97fb': '<$>{break}<$>',
|
||||
'97fc': '<$>{break}<$>',
|
||||
'1658': '<$>{break}<$>',
|
||||
'97fd': '<$>{break}<$>',
|
||||
'97fe': '<$>{break}<$>',
|
||||
'1652': '<$>{break}<$>',
|
||||
'1651': '<$>{break}<$>',
|
||||
'1657': '<$>{break}<$>',
|
||||
'1654': '<$>{break}<$>',
|
||||
'10cb': '<$>{break}<$>',
|
||||
'97f2': '<$>{break}<$>',
|
||||
'97f1': '<$>{break}<$>',
|
||||
'97f7': '<$>{break}<$>',
|
||||
'97f4': '<$>{break}<$>',
|
||||
'165b': '<$>{break}<$>',
|
||||
'97f8': '<$>{break}<$>',
|
||||
'165d': '<$>{break}<$>',
|
||||
'165e': '<$>{break}<$>',
|
||||
'15cd': '<$>{break}<$>',
|
||||
'10cd': '<$>{break}<$>',
|
||||
'9767': '<$>{break}<$>',
|
||||
'9249': '<$>{break}<$>',
|
||||
'1349': '<$>{break}<$>',
|
||||
'91d9': '<$>{break}<$>',
|
||||
'1340': '<$>{break}<$>',
|
||||
'91d3': '<$>{break}<$>',
|
||||
'9243': '<$>{break}<$>',
|
||||
'1343': '<$>{break}<$>',
|
||||
'91d6': '<$>{break}<$>',
|
||||
'1345': '<$>{break}<$>',
|
||||
'1346': '<$>{break}<$>',
|
||||
'9246': '<$>{break}<$>',
|
||||
'94e9': '<$>{break}<$>',
|
||||
'94e5': '<$>{break}<$>',
|
||||
'94e6': '<$>{break}<$>',
|
||||
'94e0': '<$>{break}<$>',
|
||||
'94e3': '<$>{break}<$>',
|
||||
'15ea': '<$>{break}<$>',
|
||||
'15ec': '<$>{break}<$>',
|
||||
'15ef': '<$>{break}<$>{italic}<$>',
|
||||
'16fe': '<$>{break}<$>',
|
||||
'16fd': '<$>{break}<$>',
|
||||
'16fc': '<$>{break}<$>',
|
||||
'16fb': '<$>{break}<$>',
|
||||
'1367': '<$>{break}<$>',
|
||||
'94ef': '<$>{break}<$>{italic}<$>',
|
||||
'94ea': '<$>{break}<$>',
|
||||
'94ec': '<$>{break}<$>',
|
||||
'924a': '<$>{break}<$>',
|
||||
'91dc': '<$>{break}<$>',
|
||||
'924c': '<$>{break}<$>',
|
||||
'91da': '<$>{break}<$>',
|
||||
'91df': '<$>{break}<$>',
|
||||
'134f': '<$>{break}<$>{italic}<$>',
|
||||
'924f': '<$>{break}<$>{italic}<$>',
|
||||
'16f8': '<$>{break}<$>',
|
||||
'16f7': '<$>{break}<$>',
|
||||
'16f4': '<$>{break}<$>',
|
||||
'16f2': '<$>{break}<$>',
|
||||
'16f1': '<$>{break}<$>',
|
||||
'15e0': '<$>{break}<$>',
|
||||
'15e3': '<$>{break}<$>',
|
||||
'15e5': '<$>{break}<$>',
|
||||
'15e6': '<$>{break}<$>',
|
||||
'15e9': '<$>{break}<$>',
|
||||
'9757': '<$>{break}<$>',
|
||||
'9754': '<$>{break}<$>',
|
||||
'9752': '<$>{break}<$>',
|
||||
'9751': '<$>{break}<$>',
|
||||
'9758': '<$>{break}<$>',
|
||||
'92f1': '<$>{break}<$>',
|
||||
'104c': '<$>{break}<$>',
|
||||
'104a': '<$>{break}<$>',
|
||||
'104f': '<$>{break}<$>{italic}<$>',
|
||||
'105d': '<$>{break}<$>',
|
||||
'92f8': '<$>{break}<$>',
|
||||
'975e': '<$>{break}<$>',
|
||||
'975d': '<$>{break}<$>',
|
||||
'975b': '<$>{break}<$>',
|
||||
'1043': '<$>{break}<$>',
|
||||
'1040': '<$>{break}<$>',
|
||||
'1046': '<$>{break}<$>',
|
||||
'1045': '<$>{break}<$>',
|
||||
'1049': '<$>{break}<$>',
|
||||
'9479': '<$>{break}<$>',
|
||||
'917f': '<$>{break}<$>',
|
||||
'9470': '<$>{break}<$>',
|
||||
'9476': '<$>{break}<$>',
|
||||
'917a': '<$>{break}<$>',
|
||||
'9475': '<$>{break}<$>',
|
||||
'927a': '<$>{break}<$>',
|
||||
'927f': '<$>{break}<$>',
|
||||
'134a': '<$>{break}<$>',
|
||||
'15fb': '<$>{break}<$>',
|
||||
'15fc': '<$>{break}<$>',
|
||||
'15fd': '<$>{break}<$>',
|
||||
'15fe': '<$>{break}<$>',
|
||||
'1546': '<$>{break}<$>',
|
||||
'1545': '<$>{break}<$>',
|
||||
'1543': '<$>{break}<$>',
|
||||
'1540': '<$>{break}<$>',
|
||||
'1549': '<$>{break}<$>',
|
||||
'13fd': '<$>{break}<$>',
|
||||
'13fe': '<$>{break}<$>',
|
||||
'13fb': '<$>{break}<$>',
|
||||
'13fc': '<$>{break}<$>',
|
||||
'92e9': '<$>{break}<$>',
|
||||
'92e6': '<$>{break}<$>',
|
||||
'9458': '<$>{break}<$>',
|
||||
'92e5': '<$>{break}<$>',
|
||||
'92e3': '<$>{break}<$>',
|
||||
'92e0': '<$>{break}<$>',
|
||||
'9270': '<$>{break}<$>',
|
||||
'9273': '<$>{break}<$>',
|
||||
'9275': '<$>{break}<$>',
|
||||
'9276': '<$>{break}<$>',
|
||||
'15f1': '<$>{break}<$>',
|
||||
'15f2': '<$>{break}<$>',
|
||||
'15f4': '<$>{break}<$>',
|
||||
'15f7': '<$>{break}<$>',
|
||||
'9179': '<$>{break}<$>',
|
||||
'9176': '<$>{break}<$>',
|
||||
'9175': '<$>{break}<$>',
|
||||
'947a': '<$>{break}<$>',
|
||||
'9173': '<$>{break}<$>',
|
||||
'9170': '<$>{break}<$>',
|
||||
'13f7': '<$>{break}<$>',
|
||||
'13f4': '<$>{break}<$>',
|
||||
'13f2': '<$>{break}<$>',
|
||||
'13f1': '<$>{break}<$>',
|
||||
'92ef': '<$>{break}<$>{italic}<$>',
|
||||
'92ec': '<$>{break}<$>',
|
||||
'13f8': '<$>{break}<$>',
|
||||
'92ea': '<$>{break}<$>',
|
||||
'154f': '<$>{break}<$>{italic}<$>',
|
||||
'154c': '<$>{break}<$>',
|
||||
'154a': '<$>{break}<$>',
|
||||
'16c4': '<$>{break}<$>',
|
||||
'16c8': '<$>{break}<$>',
|
||||
'97c8': '<$>{break}<$>',
|
||||
'164f': '<$>{break}<$>{italic}<$>',
|
||||
'164a': '<$>{break}<$>',
|
||||
'164c': '<$>{break}<$>',
|
||||
'1645': '<$>{break}<$>',
|
||||
'1646': '<$>{break}<$>',
|
||||
'1640': '<$>{break}<$>',
|
||||
'1643': '<$>{break}<$>',
|
||||
'1649': '<$>{break}<$>',
|
||||
'94df': '<$>{break}<$>',
|
||||
'94dc': '<$>{break}<$>',
|
||||
'94da': '<$>{break}<$>',
|
||||
'135b': '<$>{break}<$>',
|
||||
'135e': '<$>{break}<$>',
|
||||
'135d': '<$>{break}<$>',
|
||||
'1370': '<$>{break}<$>',
|
||||
'9240': '<$>{break}<$>',
|
||||
'13e9': '<$>{break}<$>',
|
||||
'1375': '<$>{break}<$>',
|
||||
'1679': '<$>{break}<$>',
|
||||
'1358': '<$>{break}<$>',
|
||||
'1352': '<$>{break}<$>',
|
||||
'1351': '<$>{break}<$>',
|
||||
'1376': '<$>{break}<$>',
|
||||
'1357': '<$>{break}<$>',
|
||||
'1354': '<$>{break}<$>',
|
||||
'1379': '<$>{break}<$>',
|
||||
'94d9': '<$>{break}<$>',
|
||||
'94d6': '<$>{break}<$>',
|
||||
'94d5': '<$>{break}<$>',
|
||||
'15462': '<$>{break}<$>',
|
||||
'94d3': '<$>{break}<$>',
|
||||
'94d0': '<$>{break}<$>',
|
||||
'13e0': '<$>{break}<$>',
|
||||
'13e6': '<$>{break}<$>',
|
||||
'976b': '<$>{break}<$>',
|
||||
'15c4': '<$>{break}<$>',
|
||||
'15c2': '<$>{break}<$>',
|
||||
'15c1': '<$>{break}<$>',
|
||||
'976e': '<$>{break}<$>{italic}<$>',
|
||||
'134c': '<$>{break}<$>',
|
||||
'15c8': '<$>{break}<$>',
|
||||
'92c8': '<$>{break}<$>',
|
||||
'16e9': '<$>{break}<$>',
|
||||
'16e3': '<$>{break}<$>',
|
||||
'16e0': '<$>{break}<$>',
|
||||
'16e6': '<$>{break}<$>',
|
||||
'16e5': '<$>{break}<$>',
|
||||
'91e5': '<$>{break}<$>',
|
||||
'91e6': '<$>{break}<$>',
|
||||
'91e0': '<$>{break}<$>',
|
||||
'91e3': '<$>{break}<$>',
|
||||
'13c4': '<$>{break}<$>',
|
||||
'13c7': '<$>{break}<$>',
|
||||
'91e9': '<$>{break}<$>',
|
||||
'13c2': '<$>{break}<$>',
|
||||
'9762': '<$>{break}<$>',
|
||||
'15ce': '<$>{break}<$>{italic}<$>',
|
||||
'9761': '<$>{break}<$>',
|
||||
'15cb': '<$>{break}<$>',
|
||||
'9764': '<$>{break}<$>',
|
||||
'9768': '<$>{break}<$>',
|
||||
'91ef': '<$>{break}<$>{italic}<$>',
|
||||
'91ea': '<$>{break}<$>',
|
||||
'91ec': '<$>{break}<$>',
|
||||
'13ce': '<$>{break}<$>{italic}<$>',
|
||||
'13cd': '<$>{break}<$>',
|
||||
'97da': '<$>{break}<$>',
|
||||
'13cb': '<$>{break}<$>',
|
||||
'13462': '<$>{break}<$>',
|
||||
'16ec': '<$>{break}<$>',
|
||||
'16ea': '<$>{break}<$>',
|
||||
'16ef': '<$>{break}<$>{italic}<$>',
|
||||
'97c1': '<$>{break}<$>',
|
||||
'97c2': '<$>{break}<$>',
|
||||
'97c4': '<$>{break}<$>',
|
||||
'97c7': '<$>{break}<$>',
|
||||
'92cd': '<$>{break}<$>',
|
||||
'92ce': '<$>{break}<$>{italic}<$>',
|
||||
'92cb': '<$>{break}<$>',
|
||||
'92da': '<$>{break}<$>',
|
||||
'92dc': '<$>{break}<$>',
|
||||
'92df': '<$>{break}<$>',
|
||||
'97df': '<$>{break}<$>',
|
||||
'155b': '<$>{break}<$>',
|
||||
'155e': '<$>{break}<$>',
|
||||
'155d': '<$>{break}<$>',
|
||||
'97dc': '<$>{break}<$>',
|
||||
'1675': '<$>{break}<$>',
|
||||
'1676': '<$>{break}<$>',
|
||||
'1670': '<$>{break}<$>',
|
||||
'1673': '<$>{break}<$>',
|
||||
'16462': '<$>{break}<$>',
|
||||
'97cb': '<$>{break}<$>',
|
||||
'97ce': '<$>{break}<$>{italic}<$>',
|
||||
'97cd': '<$>{break}<$>',
|
||||
'92c4': '<$>{break}<$>',
|
||||
'92c7': '<$>{break}<$>',
|
||||
'92c1': '<$>{break}<$>',
|
||||
'92c2': '<$>{break}<$>',
|
||||
'1551': '<$>{break}<$>',
|
||||
'97d5': '<$>{break}<$>',
|
||||
'97d6': '<$>{break}<$>',
|
||||
'1552': '<$>{break}<$>',
|
||||
'97d0': '<$>{break}<$>',
|
||||
'1554': '<$>{break}<$>',
|
||||
'1557': '<$>{break}<$>',
|
||||
'97d3': '<$>{break}<$>',
|
||||
'1558': '<$>{break}<$>',
|
||||
'167f': '<$>{break}<$>',
|
||||
'137f': '<$>{break}<$>',
|
||||
'167a': '<$>{break}<$>',
|
||||
'92d9': '<$>{break}<$>',
|
||||
'92d0': '<$>{break}<$>',
|
||||
'92d3': '<$>{break}<$>',
|
||||
'92d5': '<$>{break}<$>',
|
||||
'92d6': '<$>{break}<$>',
|
||||
'10dc': '<$>{break}<$>',
|
||||
'9262': '<$>{break}<$>',
|
||||
'9261': '<$>{break}<$>',
|
||||
'91f8': '<$>{break}<$>',
|
||||
'10df': '<$>{break}<$>',
|
||||
'9264': '<$>{break}<$>',
|
||||
'91f4': '<$>{break}<$>',
|
||||
'91f7': '<$>{break}<$>',
|
||||
'91f1': '<$>{break}<$>',
|
||||
'91f2': '<$>{break}<$>',
|
||||
'97d9': '<$>{break}<$>',
|
||||
'9149': '<$>{break}<$>',
|
||||
'9143': '<$>{break}<$>',
|
||||
'9140': '<$>{break}<$>',
|
||||
'9146': '<$>{break}<$>',
|
||||
'9145': '<$>{break}<$>',
|
||||
'9464': '<$>{break}<$>',
|
||||
'9467': '<$>{break}<$>',
|
||||
'9461': '<$>{break}<$>',
|
||||
'9462': '<$>{break}<$>',
|
||||
'9468': '<$>{break}<$>',
|
||||
'914c': '<$>{break}<$>',
|
||||
'914a': '<$>{break}<$>',
|
||||
'914f': '<$>{break}<$>{italic}<$>',
|
||||
'10d3': '<$>{break}<$>',
|
||||
'926b': '<$>{break}<$>',
|
||||
'10d0': '<$>{break}<$>',
|
||||
'10d6': '<$>{break}<$>',
|
||||
'926e': '<$>{break}<$>{italic}<$>',
|
||||
'926d': '<$>{break}<$>',
|
||||
'91fd': '<$>{break}<$>',
|
||||
'91fe': '<$>{break}<$>',
|
||||
'10d9': '<$>{break}<$>',
|
||||
'91fb': '<$>{break}<$>',
|
||||
'91fc': '<$>{break}<$>',
|
||||
'946e': '<$>{break}<$>{italic}<$>',
|
||||
'946d': '<$>{break}<$>',
|
||||
'946b': '<$>{break}<$>',
|
||||
'10da': '<$>{break}<$>',
|
||||
'10d5': '<$>{break}<$>',
|
||||
'9267': '<$>{break}<$>',
|
||||
'9268': '<$>{break}<$>',
|
||||
'16df': '<$>{break}<$>',
|
||||
'16da': '<$>{break}<$>',
|
||||
'16dc': '<$>{break}<$>',
|
||||
'9454': '<$>{break}<$>',
|
||||
'9457': '<$>{break}<$>',
|
||||
'9451': '<$>{break}<$>',
|
||||
'9452': '<$>{break}<$>',
|
||||
'136d': '<$>{break}<$>',
|
||||
'136e': '<$>{break}<$>{italic}<$>',
|
||||
'136b': '<$>{break}<$>',
|
||||
'13d9': '<$>{break}<$>',
|
||||
'13da': '<$>{break}<$>',
|
||||
'13dc': '<$>{break}<$>',
|
||||
'13df': '<$>{break}<$>',
|
||||
'1568': '<$>{break}<$>',
|
||||
'1561': '<$>{break}<$>',
|
||||
'1564': '<$>{break}<$>',
|
||||
'1567': '<$>{break}<$>',
|
||||
'16d5': '<$>{break}<$>',
|
||||
'16d6': '<$>{break}<$>',
|
||||
'16d0': '<$>{break}<$>',
|
||||
'16d3': '<$>{break}<$>',
|
||||
'945d': '<$>{break}<$>',
|
||||
'945e': '<$>{break}<$>',
|
||||
'16d9': '<$>{break}<$>',
|
||||
'945b': '<$>{break}<$>',
|
||||
'156b': '<$>{break}<$>',
|
||||
'156d': '<$>{break}<$>',
|
||||
'156e': '<$>{break}<$>{italic}<$>',
|
||||
'105b': '<$>{break}<$>',
|
||||
'1364': '<$>{break}<$>',
|
||||
'1368': '<$>{break}<$>',
|
||||
'1361': '<$>{break}<$>',
|
||||
'13d0': '<$>{break}<$>',
|
||||
'13d3': '<$>{break}<$>',
|
||||
'13d5': '<$>{break}<$>',
|
||||
'13d6': '<$>{break}<$>',
|
||||
'97a1': '',
|
||||
'97a2': '',
|
||||
'9723': '',
|
||||
'94a1': '',
|
||||
'94a4': '',
|
||||
'94ad': '',
|
||||
'1020': '',
|
||||
'10a1': '',
|
||||
'10a2': '',
|
||||
'1023': '',
|
||||
'10a4': '',
|
||||
'1025': '',
|
||||
'1026': '',
|
||||
'10a7': '',
|
||||
'10a8': '',
|
||||
'1029': '',
|
||||
'102a': '',
|
||||
'10ab': '',
|
||||
'102c': '',
|
||||
'10ad': '',
|
||||
'10ae': '',
|
||||
'102f': '',
|
||||
'97ad': '',
|
||||
'97a4': '',
|
||||
'9725': '',
|
||||
'9726': '',
|
||||
'97a7': '',
|
||||
'97a8': '',
|
||||
'9729': '',
|
||||
'972a': '',
|
||||
'9120': '<$>{end-italic}<$>',
|
||||
'91a1': '',
|
||||
'91a2': '',
|
||||
'9123': '',
|
||||
'91a4': '',
|
||||
'9125': '',
|
||||
'9126': '',
|
||||
'91a7': '',
|
||||
'91a8': '',
|
||||
'9129': '',
|
||||
'912a': '',
|
||||
'91ab': '',
|
||||
'912c': '',
|
||||
'91ad': '',
|
||||
'97ae': '',
|
||||
'972f': '',
|
||||
'91ae': '<$>{italic}<$>',
|
||||
'912f': '<$>{italic}<$>',
|
||||
'94a8': '',
|
||||
'9423': '',
|
||||
'94a2': '',
|
||||
}
|
||||
|
||||
|
||||
CHARACTERS = {
|
||||
'20': ' ',
|
||||
'a1': '!',
|
||||
'a2': '"',
|
||||
'23': '#',
|
||||
'a4': '$',
|
||||
'25': '%',
|
||||
'26': '&',
|
||||
'a7': '\'',
|
||||
'a8': '(',
|
||||
'29': ')',
|
||||
'2a': 'á',
|
||||
'ab': '+',
|
||||
'2c': ',',
|
||||
'ad': '-',
|
||||
'ae': '.',
|
||||
'2f': '/',
|
||||
'b0': '0',
|
||||
'31': '1',
|
||||
'32': '2',
|
||||
'b3': '3',
|
||||
'34': '4',
|
||||
'b5': '5',
|
||||
'b6': '6',
|
||||
'37': '7',
|
||||
'38': '8',
|
||||
'b9': '9',
|
||||
'ba': ':',
|
||||
'3b': ';',
|
||||
'bc': '<',
|
||||
'3d': '=',
|
||||
'3e': '>',
|
||||
'bf': '?',
|
||||
'40': '@',
|
||||
'c1': 'A',
|
||||
'c2': 'B',
|
||||
'43': 'C',
|
||||
'c4': 'D',
|
||||
'45': 'E',
|
||||
'46': 'F',
|
||||
'c7': 'G',
|
||||
'c8': 'H',
|
||||
'49': 'I',
|
||||
'4a': 'J',
|
||||
'cb': 'K',
|
||||
'4c': 'L',
|
||||
'cd': 'M',
|
||||
'ce': 'N',
|
||||
'4f': 'O',
|
||||
'd0': 'P',
|
||||
'51': 'Q',
|
||||
'52': 'R',
|
||||
'd3': 'S',
|
||||
'54': 'T',
|
||||
'd5': 'U',
|
||||
'd6': 'V',
|
||||
'57': 'W',
|
||||
'58': 'X',
|
||||
'd9': 'Y',
|
||||
'da': 'Z',
|
||||
'5b': '[',
|
||||
'dc': 'é',
|
||||
'5d': ']',
|
||||
'5e': 'í',
|
||||
'df': 'ó',
|
||||
'e0': 'ú',
|
||||
'61': 'a',
|
||||
'62': 'b',
|
||||
'e3': 'c',
|
||||
'64': 'd',
|
||||
'e5': 'e',
|
||||
'e6': 'f',
|
||||
'67': 'g',
|
||||
'68': 'h',
|
||||
'e9': 'i',
|
||||
'ea': 'j',
|
||||
'6b': 'k',
|
||||
'ec': 'l',
|
||||
'6d': 'm',
|
||||
'6e': 'n',
|
||||
'ef': 'o',
|
||||
'70': 'p',
|
||||
'f1': 'q',
|
||||
'f2': 'r',
|
||||
'73': 's',
|
||||
'f4': 't',
|
||||
'75': 'u',
|
||||
'76': 'v',
|
||||
'f7': 'w',
|
||||
'f8': 'x',
|
||||
'79': 'y',
|
||||
'7a': 'z',
|
||||
'fb': 'ç',
|
||||
'7c': '÷',
|
||||
'fd': 'Ñ',
|
||||
'fe': 'ñ',
|
||||
'7f': '',
|
||||
'80': ''
|
||||
}
|
||||
|
||||
|
||||
SPECIAL_CHARS = {
|
||||
'91b0': '®',
|
||||
'9131': '°',
|
||||
'9132': '½',
|
||||
'91b3': '¿',
|
||||
'91b4': '™',
|
||||
'91b5': '¢',
|
||||
'91b6': '£',
|
||||
'9137': '♪',
|
||||
'9138': 'à',
|
||||
'91b9': ' ',
|
||||
'91ba': 'è',
|
||||
'913b': 'â',
|
||||
'91bc': 'ê',
|
||||
'913d': 'î',
|
||||
'913e': 'ô',
|
||||
'91bf': 'û'
|
||||
}
|
||||
|
||||
|
||||
EXTENDED_CHARS = {
|
||||
'9220': 'Á',
|
||||
'92a1': 'É',
|
||||
'92a2': 'Ó',
|
||||
'9223': 'Ú',
|
||||
'92a4': 'Ü',
|
||||
'9225': 'ü',
|
||||
'9226': '‘',
|
||||
'92a7': '¡',
|
||||
'92a8': '*',
|
||||
'9229': '’',
|
||||
'922a': '—',
|
||||
'92ab': '©',
|
||||
'922c': '℠',
|
||||
'92ad': '•',
|
||||
'92ae': '“',
|
||||
'922f': '”',
|
||||
'92b0': 'À',
|
||||
'9231': 'Â',
|
||||
'9232': 'Ç',
|
||||
'92b3': 'È',
|
||||
'9234': 'Ê',
|
||||
'92b5': 'Ë',
|
||||
'92b6': 'ë',
|
||||
'9237': 'Î',
|
||||
'9238': 'Ï',
|
||||
'92b9': 'ï',
|
||||
'92ba': 'Ô',
|
||||
'923b': 'Ù',
|
||||
'92bc': 'ù',
|
||||
'923d': 'Û',
|
||||
'923e': '«',
|
||||
'92bf': '»',
|
||||
'1320': 'Ã',
|
||||
'13a1': 'ã',
|
||||
'13a2': 'Í',
|
||||
'1323': 'Ì',
|
||||
'13a4': 'ì',
|
||||
'1325': 'Ò',
|
||||
'1326': 'ò',
|
||||
'13a7': 'Õ',
|
||||
'13a8': 'õ',
|
||||
'1329': '{',
|
||||
'132a': '}',
|
||||
'13ab': '\\',
|
||||
'132c': '^',
|
||||
'13ad': '_',
|
||||
'13ae': '¦',
|
||||
'132f': '~',
|
||||
'13b0': 'Ä',
|
||||
'1331': 'ä',
|
||||
'1332': 'Ö',
|
||||
'13b3': 'ö',
|
||||
'1334': 'ß',
|
||||
'13b5': '¥',
|
||||
'13b6': '¤',
|
||||
'1337': '|',
|
||||
'1338': 'Å',
|
||||
'13b9': 'å',
|
||||
'13ba': 'Ø',
|
||||
'133b': 'ø',
|
||||
'13bc': '┌',
|
||||
'133d': '┐',
|
||||
'133e': '└',
|
||||
'13bf': '┘',
|
||||
}
|
||||
|
||||
|
||||
# Cursor positioning codes
|
||||
PAC_HIGH_BYTE_BY_ROW = [
|
||||
'xx',
|
||||
'91',
|
||||
'91',
|
||||
'92',
|
||||
'92',
|
||||
'15',
|
||||
'15',
|
||||
'16',
|
||||
'16',
|
||||
'97',
|
||||
'97',
|
||||
'10',
|
||||
'13',
|
||||
'13',
|
||||
'94',
|
||||
'94'
|
||||
]
|
||||
PAC_LOW_BYTE_BY_ROW_RESTRICTED = [
|
||||
'xx',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'd0',
|
||||
'70',
|
||||
'd0',
|
||||
'70'
|
||||
]
|
||||
|
||||
# High order bytes come first, then each key contains a list of low bytes.
|
||||
# Any of the values in that list, coupled with the high order byte will
|
||||
# map to the (row, column) tuple.
|
||||
# This particular dictionary will get transformed to a more suitable form for
|
||||
# usage like PAC_BYTES_TO_POSITIONING_MAP[u'91'][u'd6'] = (1, 12)
|
||||
PAC_BYTES_TO_POSITIONING_MAP = {
|
||||
'91': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (1, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (2, 0), # noqa
|
||||
('52', 'd3'): (1, 4),
|
||||
('54', 'd5'): (1, 8),
|
||||
('d6', '57'): (1, 12),
|
||||
('58', 'd9'): (1, 16),
|
||||
('da', '5b'): (1, 20),
|
||||
('dc', '5d'): (1, 24),
|
||||
('5e', 'df'): (1, 28),
|
||||
|
||||
('f2', '73'): (2, 4),
|
||||
('f4', '75'): (2, 8),
|
||||
('76', 'f7'): (2, 12),
|
||||
('f8', '79'): (2, 16),
|
||||
('7a', 'fb'): (2, 20),
|
||||
('7c', 'fd'): (2, 24),
|
||||
('fe', '7f'): (2, 28)
|
||||
},
|
||||
'92': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (3, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (4, 0), # noqa
|
||||
('52', 'd3'): (3, 4),
|
||||
('54', 'd5'): (3, 8),
|
||||
('d6', '57'): (3, 12),
|
||||
('58', 'd9'): (3, 16),
|
||||
('da', '5b'): (3, 20),
|
||||
('dc', '5d'): (3, 24),
|
||||
('5e', 'df'): (3, 28),
|
||||
|
||||
('f2', '73'): (4, 4),
|
||||
('f4', '75'): (4, 8),
|
||||
('76', 'f7'): (4, 12),
|
||||
('f8', '79'): (4, 16),
|
||||
('7a', 'fb'): (4, 20),
|
||||
('7c', 'fd'): (4, 24),
|
||||
('fe', '7f'): (4, 28)
|
||||
},
|
||||
'15': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (5, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (6, 0), # noqa
|
||||
('52', 'd3'): (5, 4),
|
||||
('54', 'd5'): (5, 8),
|
||||
('d6', '57'): (5, 12),
|
||||
('58', 'd9'): (5, 16),
|
||||
('da', '5b'): (5, 20),
|
||||
('dc', '5d'): (5, 24),
|
||||
('5e', 'df'): (5, 28),
|
||||
|
||||
('f2', '73'): (6, 4),
|
||||
('f4', '75'): (6, 8),
|
||||
('76', 'f7'): (6, 12),
|
||||
('f8', '79'): (6, 16),
|
||||
('7a', 'fb'): (6, 20),
|
||||
('7c', 'fd'): (6, 24),
|
||||
('fe', '7f'): (6, 28)
|
||||
},
|
||||
'16': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (7, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (8, 0), # noqa
|
||||
('52', 'd3'): (7, 4),
|
||||
('54', 'd5'): (7, 8),
|
||||
('d6', '57'): (7, 12),
|
||||
('58', 'd9'): (7, 16),
|
||||
('da', '5b'): (7, 20),
|
||||
('dc', '5d'): (7, 24),
|
||||
('5e', 'df'): (7, 28),
|
||||
|
||||
('f2', '73'): (8, 4),
|
||||
('f4', '75'): (8, 8),
|
||||
('76', 'f7'): (8, 12),
|
||||
('f8', '79'): (8, 16),
|
||||
('7a', 'fb'): (8, 20),
|
||||
('7c', 'fd'): (8, 24),
|
||||
('fe', '7f'): (8, 28)
|
||||
},
|
||||
'97': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (9, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (10, 0), # noqa
|
||||
('52', 'd3'): (9, 4),
|
||||
('54', 'd5'): (9, 8),
|
||||
('d6', '57'): (9, 12),
|
||||
('58', 'd9'): (9, 16),
|
||||
('da', '5b'): (9, 20),
|
||||
('dc', '5d'): (9, 24),
|
||||
('5e', 'df'): (9, 28),
|
||||
|
||||
('f2', '73'): (10, 4),
|
||||
('f4', '75'): (10, 8),
|
||||
('76', 'f7'): (10, 12),
|
||||
('f8', '79'): (10, 16),
|
||||
('7a', 'fb'): (10, 20),
|
||||
('7c', 'fd'): (10, 24),
|
||||
('fe', '7f'): (10, 28)
|
||||
},
|
||||
'10': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (11, 0), # noqa
|
||||
('52', 'd3'): (11, 4),
|
||||
('54', 'd5'): (11, 8),
|
||||
('d6', '57'): (11, 12),
|
||||
('58', 'd9'): (11, 16),
|
||||
('da', '5b'): (11, 20),
|
||||
('dc', '5d'): (11, 24),
|
||||
('5e', 'df'): (11, 28),
|
||||
},
|
||||
'13': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (12, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (13, 0), # noqa
|
||||
('52', 'd3'): (12, 4),
|
||||
('54', 'd5'): (12, 8),
|
||||
('d6', '57'): (12, 12),
|
||||
('58', 'd9'): (12, 16),
|
||||
('da', '5b'): (12, 20),
|
||||
('dc', '5d'): (12, 24),
|
||||
('5e', 'df'): (12, 28),
|
||||
|
||||
('f2', '73'): (13, 4),
|
||||
('f4', '75'): (13, 8),
|
||||
('76', 'f7'): (13, 12),
|
||||
('f8', '79'): (13, 16),
|
||||
('7a', 'fb'): (13, 20),
|
||||
('7c', 'fd'): (13, 24),
|
||||
('fe', '7f'): (13, 28)
|
||||
},
|
||||
'94': {
|
||||
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (14, 0), # noqa
|
||||
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (15, 0), # noqa
|
||||
('52', 'd3'): (14, 4),
|
||||
('54', 'd5'): (14, 8),
|
||||
('d6', '57'): (14, 12),
|
||||
('58', 'd9'): (14, 16),
|
||||
('da', '5b'): (14, 20),
|
||||
('dc', '5d'): (14, 24),
|
||||
('5e', 'df'): (14, 28),
|
||||
|
||||
('f2', '73'): (15, 4),
|
||||
('f4', '75'): (15, 8),
|
||||
('76', 'f7'): (15, 12),
|
||||
('f8', '79'): (15, 16),
|
||||
('7a', 'fb'): (15, 20),
|
||||
('7c', 'fd'): (15, 24),
|
||||
('fe', '7f'): (15, 28)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def _create_position_to_bytes_map(bytes_to_pos):
|
||||
result = {}
|
||||
for high_byte, low_byte_dict in list(bytes_to_pos.items()):
|
||||
|
||||
# must contain mappings to column, to the tuple of possible values
|
||||
for low_byte_list in list(low_byte_dict.keys()):
|
||||
column = bytes_to_pos[high_byte][low_byte_list][1]
|
||||
|
||||
row = bytes_to_pos[high_byte][low_byte_list][0]
|
||||
if row not in result:
|
||||
result[row] = {}
|
||||
|
||||
result[row][column] = (
|
||||
tuple(product([high_byte], low_byte_list)))
|
||||
return result
|
||||
|
||||
# (Almost) the reverse of PAC_BYTES_TO_POSITIONING_MAP. Call with arguments
|
||||
# like for example [15][4] to get the tuple ((u'94', u'f2'), (u'94', u'73'))
|
||||
POSITIONING_TO_PAC_MAP = _create_position_to_bytes_map(
|
||||
PAC_BYTES_TO_POSITIONING_MAP
|
||||
)
|
||||
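# Editor's illustrative sketch (not part of the upstream module): the lookup
# described in the comment above, written out as code.
def _example_position_to_pac():
    # Row 15, column 4 can be addressed by either of two PAC byte pairs.
    assert POSITIONING_TO_PAC_MAP[15][4] == (('94', 'f2'), ('94', '73'))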
|
||||
|
||||
def _restructure_bytes_to_position_map(byte_to_pos_map):
|
||||
return {
|
||||
k_: {
|
||||
low_byte: byte_to_pos_map[k_][low_byte_list]
|
||||
for low_byte_list in list(v_.keys()) for low_byte in low_byte_list
|
||||
}
|
||||
for k_, v_ in list(byte_to_pos_map.items())
|
||||
}
|
||||
|
||||
# Now use the dict with arguments like [u'91'][u'75'] directly.
|
||||
PAC_BYTES_TO_POSITIONING_MAP = _restructure_bytes_to_position_map(
|
||||
PAC_BYTES_TO_POSITIONING_MAP)
|
||||
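# Editor's illustrative sketch (not part of the upstream module): after the
# restructuring above, a single low byte indexes the map directly.
def _example_pac_lookup():
    assert PAC_BYTES_TO_POSITIONING_MAP['91']['d6'] == (1, 12)
    assert PAC_BYTES_TO_POSITIONING_MAP['94']['70'] == (15, 0)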
|
||||
|
||||
# Inverted character lookup
|
||||
CHARACTER_TO_CODE = {
|
||||
character: code
|
||||
for code, character in viewitems(CHARACTERS)
|
||||
}
|
||||
|
||||
SPECIAL_OR_EXTENDED_CHAR_TO_CODE = {
|
||||
character: code for code, character in viewitems(EXTENDED_CHARS)
|
||||
}
|
||||
SPECIAL_OR_EXTENDED_CHAR_TO_CODE.update(
|
||||
{character: code for code, character in viewitems(SPECIAL_CHARS)}
|
||||
)
|
||||
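# Editor's illustrative sketch (not part of the upstream module): the inverted
# lookups map a printable character back to its two- or four-digit SCC code.
def _example_char_lookup():
    assert CHARACTER_TO_CODE['a'] == '61'
    assert SPECIAL_OR_EXTENDED_CHAR_TO_CODE['♪'] == '9137'
    assert SPECIAL_OR_EXTENDED_CHAR_TO_CODE['É'] == '92a1'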
|
||||
# Time to transmit a single codeword = 1 second / 29.97
|
||||
MICROSECONDS_PER_CODEWORD = 1000.0 * 1000.0 / (30.0 * 1000.0 / 1001.0)
|
||||
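# Editor's note (illustrative): this evaluates to roughly 33,366.7 microseconds,
# i.e. one frame period at 29.97 fps.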
|
||||
|
||||
HEADER = 'Scenarist_SCC V1.0'
|
||||
823
utils/modules/pycaption/scc/specialized_collections.py
Normal file
@@ -0,0 +1,823 @@
|
||||
from ..base import CaptionList, Caption, CaptionNode
|
||||
from ..geometry import (UnitEnum, Size, Layout, Point, Alignment,
|
||||
VerticalAlignmentEnum, HorizontalAlignmentEnum)
|
||||
|
||||
from .constants import PAC_BYTES_TO_POSITIONING_MAP, COMMANDS
|
||||
import collections
|
||||
|
||||
|
||||
class PreCaption(object):
|
||||
"""
|
||||
The Caption class has been refactored and now its instances must be used as
|
||||
immutable objects. Some of the code in this module, however, relied on the
|
||||
fact that Caption instances were mutable. For backwards compatibility,
|
||||
therefore, this class was created to work as a mutable caption data holder
|
||||
used to eventually instantiate an actual Caption object.
|
||||
"""
|
||||
|
||||
def __init__(self, start=0, end=0):
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.nodes = []
|
||||
self.style = {}
|
||||
self.layout_info = None
|
||||
|
||||
def to_real_caption(self):
|
||||
return Caption(
|
||||
self.start, self.end, self.nodes, self.style, self.layout_info
|
||||
)
|
||||
|
||||
|
||||
class TimingCorrectingCaptionList(list):
|
||||
"""List of captions. When appending new elements, it will correct the end time
|
||||
of the last ones, so they end when the new caption gets added.
|
||||
|
||||
"last ones" could mean the last caption `append`ed or all of the last
|
||||
captions with which this list was `extended`
|
||||
|
||||
Also, doesn't allow Nones or empty captions
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TimingCorrectingCaptionList, self).__init__(*args, **kwargs)
|
||||
self._last_batch = ()
|
||||
|
||||
def append(self, p_object):
|
||||
"""When appending a new caption to the list, make sure the last one
|
||||
has an end. Also, don't add empty captions
|
||||
|
||||
:type p_object: Caption | None
|
||||
"""
|
||||
if p_object is None or not p_object.nodes:
|
||||
return
|
||||
|
||||
self._update_last_batch(self._last_batch, p_object)
|
||||
|
||||
self._last_batch = (p_object,)
|
||||
|
||||
super(TimingCorrectingCaptionList, self).append(p_object)
|
||||
|
||||
def extend(self, iterable):
|
||||
"""Adds the elements in the iterable to the list, regarding the first
|
||||
caption's start time as the end time for the previously added
|
||||
caption(s)
|
||||
|
||||
:param iterable: an iterable of Caption instances
|
||||
"""
|
||||
appendable_items = [item for item in iterable if item and item.nodes]
|
||||
self._update_last_batch(self._last_batch, *appendable_items)
|
||||
|
||||
self._last_batch = tuple(appendable_items)
|
||||
|
||||
super(TimingCorrectingCaptionList, self).extend(appendable_items)
|
||||
|
||||
@staticmethod
|
||||
def _update_last_batch(batch, *new_captions):
|
||||
"""Given a batch of captions, sets their end time equal to the start
|
||||
time of the first caption in *new_captions
|
||||
|
||||
A start time of 0 for the first caption in new_captions indicates
an invalid SCC file.
|
||||
|
||||
:type batch: tuple[Caption]
|
||||
:type new_captions: tuple[Caption]
|
||||
"""
|
||||
if not new_captions:
|
||||
return
|
||||
if not new_captions[0]:
|
||||
return
|
||||
if not new_captions[0].nodes:
|
||||
return
|
||||
|
||||
new_caption = new_captions[0]
|
||||
|
||||
if batch and batch[-1].end == 0:
|
||||
for caption in batch:
|
||||
caption.end = new_caption.start
|
||||
|
||||
|
||||
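# --- Editor's illustrative sketch (not part of the upstream module). Shows the
# end-time correction performed by TimingCorrectingCaptionList; the cue text
# and start times are arbitrary.
def _example_timing_correction():
    caption_list = TimingCorrectingCaptionList()
    first = PreCaption(start=1000000)   # end not yet known, stays 0 for now
    first.nodes.append(CaptionNode.create_text('first cue'))
    second = PreCaption(start=3000000)
    second.nodes.append(CaptionNode.create_text('second cue'))
    caption_list.append(first)
    caption_list.append(second)         # closes the previous caption
    assert first.end == 3000000
    return caption_list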
class NotifyingDict(dict):
|
||||
"""Dictionary-like object, that treats one key as 'active',
|
||||
and notifies observers if the active key changed
|
||||
"""
|
||||
# Need an unhashable object as initial value for the active key.
|
||||
# That way we're sure this was never a key in the dict.
|
||||
_guard = {}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(NotifyingDict, self).__init__(*args, **kwargs)
|
||||
self.active_key = self._guard
|
||||
self.observers = []
|
||||
|
||||
def set_active(self, key):
|
||||
"""Sets the active key
|
||||
|
||||
:param key: any hashable object
|
||||
"""
|
||||
if key not in self:
|
||||
raise ValueError('No such key present')
|
||||
|
||||
# Notify observers of the change
|
||||
if key != self.active_key:
|
||||
for observer in self.observers:
|
||||
observer(self.active_key, key)
|
||||
|
||||
self.active_key = key
|
||||
|
||||
def get_active(self):
|
||||
"""Returns the value corresponding to the active key
|
||||
"""
|
||||
if self.active_key is self._guard:
|
||||
raise KeyError('No active key set')
|
||||
|
||||
return self[self.active_key]
|
||||
|
||||
def add_change_observer(self, observer):
|
||||
"""Receives a callable function, which it will call if the active
|
||||
element changes.
|
||||
|
||||
The observer will receive 2 positional arguments: the old and new key
|
||||
|
||||
:param observer: any callable that can be called with 2 positional
|
||||
arguments
|
||||
"""
|
||||
if not isinstance(observer, collections.Callable):
|
||||
raise TypeError('The observer should be callable')
|
||||
|
||||
self.observers.append(observer)
|
||||
|
||||
|
||||
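# --- Editor's illustrative sketch (not part of the upstream module). Shows the
# observer mechanism of NotifyingDict; the key names are made up for the
# example.
def _example_notifying_dict():
    buffers = NotifyingDict(pop_on=[], paint_on=[])
    changes = []
    buffers.add_change_observer(lambda old, new: changes.append((old, new)))
    buffers.set_active('pop_on')        # observers are told about the switch
    assert buffers.get_active() == []
    return changes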
class CaptionCreator(object):
|
||||
"""Creates and maintains a collection of Captions
|
||||
"""
|
||||
def __init__(self):
|
||||
self._collection = TimingCorrectingCaptionList()
|
||||
|
||||
# subset of self._collection;
|
||||
# captions here will be susceptible to time corrections
|
||||
self._still_editing = []
|
||||
|
||||
def correct_last_timing(self, end_time, force=False):
|
||||
"""Called to set the time on the last Caption(s) stored with no end
|
||||
time
|
||||
|
||||
:type force: bool
|
||||
:param force: Set the end time even if there's already an end time
|
||||
|
||||
:type end_time: float
|
||||
:param end_time: microseconds; the end of the caption;
|
||||
"""
|
||||
if not self._still_editing:
|
||||
return
|
||||
|
||||
if force:
|
||||
# Select all last captions
|
||||
captions_to_correct = self._still_editing
|
||||
elif self._still_editing[-1].end == 0:
|
||||
# Only select the last captions if they haven't gotten their
|
||||
# end time set yet
|
||||
captions_to_correct = self._still_editing
|
||||
else:
|
||||
return
|
||||
|
||||
for caption in captions_to_correct:
|
||||
caption.end = end_time
|
||||
|
||||
def create_and_store(self, node_buffer, start):
|
||||
"""Interpreter method, will convert the buffer into one or more Caption
|
||||
objects, storing them internally.
|
||||
|
||||
This method relies on the InstructionNodeCreator's ability to generate
|
||||
InstructionNodes properly, so at this point we can convert
|
||||
_InstructionNode instances almost 1:1 to CaptionNodes
|
||||
|
||||
:type node_buffer: InstructionNodeCreator
|
||||
|
||||
:type start: float
|
||||
:param start: the start time in microseconds
|
||||
"""
|
||||
if node_buffer.is_empty():
|
||||
return
|
||||
|
||||
caption = PreCaption()
|
||||
caption.start = start
|
||||
caption.end = 0 # Not yet known; filled in later
|
||||
self._still_editing = [caption]
|
||||
|
||||
for instruction in node_buffer:
|
||||
# skip empty elements
|
||||
if instruction.is_empty():
|
||||
continue
|
||||
|
||||
elif instruction.requires_repositioning():
|
||||
caption = PreCaption()
|
||||
caption.start = start
|
||||
caption.end = 0
|
||||
self._still_editing.append(caption)
|
||||
|
||||
# handle line breaks
|
||||
elif instruction.is_explicit_break():
|
||||
caption.nodes.append(CaptionNode.create_break(
|
||||
layout_info=_get_layout_from_tuple(instruction.position)
|
||||
))
|
||||
|
||||
# handle open italics
|
||||
elif instruction.sets_italics_on():
|
||||
caption.nodes.append(
|
||||
CaptionNode.create_style(
|
||||
True, {'italics': True},
|
||||
layout_info=_get_layout_from_tuple(
|
||||
instruction.position
|
||||
))
|
||||
)
|
||||
|
||||
# handle closing italics
|
||||
elif instruction.sets_italics_off():
|
||||
caption.nodes.append(
|
||||
CaptionNode.create_style(
|
||||
False, {'italics': True},
|
||||
layout_info=_get_layout_from_tuple(
|
||||
instruction.position)
|
||||
))
|
||||
|
||||
# handle text
|
||||
elif instruction.is_text_node():
|
||||
layout_info = _get_layout_from_tuple(instruction.position)
|
||||
caption.nodes.append(
|
||||
CaptionNode.create_text(
|
||||
instruction.get_text(), layout_info=layout_info),
|
||||
)
|
||||
caption.layout_info = layout_info
|
||||
|
||||
self._collection.extend(self._still_editing)
|
||||
|
||||
def get_all(self):
|
||||
"""Returns the Caption collection as a CaptionList
|
||||
|
||||
:rtype: CaptionList
|
||||
"""
|
||||
caption_list = CaptionList()
|
||||
for precap in self._collection:
|
||||
caption_list.append(precap.to_real_caption())
|
||||
return caption_list
|
||||
|
||||
|
||||
class InstructionNodeCreator(object):
|
||||
"""Creates _InstructionNode instances from characters and commands, storing
|
||||
them internally
|
||||
"""
|
||||
def __init__(self, collection=None, position_tracker=None):
|
||||
"""
|
||||
:param collection: an optional collection of nodes
|
||||
|
||||
:param position_tracker: state machine queried for the current position
when creating nodes
|
||||
"""
|
||||
if not collection:
|
||||
self._collection = []
|
||||
else:
|
||||
self._collection = collection
|
||||
|
||||
self._position_tracer = position_tracker
|
||||
|
||||
def is_empty(self):
|
||||
"""Whether any text was added to the buffer
|
||||
"""
|
||||
return not any(element.text for element in self._collection)
|
||||
|
||||
def add_chars(self, *chars):
|
||||
"""Adds characters to a text node (last text node, or a new one)
|
||||
|
||||
:param chars: tuple containing text (unicode)
|
||||
"""
|
||||
if not chars:
|
||||
return
|
||||
|
||||
current_position = self._position_tracer.get_current_position()
|
||||
|
||||
# get or create a usable node
|
||||
if (self._collection and self._collection[-1].is_text_node()
|
||||
and not self._position_tracer.is_repositioning_required()):
|
||||
node = self._collection[-1]
|
||||
else:
|
||||
# create first node
|
||||
node = _InstructionNode(position=current_position)
|
||||
self._collection.append(node)
|
||||
|
||||
# handle a simple line break
|
||||
if self._position_tracer.is_linebreak_required():
|
||||
# must insert a line break here
|
||||
self._collection.append(_InstructionNode.create_break(
|
||||
position=current_position))
|
||||
node = _InstructionNode.create_text(current_position)
|
||||
self._collection.append(node)
|
||||
self._position_tracer.acknowledge_linebreak_consumed()
|
||||
|
||||
# handle completely new positioning
|
||||
elif self._position_tracer.is_repositioning_required():
|
||||
self._collection.append(
|
||||
_InstructionNode.create_repositioning_command(
|
||||
current_position
|
||||
)
|
||||
)
|
||||
node = _InstructionNode.create_text(current_position)
|
||||
self._collection.append(node)
|
||||
self._position_tracer.acknowledge_position_changed()
|
||||
|
||||
node.add_chars(*chars)
|
||||
|
||||
def interpret_command(self, command):
|
||||
"""Given a command determines whether tu turn italics on or off,
|
||||
or to set the positioning
|
||||
|
||||
This is mostly used to convert from the legacy-style commands
|
||||
|
||||
:type command: unicode
|
||||
"""
|
||||
self._update_positioning(command)
|
||||
|
||||
text = COMMANDS.get(command, '')
|
||||
|
||||
if 'italic' in text:
|
||||
if 'end' not in text:
|
||||
self._collection.append(
|
||||
_InstructionNode.create_italics_style(
|
||||
self._position_tracer.get_current_position())
|
||||
)
|
||||
else:
|
||||
self._collection.append(
|
||||
_InstructionNode.create_italics_style(
|
||||
self._position_tracer.get_current_position(),
|
||||
turn_on=False
|
||||
)
|
||||
)
|
||||
|
||||
def _update_positioning(self, command):
|
||||
"""Sets the positioning information to use for the next nodes
|
||||
|
||||
:type command: unicode
|
||||
"""
|
||||
if len(command) != 4:
|
||||
return
|
||||
|
||||
first, second = command[:2], command[2:]
|
||||
|
||||
try:
|
||||
positioning = PAC_BYTES_TO_POSITIONING_MAP[first][second]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
self._position_tracer.update_positioning(positioning)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(_format_italics(self._collection))
|
||||
|
||||
@classmethod
|
||||
def from_list(cls, stash_list, position_tracker):
|
||||
"""Having received a list of instances of this class, creates a new
|
||||
instance that contains all the nodes of the previous instances
|
||||
(basically concatenates the many stashes into one)
|
||||
|
||||
:type stash_list: list[InstructionNodeCreator]
|
||||
:param stash_list: a list of instances of this class
|
||||
|
||||
:type position_tracker: .state_machines.DefaultProvidingPositionTracker
|
||||
:param position_tracker: state machine to be interrogated about the
|
||||
positioning when creating a node
|
||||
|
||||
:rtype: InstructionNodeCreator
|
||||
"""
|
||||
instance = cls(position_tracker=position_tracker)
|
||||
new_collection = instance._collection
|
||||
|
||||
for idx, stash in enumerate(stash_list):
|
||||
new_collection.extend(stash._collection)
|
||||
|
||||
# use space to separate the stashes, but don't add final space
|
||||
if idx < len(stash_list) - 1:
|
||||
try:
|
||||
instance._collection[-1].add_chars(' ')
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
return instance
|
||||
|
||||
|
||||
def _get_layout_from_tuple(position_tuple):
|
||||
"""Create a Layout object from the positioning information given
|
||||
|
||||
The row can have a value from 1 to 15 inclusive. (vertical positioning)
|
||||
The column can have a value from 0 to 31 inclusive. (horizontal)
|
||||
|
||||
:param position_tuple: a tuple of ints (row, col)
|
||||
:type position_tuple: tuple
|
||||
:rtype: Layout
|
||||
"""
|
||||
if not position_tuple:
|
||||
return None
|
||||
|
||||
row, column = position_tuple
|
||||
|
||||
horizontal = Size(100 * column / 32.0, UnitEnum.PERCENT)
|
||||
vertical = Size(100 * (row - 1) / 15.0, UnitEnum.PERCENT)
|
||||
return Layout(origin=Point(horizontal, vertical),
|
||||
alignment=Alignment(HorizontalAlignmentEnum.LEFT,
|
||||
VerticalAlignmentEnum.TOP)
|
||||
)
|
||||
|
||||
|
||||
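# Editor's illustrative sketch (not part of the upstream module): row 15 of 15
# maps to 100 * (15 - 1) / 15.0 = 93.33% from the top, and column 4 of 32 maps
# to 100 * 4 / 32.0 = 12.5% from the left.
def _example_layout_from_tuple():
    return _get_layout_from_tuple((15, 4))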
class _InstructionNode(object):
|
||||
"""Value object, that can contain text information, or interpretable
|
||||
commands (such as explicit line breaks or turning italics on/off).
|
||||
|
||||
These nodes will be aggregated into a RepresentableNode, which will then
|
||||
be easily converted to a CaptionNode.
|
||||
"""
|
||||
TEXT = 0
|
||||
BREAK = 1
|
||||
ITALICS_ON = 2
|
||||
ITALICS_OFF = 3
|
||||
CHANGE_POSITION = 4
|
||||
|
||||
def __init__(self, text=None, position=None, type_=0):
|
||||
"""
|
||||
:type text: unicode
|
||||
:param position: a tuple of ints (row, column)
|
||||
:param type_: self.TEXT | self.BREAK | self.ITALICS_ON | self.ITALICS_OFF | self.CHANGE_POSITION
|
||||
:type type_: int
|
||||
"""
|
||||
self.text = text
|
||||
self.position = position
|
||||
self._type = type_
|
||||
|
||||
def add_chars(self, *args):
|
||||
"""This being a text node, add characters to it.
|
||||
:param args:
|
||||
:type args: tuple[unicode]
|
||||
:return:
|
||||
"""
|
||||
if self.text is None:
|
||||
self.text = ''
|
||||
|
||||
self.text += ''.join(args)
|
||||
|
||||
def is_text_node(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type == self.TEXT
|
||||
|
||||
def is_empty(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
if self._type == self.TEXT:
|
||||
return not self.text
|
||||
|
||||
return False
|
||||
|
||||
def is_explicit_break(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type == self.BREAK
|
||||
|
||||
def sets_italics_on(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type == self.ITALICS_ON
|
||||
|
||||
def sets_italics_off(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type == self.ITALICS_OFF
|
||||
|
||||
def is_italics_node(self):
|
||||
"""
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type in (self.ITALICS_OFF, self.ITALICS_ON)
|
||||
|
||||
def requires_repositioning(self):
|
||||
"""Whether the node must be interpreted as a change in positioning
|
||||
|
||||
:rtype: bool
|
||||
"""
|
||||
return self._type == self.CHANGE_POSITION
|
||||
|
||||
def get_text(self):
|
||||
"""A little legacy code.
|
||||
"""
|
||||
return ' '.join(self.text.split())
|
||||
|
||||
@classmethod
|
||||
def create_break(cls, position):
|
||||
"""Create a node, interpretable as an explicit line break
|
||||
|
||||
:type position: tuple[int]
|
||||
:param position: a tuple (row, col) containing the positioning info
|
||||
|
||||
:rtype: _InstructionNode
|
||||
"""
|
||||
return cls(type_=cls.BREAK, position=position)
|
||||
|
||||
@classmethod
|
||||
def create_text(cls, position, *chars):
|
||||
"""Create a node interpretable as text
|
||||
|
||||
:type position: tuple[int]
|
||||
:param position: a tuple (row, col) to mark the positioning
|
||||
|
||||
:type chars: tuple[unicode]
|
||||
:param chars: characters to add to the text
|
||||
|
||||
:rtype: _InstructionNode
|
||||
"""
|
||||
return cls(''.join(chars), position=position)
|
||||
|
||||
@classmethod
|
||||
def create_italics_style(cls, position, turn_on=True):
|
||||
"""Create a node, interpretable as a command to switch italics on/off
|
||||
|
||||
:type position: tuple[int]
|
||||
:param position: a tuple (row, col) to mark the positioning
|
||||
|
||||
:type turn_on: bool
|
||||
:param turn_on: whether to turn the italics on or off
|
||||
|
||||
:rtype: _InstructionNode
|
||||
"""
|
||||
return cls(
|
||||
position=position,
|
||||
type_=cls.ITALICS_ON if turn_on else cls.ITALICS_OFF
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def create_repositioning_command(cls, position=None):
|
||||
"""Create node interpretable as a command to change the current
|
||||
position
|
||||
|
||||
:type position: tuple[int]
|
||||
"""
|
||||
return cls(type_=cls.CHANGE_POSITION, position=position)
|
||||
|
||||
def __repr__(self): # pragma: no cover
|
||||
if self._type == self.BREAK:
|
||||
extra = 'BR'
|
||||
elif self._type == self.TEXT:
|
||||
extra = '"{}"'.format(self.text)
|
||||
elif self._type in (self.ITALICS_ON, self.ITALICS_OFF):
|
||||
extra = 'italics {}'.format(
|
||||
'on' if self._type == self.ITALICS_ON else 'off'
|
||||
)
|
||||
else:
|
||||
extra = 'change position'
|
||||
|
||||
return '<INode: {extra} >'.format(extra=extra)
|
||||
|
||||
|
||||
def _format_italics(collection):
|
||||
"""Given a raw list of _InstructionNodes, returns a new equivalent list
|
||||
where all the italics nodes properly close and open.
|
||||
|
||||
The list is equivalent in the sense that the SCC commands that would have
|
||||
generated the output list, would have had the exact same visual effect
|
||||
as the ones that generated the output, as far as italics are concerned.
|
||||
|
||||
This is useful because the raw commands read from the SCC can't be used
|
||||
the way they are by the writers for the other formats. Those other writers
|
||||
require the list of CaptionNodes to be formatted in a certain way.
|
||||
|
||||
Note: Using state machines to manage the italics didn't work well because
|
||||
we're using state machines already to track the position, and their
|
||||
interactions became too complex to manage.
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = _skip_initial_italics_off_nodes(collection)
|
||||
|
||||
new_collection = _skip_empty_text_nodes(new_collection)
|
||||
|
||||
# after this step we're guaranteed a proper ordering of the nodes
|
||||
new_collection = _skip_redundant_italics_nodes(new_collection)
|
||||
|
||||
# after this, we're guaranteed that the italics are properly contained
|
||||
# within their context
|
||||
new_collection = _close_italics_before_repositioning(new_collection)
|
||||
|
||||
# all nodes will be closed after this step
|
||||
new_collection = _ensure_final_italics_node_closes(new_collection)
|
||||
|
||||
# removes pairs of italics nodes that don't do anything noticeable
|
||||
new_collection = _remove_noop_italics(new_collection)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
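# --- Editor's illustrative sketch (not part of the upstream module). Shows the
# overall effect of _format_italics on a small, hand-built node list; the text
# and position are arbitrary.
def _example_format_italics():
    position = (15, 0)
    nodes = [
        _InstructionNode.create_italics_style(position, turn_on=False),
        _InstructionNode.create_text(position, 'HELLO'),
        _InstructionNode.create_italics_style(position),
    ]
    cleaned = _format_italics(nodes)
    # The stray leading <italics off> and the unclosed trailing <italics on>
    # should both be dropped, leaving only the text node.
    assert len(cleaned) == 1 and cleaned[0].is_text_node()
    return cleaned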
def _remove_noop_on_off_italics(collection):
|
||||
"""Return an equivalent list to `collection`. It removes the italics node
|
||||
pairs that don't surround text nodes, if those nodes are in the order:
|
||||
on, off
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = []
|
||||
to_commit = None
|
||||
|
||||
for node in collection:
|
||||
if node.is_italics_node() and node.sets_italics_on():
|
||||
to_commit = node
|
||||
continue
|
||||
|
||||
elif node.is_italics_node() and node.sets_italics_off():
|
||||
if to_commit:
|
||||
to_commit = None
|
||||
continue
|
||||
else:
|
||||
if to_commit:
|
||||
new_collection.append(to_commit)
|
||||
to_commit = None
|
||||
|
||||
new_collection.append(node)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _remove_noop_off_on_italics(collection):
|
||||
"""Removes pairs of off-on italics nodes, that don't surround any other
|
||||
node
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:return: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = []
|
||||
to_commit = None
|
||||
|
||||
for node in collection:
|
||||
if node.is_italics_node() and node.sets_italics_off():
|
||||
to_commit = node
|
||||
continue
|
||||
|
||||
elif node.is_italics_node() and node.sets_italics_on():
|
||||
if to_commit:
|
||||
to_commit = None
|
||||
continue
|
||||
else:
|
||||
if to_commit:
|
||||
new_collection.append(to_commit)
|
||||
to_commit = None
|
||||
|
||||
new_collection.append(node)
|
||||
|
||||
if to_commit:
|
||||
new_collection.append(to_commit)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _remove_noop_italics(collection):
|
||||
"""Return an equivalent list to `collection`. It removes the italics node
|
||||
pairs that don't surround text nodes
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = _remove_noop_on_off_italics(collection)
|
||||
|
||||
new_collection = _remove_noop_off_on_italics(new_collection)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _skip_initial_italics_off_nodes(collection):
|
||||
"""Return a collection like the one given, but without the
|
||||
initial <Italics OFF> nodes
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = []
|
||||
can_add_italics_off_nodes = False
|
||||
|
||||
for node in collection:
|
||||
if node.is_italics_node():
|
||||
if node.sets_italics_on():
|
||||
can_add_italics_off_nodes = True
|
||||
new_collection.append(node)
|
||||
elif can_add_italics_off_nodes:
|
||||
new_collection.append(node)
|
||||
else:
|
||||
new_collection.append(node)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _skip_empty_text_nodes(collection):
|
||||
"""Return an iterable containing all the nodes in the previous
|
||||
collection except for the empty text nodes
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
return [node for node in collection
|
||||
if not (node.is_text_node() and node.is_empty())]
|
||||
|
||||
|
||||
def _skip_redundant_italics_nodes(collection):
|
||||
"""Return a list where the <Italics On> nodes only appear after
|
||||
<Italics OFF>, and vice versa. This ignores the other node types, and
|
||||
only removes redundant italic nodes
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = []
|
||||
state = None
|
||||
|
||||
for node in collection:
|
||||
if node.is_italics_node():
|
||||
if state is None:
|
||||
state = node.sets_italics_on()
|
||||
new_collection.append(node)
|
||||
continue
|
||||
# skip the nodes that are like the previous
|
||||
if node.sets_italics_on() is state:
|
||||
continue
|
||||
else:
|
||||
state = node.sets_italics_on()
|
||||
new_collection.append(node)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _close_italics_before_repositioning(collection):
|
||||
"""Make sure that for every opened italic node, there's a corresponding
|
||||
closing node.
|
||||
|
||||
Will insert a closing italic node, before each repositioning node
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = []
|
||||
|
||||
italics_on = False
|
||||
last_italics_on_node = None
|
||||
|
||||
for idx, node in enumerate(collection):
|
||||
if node.is_italics_node() and node.sets_italics_on():
|
||||
italics_on = True
|
||||
last_italics_on_node = node
|
||||
if node.is_italics_node() and node.sets_italics_off():
|
||||
italics_on = False
|
||||
if node.requires_repositioning() and italics_on:
|
||||
# Append an italics closing node before the position change
|
||||
new_collection.append(
|
||||
_InstructionNode.create_italics_style(
|
||||
# The position info of this new node should be the same
|
||||
position=last_italics_on_node.position,
|
||||
turn_on=False
|
||||
)
|
||||
)
|
||||
new_collection.append(node)
|
||||
# Append an italics opening node after the positioning change
|
||||
new_collection.append(
|
||||
_InstructionNode.create_italics_style(
|
||||
position=node.position
|
||||
)
|
||||
)
|
||||
continue
|
||||
new_collection.append(node)
|
||||
|
||||
return new_collection
|
||||
|
||||
|
||||
def _ensure_final_italics_node_closes(collection):
|
||||
"""The final italics command needs to be closed
|
||||
|
||||
:type collection: list[_InstructionNode]
|
||||
:rtype: list[_InstructionNode]
|
||||
"""
|
||||
new_collection = list(collection)
|
||||
|
||||
italics_on = False
|
||||
last_italics_on_node = None
|
||||
|
||||
for node in collection:
|
||||
if node.is_italics_node() and node.sets_italics_on():
|
||||
italics_on = True
|
||||
last_italics_on_node = node
|
||||
if node.is_italics_node() and node.sets_italics_off():
|
||||
italics_on = False
|
||||
|
||||
if italics_on:
|
||||
new_collection.append(
|
||||
_InstructionNode.create_italics_style(
|
||||
position=last_italics_on_node.position,
|
||||
turn_on=False
|
||||
)
|
||||
)
|
||||
return new_collection
|
||||
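For a sense of what these helpers do, here is a minimal sketch of the redundant-italics filtering, using a small stand-in node class. The stand-in class and the free function below are hypothetical illustrations only; the real code above operates on _InstructionNode instances.

# Hypothetical stand-in that answers the same questions the helper asks of
# _InstructionNode; for illustration only.
class FakeNode(object):
    def __init__(self, kind, italics_on=None):
        self.kind = kind              # 'text' or 'italics'
        self.italics_on = italics_on

    def is_italics_node(self):
        return self.kind == 'italics'

    def sets_italics_on(self):
        return self.italics_on


def skip_redundant_italics(collection):
    # Same state-toggle idea as _skip_redundant_italics_nodes above: keep an
    # italics command only when it actually flips the current state.
    new_collection = []
    state = None
    for node in collection:
        if node.is_italics_node():
            if state is None:
                state = node.sets_italics_on()
            elif node.sets_italics_on() is state:
                continue              # same command twice in a row: drop it
            else:
                state = node.sets_italics_on()
        new_collection.append(node)
    return new_collection


nodes = [FakeNode('italics', True), FakeNode('italics', True),
         FakeNode('text'), FakeNode('italics', False)]
print(len(skip_redundant_italics(nodes)))  # 3: the duplicate <Italics ON> is gone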
128
utils/modules/pycaption/scc/state_machines.py
Normal file
@@ -0,0 +1,128 @@
from ..exceptions import CaptionReadSyntaxError


class _PositioningTracker(object):
    """Helps determine the positioning of a node, having kept track of
    positioning-related commands.
    """
    def __init__(self, positioning=None):
        """
        :param positioning: positioning information (row, column)
        :type positioning: tuple[int]
        """
        self._positions = [positioning]
        self._break_required = False
        self._repositioning_required = False

    def update_positioning(self, positioning):
        """Being notified of a position change, updates the internal state,
        so as to be able to tell whether it was a trivial change (a simple
        line break) or not.

        :type positioning: tuple[int]
        :param positioning: a tuple (row, col)
        """
        current = self._positions[-1]

        if not current:
            if positioning:
                # set the positioning for the first time
                self._positions = [positioning]
            return

        row, col = current
        new_row, _ = positioning

        # is the new position simply one line below?
        if new_row == row + 1:
            self._positions.append((new_row, col))
            self._break_required = True
        else:
            # reset the "current" position altogether.
            self._positions = [positioning]
            self._repositioning_required = True

    def get_current_position(self):
        """Returns the current usable position

        :rtype: tuple[int]

        :raise: CaptionReadSyntaxError
        """
        if not any(self._positions):
            raise CaptionReadSyntaxError(
                'No Preamble Address Code [PAC] was provided'
            )
        else:
            return self._positions[0]

    def is_repositioning_required(self):
        """Determines whether the current positioning has changed non-trivially

        Trivial would mean that a line break should suffice.
        :rtype: bool
        """
        return self._repositioning_required

    def acknowledge_position_changed(self):
        """Acknowledge to the position tracker that the position was changed
        """
        self._repositioning_required = False

    def is_linebreak_required(self):
        """If the current position is simply one line below the previous.
        :rtype: bool
        """
        return self._break_required

    def acknowledge_linebreak_consumed(self):
        """Call to acknowledge that the required line break was consumed
        """
        self._break_required = False


class DefaultProvidingPositionTracker(_PositioningTracker):
    """A _PositioningTracker that provides a default value (14, 0) if needed,
    or uses the last positioning value set anywhere in the document
    """
    default = (14, 0)

    def __init__(self, positioning=None, default=None):
        """
        :type positioning: tuple[int]
        :param positioning: a tuple of ints (row, column)

        :type default: tuple[int]
        :param default: a tuple of ints (row, column) to use as fallback
        """
        super(DefaultProvidingPositionTracker, self).__init__(positioning)

        if default:
            self.default = default

    def get_current_position(self):
        """Returns the currently tracked positioning, the last positioning
        that was set (anywhere), or the default it was initialized with

        :rtype: tuple[int]
        """
        try:
            return (
                super(DefaultProvidingPositionTracker, self).
                get_current_position()
            )
        except CaptionReadSyntaxError:
            return self.default

    def update_positioning(self, positioning):
        """If called, sets this positioning as the default, then delegates
        to the super class.

        :param positioning: a tuple of ints (row, col)
        :type positioning: tuple[int]
        """
        if positioning:
            self.default = positioning

        super(DefaultProvidingPositionTracker, self).update_positioning(
            positioning)
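A short usage sketch of the tracker above. It assumes utils/modules is on sys.path so the package imports as pycaption; the calls themselves only use the methods shown in this file.

# Sketch only: exercise the positioning tracker's public methods.
from pycaption.scc.state_machines import DefaultProvidingPositionTracker

tracker = DefaultProvidingPositionTracker()
print(tracker.get_current_position())       # (14, 0) - the built-in default

tracker.update_positioning((5, 10))         # first PAC seen
tracker.update_positioning((6, 10))         # one row down: a plain line break
print(tracker.is_linebreak_required())      # True
tracker.acknowledge_linebreak_consumed()

tracker.update_positioning((12, 0))         # a jump: real repositioning
print(tracker.is_repositioning_required())  # True
tracker.acknowledge_position_changed()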
155
utils/modules/pycaption/srt.py
Normal file
@@ -0,0 +1,155 @@
from copy import deepcopy
import six

from .base import (
    BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode)
from .exceptions import CaptionReadNoCaptions, InvalidInputError


class SRTReader(BaseReader):
    def detect(self, content):
        lines = content.splitlines()
        if lines[0].isdigit() and '-->' in lines[1]:
            return True
        else:
            return False

    def read(self, content, lang='en-US'):
        if type(content) != six.text_type:
            raise InvalidInputError('The content is not a unicode string.')

        lines = content.splitlines()
        start_line = 0
        captions = CaptionList()

        while start_line < len(lines):
            if not lines[start_line].isdigit():
                break

            end_line = self._find_text_line(start_line, lines)

            timing = lines[start_line + 1].split('-->')
            start = self._srttomicro(timing[0].strip(' \r\n'))
            end = self._srttomicro(timing[1].strip(' \r\n'))

            nodes = []

            for line in lines[start_line + 2:end_line - 1]:
                # skip extra blank lines
                if not nodes or line != '':
                    nodes.append(CaptionNode.create_text(line))
                    nodes.append(CaptionNode.create_break())

            if len(nodes):
                # remove last line break from end of caption list
                nodes.pop()
                caption = Caption(start, end, nodes)
                captions.append(caption)

            start_line = end_line

        caption_set = CaptionSet({lang: captions})

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    def _srttomicro(self, stamp):
        timesplit = stamp.split(':')
        if ',' not in timesplit[2]:
            timesplit[2] += ',000'
        secsplit = timesplit[2].split(',')
        microseconds = (int(timesplit[0]) * 3600000000 +
                        int(timesplit[1]) * 60000000 +
                        int(secsplit[0]) * 1000000 +
                        int(secsplit[1]) * 1000)

        return microseconds

    def _find_text_line(self, start_line, lines):
        end_line = start_line

        found = False
        while end_line < len(lines):
            if lines[end_line].strip() == "":
                found = True
            elif found is True:
                end_line -= 1
                break
            end_line += 1

        return end_line + 1


class SRTWriter(BaseWriter):
    def write(self, caption_set):
        caption_set = deepcopy(caption_set)

        srt_captions = []

        for lang in caption_set.get_languages():
            srt_captions.append(
                self._recreate_lang(caption_set.get_captions(lang))
            )

        caption_content = 'MULTI-LANGUAGE SRT\n'.join(srt_captions)
        return caption_content

    def _recreate_lang(self, captions):

        # Merge captions that are on the exact same timestamp, otherwise some
        # players will play them in reversed order; libass specifically, which
        # is used quite a lot, including by VLC and MPV.
        # Fixes #189 - https://github.com/pbs/pycaption/issues/189
        new_captions = []
        i = 0
        while len(captions) > i:
            # if there's a caption after this, and they have the same timestamps
            if len(captions) > i+1 and captions[i].start == captions[i+1].start and captions[i].end == captions[i+1].end:
                # merge them together as a new caption
                new_caption = Caption(start=captions[i].start, end=captions[i].end, nodes=captions[i].nodes + captions[i+1].nodes)
                # delete the caption after this as we merged them to the current one
                del captions[i]
            else:
                # don't do anything different
                new_caption = captions[i]
            # add final caption to new list
            new_captions.append(new_caption)
            # increment index
            i += 1
        captions = new_captions

        srt = ''
        count = 1

        for caption in captions:
            srt += '%s\n' % count

            start = caption.format_start(msec_separator=',')
            end = caption.format_end(msec_separator=',')
            timestamp = '%s --> %s\n' % (start[:12], end[:12])

            srt += timestamp.replace('.', ',')

            new_content = ''
            for node in caption.nodes:
                new_content = self._recreate_line(new_content, node)

            # Eliminate excessive line breaks
            new_content = new_content.strip()
            while '\n\n' in new_content:
                new_content = new_content.replace('\n\n', '\n')

            srt += "%s%s" % (new_content, '\n\n')
            count += 1

        return srt[:-1]  # remove unwanted newline at end of file

    def _recreate_line(self, srt, line):
        if line.type_ == CaptionNode.TEXT:
            return srt + '%s ' % line.content
        elif line.type_ == CaptionNode.BREAK:
            return srt + '\n'
        else:
            return srt
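A small round-trip sketch for the reader and writer above (same sys.path assumption: the package imports as pycaption):

# Sketch only: parse a tiny SRT document and serialize it back.
from pycaption.srt import SRTReader, SRTWriter

SRT_SAMPLE = (
    "1\n"
    "00:00:01,000 --> 00:00:03,000\n"
    "Hello, world.\n"
    "\n"
    "2\n"
    "00:00:03,500 --> 00:00:05,000\n"
    "Second cue,\n"
    "on two lines.\n"
)

caption_set = SRTReader().read(SRT_SAMPLE)
captions = caption_set.get_captions('en-US')
print(len(captions))      # 2
print(captions[0].start)  # 1000000 (timestamps are kept in microseconds)

print(SRTWriter().write(caption_set))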
33
utils/modules/pycaption/transcript.py
Normal file
@@ -0,0 +1,33 @@
import os

try:
    import nltk.data
except ImportError:
    raise ImportError('You must install nltk==2.0.4 and numpy==1.7.1 to be able to use this.')
from pycaption.base import BaseWriter, CaptionNode


class TranscriptWriter(BaseWriter):
    def __init__(self, *args, **kw):
        self.nltk = nltk.data.load('file:%s/english.pickle' %
                                   os.path.dirname(__file__))

    def write(self, captions):
        transcripts = []

        for lang in captions.get_languages():
            lang_transcript = '* %s Transcript *\n' % lang.upper()

            for caption in captions.get_captions(lang):
                lang_transcript = self._strip_text(caption.nodes, lang_transcript)

            lang_transcript = '\n'.join(self.nltk.tokenize(lang_transcript))
            transcripts.append(lang_transcript)

        return '\n'.join(transcripts)

    def _strip_text(self, elements, lang_transcript):
        for el in elements:
            if el.type_ == CaptionNode.TEXT:
                lang_transcript += el.content
        return lang_transcript
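Hypothetical usage of the writer above; it assumes nltk 2.0.4 is installed and that an english.pickle sentence tokenizer sits next to transcript.py, exactly as the nltk.data.load() call expects.

# Sketch only: read a tiny SRT cue and dump it as a plain-text transcript,
# one sentence per line after the language header.
from pycaption.srt import SRTReader
from pycaption.transcript import TranscriptWriter

caption_set = SRTReader().read(
    "1\n00:00:01,000 --> 00:00:03,000\nHello there. How are you?\n")
print(TranscriptWriter().write(caption_set))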
10
utils/modules/pycaption/utils.py
Normal file
@@ -0,0 +1,10 @@
def is_leaf(element):
    """
    Return True if the element is a leaf, False otherwise. The element is
    considered a leaf if it is either NavigableString or the "br" tag
    :param element: A BeautifulSoup tag or NavigableString
    """
    name = getattr(element, 'name', None)
    if not name or name == 'br':
        return True
    return False
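A quick sketch of is_leaf in action. It assumes BeautifulSoup 4 is installed; the helper itself only looks at the element's tag name.

# Sketch only: <br> tags and bare strings count as leaves.
from bs4 import BeautifulSoup

from pycaption.utils import is_leaf

soup = BeautifulSoup('<p>Hello<br/>world</p>', 'html.parser')
print(is_leaf(soup.p))              # False - a regular tag
print(is_leaf(soup.p.br))           # True  - the "br" tag is treated as a leaf
print(is_leaf(soup.p.contents[0]))  # True  - a bare string has no tag name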
470
utils/modules/pycaption/webvtt.py
Normal file
@@ -0,0 +1,470 @@
import re
import six
import sys
import datetime
from copy import deepcopy


from .base import (
    BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode
)

from .geometry import Layout

from .exceptions import (
    CaptionReadError, CaptionReadSyntaxError, CaptionReadNoCaptions,
    InvalidInputError
)

# A WebVTT timing line has both start/end times and layout related settings
# (referred to as 'cue settings' in the documentation)
# The following pattern captures [start], [end] and [cue settings] if existent
from pycaption.geometry import HorizontalAlignmentEnum

TIMING_LINE_PATTERN = re.compile('^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$')
TIMESTAMP_PATTERN = re.compile('^(\d+):(\d{2})(:\d{2})?\.(\d{3})')
VOICE_SPAN_PATTERN = re.compile('<v(\\.\\w+)* ([^>]*)>')
OTHER_SPAN_PATTERN = (
    re.compile(
        '</?([cibuv]|ruby|rt|lang|(\d+):(\d{2})(:\d{2})?\.(\d{3})).*?>'
    )
)  # These WebVTT tags are stripped off the cues on conversion

WEBVTT_VERSION_OF = {
    HorizontalAlignmentEnum.LEFT: 'left',
    HorizontalAlignmentEnum.CENTER: 'middle',
    HorizontalAlignmentEnum.RIGHT: 'right',
    HorizontalAlignmentEnum.START: 'start',
    HorizontalAlignmentEnum.END: 'end'
}

DEFAULT_ALIGNMENT = 'middle'


def microseconds(h, m, s, f):
    """
    Returns an integer representing a number of microseconds
    :rtype: int
    """
    return (int(h) * 3600 + int(m) * 60 + int(s)) * 1000000 + int(f) * 1000


class WebVTTReader(BaseReader):
    def __init__(self, ignore_timing_errors=True, *args, **kwargs):
        """
        :param ignore_timing_errors: Whether to ignore timing checks
        """
        self.ignore_timing_errors = ignore_timing_errors

    def detect(self, content):
        return 'WEBVTT' in content

    def read(self, content, lang='en-US'):
        if type(content) != six.text_type:
            raise InvalidInputError('The content is not a unicode string.')

        caption_set = CaptionSet({lang: self._parse(content.splitlines())})

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    def _parse(self, lines):
        captions = CaptionList()
        start = None
        end = None
        nodes = []
        layout_info = None
        found_timing = False

        for i, line in enumerate(lines):

            if '-->' in line:
                found_timing = True
                timing_line = i
                last_start_time = captions[-1].start if captions else 0
                try:
                    start, end, layout_info = self._parse_timing_line(
                        line, last_start_time)
                except CaptionReadError as e:
                    new_message = '%s (line %d)' % (e.args[0], timing_line)
                    six.reraise(type(e), type(e)(new_message), sys.exc_info()[2])

            elif '' == line:
                if found_timing:
                    if not nodes:
                        raise CaptionReadSyntaxError(
                            'Cue without content. (line %d)' % timing_line)
                    else:
                        found_timing = False
                        caption = Caption(
                            start, end, nodes, layout_info=layout_info)
                        captions.append(caption)
                        nodes = []
            else:
                if found_timing:
                    if nodes:
                        nodes.append(CaptionNode.create_break())
                    nodes.append(CaptionNode.create_text(
                        self._decode(line)))
                else:
                    # it's a comment or some metadata; ignore it
                    pass

        # Add a last caption if there are remaining nodes
        if nodes:
            caption = Caption(start, end, nodes, layout_info=layout_info)
            captions.append(caption)

        return captions

    def _remove_styles(self, line):
        partial_result = VOICE_SPAN_PATTERN.sub('\\2: ', line)
        return OTHER_SPAN_PATTERN.sub('', partial_result)

    def _validate_timings(self, start, end, last_start_time):
        if start is None:
            raise CaptionReadSyntaxError(
                'Invalid cue start timestamp.')
        if end is None:
            raise CaptionReadSyntaxError('Invalid cue end timestamp.')
        if start > end:
            raise CaptionReadError(
                'End timestamp is not greater than start timestamp.')
        if start < last_start_time:
            raise CaptionReadError(
                'Start timestamp is not greater than or equal '
                'to start timestamp of previous cue.')

    def _parse_timing_line(self, line, last_start_time):
        """
        :returns: Tuple (int, int, Layout)
        """
        m = TIMING_LINE_PATTERN.search(line)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        start = self._parse_timestamp(m.group(1))
        end = self._parse_timestamp(m.group(2))

        cue_settings = m.group(3)

        if not self.ignore_timing_errors:
            self._validate_timings(start, end, last_start_time)

        layout_info = None
        if cue_settings:
            layout_info = Layout(webvtt_positioning=cue_settings)

        return start, end, layout_info

    def _parse_timestamp(self, timestamp):
        """Returns an integer representing a number of microseconds
        :rtype: int
        """
        m = TIMESTAMP_PATTERN.search(timestamp)
        if not m:
            raise CaptionReadSyntaxError(
                'Invalid timing format.')

        m = m.groups()

        if m[2]:
            # Timestamp takes the form of [hours]:[minutes]:[seconds].[milliseconds]
            return microseconds(m[0], m[1], m[2].replace(":", ""), m[3])
        else:
            # Timestamp takes the form of [minutes]:[seconds].[milliseconds]
            return microseconds(0, m[0], m[1], m[3])

    def _decode(self, s):
        """
        Convert cue text from WebVTT XML-like format to plain unicode.
        :type s: unicode
        """
        s = s.strip()
        # Convert voice span
        s = VOICE_SPAN_PATTERN.sub('\\2: ', s)
        # TODO: Add support for other WebVTT tags. For now just strip them
        # off the text.
        s = OTHER_SPAN_PATTERN.sub('', s)
        # Replace WebVTT special XML codes with plain unicode values
        s = s.replace('&lt;', '<')
        s = s.replace('&gt;', '>')
        s = s.replace('&lrm;', '\u200e')
        s = s.replace('&rlm;', '\u200f')
        s = s.replace('&nbsp;', '\u00a0')
        # Must do ampersand last
        s = s.replace('&amp;', '&')
        return s


class WebVTTWriter(BaseWriter):
    HEADER = 'WEBVTT\n\n'
    global_layout = None
    video_width = None
    video_height = None

    def write(self, caption_set):
        """
        :type caption_set: CaptionSet
        """
        output = self.HEADER

        if caption_set.is_empty():
            return output

        caption_set = deepcopy(caption_set)

        # TODO: styles. These go into a separate CSS file, which doesn't really
        # fit the API here. Figure that out. Though some style stuff can be
        # done in-line. This format is a little bit crazy.

        # WebVTT's language support seems to be a bit crazy, so let's just
        # support a single one for now.
        lang = list(caption_set.get_languages())[0]

        self.global_layout = caption_set.get_layout_info(lang)

        captions = caption_set.get_captions(lang)

        return output + '\n'.join(
            [self._write_caption(caption_set, caption) for caption in captions])

    def _timestamp(self, ts):
        td = datetime.timedelta(microseconds=ts)
        mm, ss = divmod(td.seconds, 60)
        hh, mm = divmod(mm, 60)
        s = "%02d:%02d.%03d" % (mm, ss, td.microseconds/1000)
        if hh:
            s = "%d:%s" % (hh, s)
        return s

    def _tags_for_style(self, style):
        if style == 'italics':
            return ['<i>', '</i>']
        elif style == 'underline':
            return ['<u>', '</u>']
        elif style == 'bold':
            return ['<b>', '</b>']
        else:
            return ['', '']

    def _calculate_resulting_style(self, style, caption_set):
        resulting_style = {}

        style_classes = []
        if 'classes' in style:
            style_classes = style['classes']
        elif 'class' in style:
            style_classes = [style['class']]

        for style_class in style_classes:
            sub_style = caption_set.get_style(style_class).copy()
            # Recursively resolve class attributes and calculate style
            resulting_style.update(self._calculate_resulting_style(sub_style, caption_set))

        resulting_style.update(style)

        return resulting_style

    def _write_caption(self, caption_set, caption):
        """
        :type caption: Caption
        """
        layout_groups = self._layout_groups(caption.nodes, caption_set)

        start = self._timestamp(caption.start)
        end = self._timestamp(caption.end)
        timespan = "{} --> {}".format(start, end)

        output = ''

        cue_style_tags = ['', '']

        style = self._calculate_resulting_style(caption.style, caption_set)
        for key, value in sorted(style.items()):
            if value:
                tags = self._tags_for_style(key)
                # print "tags: " + str(tags) + "\n"
                cue_style_tags[0] += tags[0]
                cue_style_tags[1] = tags[1] + cue_style_tags[1]

        for cue_text, layout in layout_groups:
            if not layout:
                layout = caption.layout_info or self.global_layout
            cue_settings = self._cue_settings_from(layout)
            output += timespan + cue_settings + '\n'
            output += cue_style_tags[0] + cue_text + cue_style_tags[1] + '\n'

        return output

    def _cue_settings_from(self, layout):
        """
        Return WebVTT cue settings string based on layout info
        :type layout: Layout
        :rtype: unicode
        """
        if not layout:
            return ''

        # If it's converting from WebVTT to WebVTT, keep positioning info
        # unchanged
        if layout.webvtt_positioning:
            return ' {}'.format(layout.webvtt_positioning)

        left_offset = None
        top_offset = None
        cue_width = None
        alignment = None

        already_relative = False
        if not self.relativize:
            if layout.is_relative():
                already_relative = True
            else:
                # There are absolute positioning values for this cue but the
                # Writer is explicitly configured not to do any relativization.
                # Ignore all positioning for this cue.
                return ''

        # Ensure that all positioning values are measured using percentage.
        # This may raise an exception if layout.is_relative() == False
        # If you want to avoid it, you have to turn off relativization by
        # initializing this Writer with relativize=False.
        if not already_relative:
            layout = layout.as_percentage_of(
                self.video_width, self.video_height)

        # Ensure that when there's a left offset the caption is not pushed out
        # of the screen. If the execution got this far it means origin and
        # extent are already relative by now.
        if self.fit_to_screen:
            layout = layout.fit_to_screen()

        if layout.origin:
            left_offset = layout.origin.x
            top_offset = layout.origin.y

        if layout.extent:
            cue_width = layout.extent.horizontal

        if layout.padding:
            if layout.padding.start and left_offset:
                # Since there is no padding in WebVTT, the left padding is
                # added to the total left offset (if it is defined and not
                # relative),
                if left_offset:
                    left_offset += layout.padding.start
                # and removed from the total cue width
                if cue_width:
                    cue_width -= layout.padding.start
            # the right padding is cut out of the total cue width,
            if layout.padding.end and cue_width:
                cue_width -= layout.padding.end
            # the top padding is added to the top offset
            # (if it is defined and not relative)
            if layout.padding.before and top_offset:
                top_offset += layout.padding.before
            # and the bottom padding is ignored because the cue box is only as
            # long vertically as the text it contains and nothing can be cut
            # out

        try:
            alignment = WEBVTT_VERSION_OF[layout.alignment.horizontal]
        except (AttributeError, KeyError):
            pass

        cue_settings = ''

        if alignment and alignment != 'middle':
            cue_settings += " align:" + alignment
        if left_offset:
            cue_settings += " position:{},start".format(six.text_type(left_offset))
        if top_offset:
            cue_settings += " line:" + six.text_type(top_offset)
        if cue_width:
            cue_settings += " size:" + six.text_type(cue_width)

        return cue_settings

    def _layout_groups(self, nodes, caption_set):
        """
        Convert a Caption's nodes to WebVTT cue or cues (depending on
        whether they have the same positioning or not).
        """
        if not nodes:
            return []

        current_layout = None

        # A list with layout groups. Since WebVTT only supports positioning
        # for different cues, each layout group has to be represented in a
        # new cue with the same timing but different positioning settings.
        layout_groups = []
        # A properly encoded WebVTT string (plain unicode must be properly
        # escaped before being appended to this string)
        s = ''
        for i, node in enumerate(nodes):
            if node.type_ == CaptionNode.TEXT:
                if s and current_layout and node.layout_info != current_layout:
                    # If the positioning changes from one text node to
                    # another, a new WebVTT cue has to be created.
                    layout_groups.append((s, current_layout))
                    s = ''
                # ATTENTION: This is where the plain unicode node content is
                # finally encoded as WebVTT.
                s += self._encode(node.content) or '&nbsp;'
                current_layout = node.layout_info
            elif node.type_ == CaptionNode.STYLE:
                resulting_style = self._calculate_resulting_style(node.content, caption_set)

                styles = ['italics', 'underline', 'bold']
                if not node.start:
                    styles.reverse()

                for style in styles:
                    if style in resulting_style and resulting_style[style]:
                        tags = self._tags_for_style(style)
                        if node.start:
                            s += tags[0]
                        else:
                            s += tags[1]

                # TODO: Refactor pycaption and eliminate the concept of a
                # "Style node"
            elif node.type_ == CaptionNode.BREAK:
                if i > 0 and nodes[i - 1].type_ != CaptionNode.TEXT:
                    s += '&nbsp;'
                if i == 0:  # cue text starts with a break
                    s += '&nbsp;'
                s += '\n'

        if s:
            layout_groups.append((s, current_layout))
        return layout_groups

    def _encode(self, s):
        """
        Convert cue text from plain unicode to WebVTT XML-like format
        escaping illegal characters. For a list of illegal characters see:
        - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-text-span
        :type s: unicode
        """
        s = s.replace('&', '&amp;')
        s = s.replace('<', '&lt;')

        # The substring "-->" is also not allowed according to this:
        # - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-block
        s = s.replace('-->', '--&gt;')

        # The following characters have escaping codes for some reason, but
        # they're not illegal, so for now I'll leave this commented out so that
        # we stay as close as possible to the specification and avoid doing
        # extra stuff "just to be safe".
        # s = s.replace(u'>', u'&gt;')
        # s = s.replace(u'\u200e', u'&lrm;')
        # s = s.replace(u'\u200f', u'&rlm;')
        # s = s.replace(u'\u00a0', u'&nbsp;')
        return s
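Finally, a round-trip sketch for the WebVTT reader and writer above (same sys.path assumption as the earlier examples):

# Sketch only: parse a minimal WebVTT document and write it back out.
from pycaption.webvtt import WebVTTReader, WebVTTWriter

VTT_SAMPLE = (
    "WEBVTT\n"
    "\n"
    "00:01.000 --> 00:03.000\n"
    "Hello, world.\n"
)

caption_set = WebVTTReader().read(VTT_SAMPLE)
captions = caption_set.get_captions('en-US')
print(captions[0].start)  # 1000000 (microseconds)

print(WebVTTWriter().write(caption_set))
# WEBVTT
#
# 00:01.000 --> 00:03.000
# Hello, world.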