Author: Khan
Date: 2021-09-01 02:57:54 +05:00
Parent: 9df940f1fd
Commit: bf3c3712dd
222 changed files with 1,007,430 additions and 0 deletions


@@ -0,0 +1,34 @@
from .base import (
CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet)
from .dfxp import DFXPWriter, DFXPReader
from .sami import SAMIReader, SAMIWriter
from .srt import SRTReader, SRTWriter
from .scc import SCCReader, SCCWriter
from .webvtt import WebVTTReader, WebVTTWriter
from .exceptions import (
CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError)
__all__ = [
'CaptionConverter', 'DFXPReader', 'DFXPWriter',
'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter',
'SCCReader', 'SCCWriter', 'WebVTTReader', 'WebVTTWriter',
'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError',
'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet'
]
SUPPORTED_READERS = (
DFXPReader, WebVTTReader, SAMIReader, SRTReader, SCCReader)
def detect_format(caps):
"""
Detect the format of the provided caption string.
:returns: the reader class for the detected format.
"""
for reader in SUPPORTED_READERS:
if reader().detect(caps):
return reader
return None
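
A minimal usage sketch for the public API above, assuming the package is importable as pycaption and that the sample string is valid SRT. detect_format returns the first reader class whose detect() accepts the content, and CaptionConverter chains a reader with a writer:

from pycaption import CaptionConverter, SAMIWriter, detect_format

SRT_SAMPLE = """1
00:00:01,000 --> 00:00:03,000
Hello, world!
"""

reader_class = detect_format(SRT_SAMPLE)    # expected to be SRTReader for this input
if reader_class is None:
    raise ValueError('Unsupported caption format')

converter = CaptionConverter()
converter.read(SRT_SAMPLE, reader_class())  # parse into the internal caption model
print(converter.write(SAMIWriter()))        # serialize to another supported format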



@@ -0,0 +1,409 @@
from datetime import timedelta
from numbers import Number
from six import text_type
from .exceptions import CaptionReadError, CaptionReadTimingError
DEFAULT_LANGUAGE_CODE = 'en-US'
def force_byte_string(content):
try:
return content.encode('UTF-8')
except UnicodeEncodeError:
raise RuntimeError('Invalid content encoding')
    except UnicodeDecodeError:
        # Already a byte string (legacy Python 2 path); return it unchanged.
        return content
class CaptionConverter(object):
def __init__(self, captions=None):
self.captions = captions if captions else []
def read(self, content, caption_reader):
try:
self.captions = caption_reader.read(content)
except AttributeError as e:
raise Exception(e)
return self
def write(self, caption_writer):
try:
return caption_writer.write(self.captions)
except AttributeError as e:
raise Exception(e)
class BaseReader(object):
def __init__(self, *args, **kwargs):
pass
def detect(self, content):
if content:
return True
else:
return False
def read(self, content):
return CaptionSet()
class BaseWriter(object):
def __init__(self, relativize=True, video_width=None, video_height=None,
fit_to_screen=True):
"""
Initialize writer with the given parameters.
:param relativize: If True (default), converts absolute positioning
values (e.g. px) to percentage. ATTENTION: WebVTT does not support
absolute positioning. If relativize is set to False and it finds
an absolute positioning parameter for a given caption, it will
ignore all positioning for that cue and show it in the default
position.
:param video_width: The width of the video for which the captions being
converted were made. This is necessary for relativization.
:param video_height: The height of the video for which the captions
being converted were made. This is necessary for relativization.
:param fit_to_screen: If extent is not set or
if origin + extent > 100%, (re)calculate it based on origin.
It is a pycaption fix for caption files that are technically valid
            but contain inconsistent settings that may cause long captions to
be cut out of the screen.
"""
self.relativize = relativize
self.video_width = video_width
self.video_height = video_height
self.fit_to_screen = fit_to_screen
def _relativize_and_fit_to_screen(self, layout_info):
if layout_info:
if self.relativize:
# Transform absolute values (e.g. px) into percentages
layout_info = layout_info.as_percentage_of(
self.video_width, self.video_height)
if self.fit_to_screen:
# Make sure origin + extent <= 100%
layout_info = layout_info.fit_to_screen()
return layout_info
def write(self, content):
return content
class Style(object):
def __init__(self):
pass
class CaptionNode(object):
"""
A single node within a caption, representing either
text, a style, or a linebreak.
Rules:
1. All nodes should have the property layout_info set.
The value None means specifically that no positioning information
should be specified. Each reader is to supply its own default
values (if necessary) when reading their respective formats.
"""
TEXT = 1
# When and if this is extended, it might be better to turn it into a
# property of the node, not a type of node itself.
STYLE = 2
BREAK = 3
def __init__(self, type_, layout_info=None):
"""
:type type_: int
:type layout_info: Layout
"""
self.type_ = type_
self.content = None
# Boolean. Marks the beginning/ end of a Style node.
self.start = None
self.layout_info = layout_info
def __repr__(self):
t = self.type_
if t == CaptionNode.TEXT:
return repr(self.content)
elif t == CaptionNode.BREAK:
return repr('BREAK')
elif t == CaptionNode.STYLE:
return repr('STYLE: %s %s' % (self.start, self.content))
else:
raise RuntimeError('Unknown node type: ' + str(t))
@staticmethod
def create_text(text, layout_info=None):
data = CaptionNode(CaptionNode.TEXT, layout_info=layout_info)
data.content = text
return data
@staticmethod
def create_style(start, content, layout_info=None):
data = CaptionNode(CaptionNode.STYLE, layout_info=layout_info)
data.content = content
data.start = start
return data
@staticmethod
def create_break(layout_info=None):
return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
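
An illustrative sketch of how the three factories are combined into a node list; this sequence roughly corresponds to the markup "<i>Hello</i><br/>world" (import path assumed):

from pycaption import CaptionNode

nodes = [
    CaptionNode.create_style(True, {'italics': True}),   # style opens (start=True)
    CaptionNode.create_text('Hello'),
    CaptionNode.create_style(False, {'italics': True}),  # style closes (start=False)
    CaptionNode.create_break(),
    CaptionNode.create_text('world'),
]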
class Caption(object):
"""
A single caption, including the time and styling information
for its display.
"""
def __init__(self, start, end, nodes, style={}, layout_info=None):
"""
Initialize the Caption object
:param start: The start time in microseconds
:type start: Number
:param end: The end time in microseconds
:type end: Number
:param nodes: A list of CaptionNodes
:type nodes: list
:param style: A dictionary with CSS-like styling rules
:type style: dict
:param layout_info: A Layout object with the necessary positioning
information
:type layout_info: Layout
"""
if not isinstance(start, Number):
raise CaptionReadTimingError("Captions must be initialized with a"
" valid start time")
if not isinstance(end, Number):
raise CaptionReadTimingError("Captions must be initialized with a"
" valid end time")
if not nodes:
raise CaptionReadError("Node list cannot be empty")
self.start = start
self.end = end
self.nodes = nodes
self.style = style
self.layout_info = layout_info
def is_empty(self):
return len(self.nodes) == 0
def format_start(self, msec_separator=None):
"""
Format the start time value in milliseconds into a string
value suitable for some of the supported output formats (ex.
SRT, DFXP).
"""
return self._format_timestamp(self.start, msec_separator)
def format_end(self, msec_separator=None):
"""
Format the end time value in milliseconds into a string value suitable
for some of the supported output formats (ex. SRT, DFXP).
"""
return self._format_timestamp(self.end, msec_separator)
def __repr__(self):
return repr(
'{start} --> {end}\n{text}'.format(
start=self.format_start(),
end=self.format_end(),
text=self.get_text()
)
)
def get_text(self):
"""
Get the text of the caption.
"""
def get_text_for_node(node):
if node.type_ == CaptionNode.TEXT:
return node.content
if node.type_ == CaptionNode.BREAK:
return '\n'
return ''
text_nodes = [get_text_for_node(node) for node in self.nodes]
return ''.join(text_nodes).strip()
def _format_timestamp(self, value, msec_separator=None):
datetime_value = timedelta(milliseconds=(int(value / 1000)))
str_value = text_type(datetime_value)[:11]
if not datetime_value.microseconds:
str_value += '.000'
if msec_separator is not None:
str_value = str_value.replace(".", msec_separator)
return '0' + str_value
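
A short worked example of the helpers above. Times are microseconds, and the formatted timestamp is zero-padded to hours with millisecond precision (import path assumed):

from pycaption import Caption, CaptionNode

nodes = [CaptionNode.create_text('Hello'),
         CaptionNode.create_break(),
         CaptionNode.create_text('world')]
caption = Caption(3_661_500_000, 3_663_000_000, nodes)  # 01:01:01.5 -> 01:01:03

caption.get_text()         # 'Hello\nworld'
caption.format_start()     # '01:01:01.500'
caption.format_start(',')  # '01:01:01,500' (SRT-style millisecond separator)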
class CaptionList(list):
""" A list of captions with a layout object attached to it """
def __init__(self, iterable=None, layout_info=None):
"""
:param iterable: An iterator used to populate the caption list
:param Layout layout_info: A Layout object with the positioning info
"""
self.layout_info = layout_info
args = [iterable] if iterable else []
super(CaptionList, self).__init__(*args)
def __getslice__(self, i, j):
return CaptionList(
list.__getslice__(self, i, j), layout_info=self.layout_info)
def __getitem__(self, y):
item = list.__getitem__(self, y)
if isinstance(item, Caption):
return item
return CaptionList(item, layout_info=self.layout_info)
def __add__(self, other):
add_is_safe = (
not hasattr(other, 'layout_info') or
not other.layout_info or
self.layout_info == other.layout_info
)
if add_is_safe:
return CaptionList(
list.__add__(self, other), layout_info=self.layout_info)
else:
raise ValueError(
"Cannot add CaptionList objects with different layout_info")
def __mul__(self, other):
return CaptionList(
list.__mul__(self, other), layout_info=self.layout_info)
__rmul__ = __mul__
class CaptionSet(object):
"""
A set of captions in potentially multiple languages,
all representing the same underlying content.
    The .layout_info attribute keeps information that should be inherited
by all the children.
"""
def __init__(self, captions, styles={}, layout_info=None):
"""
:param captions: A dictionary of the format {'language': CaptionList}
:param styles: A dictionary with CSS-like styling rules
:param Layout layout_info: A Layout object with the positioning info
"""
self._captions = captions
self._styles = styles
self.layout_info = layout_info
def set_captions(self, lang, captions):
self._captions[lang] = captions
def get_languages(self):
return list(self._captions.keys())
def get_captions(self, lang):
return self._captions.get(lang, [])
def add_style(self, selector, rules):
"""
:param selector: The selector indicating the elements to which the
rules should be applied.
:param rules: A dictionary with CSS-like styling rules.
"""
self._styles[selector] = rules
def get_style(self, selector):
"""
Returns a dictionary with CSS-like styling rules for a given selector.
:param selector: The selector whose rules should be returned (e.g. an
element or class name).
"""
return self._styles.get(selector, {})
def get_styles(self):
return sorted(self._styles.items())
def set_styles(self, styles):
self._styles = styles
def is_empty(self):
return all(
[len(captions) == 0 for captions in list(self._captions.values())]
)
def set_layout_info(self, lang, layout_info):
self._captions[lang].layout_info = layout_info
def get_layout_info(self, lang):
caption_list = self._captions.get(lang)
if caption_list:
return caption_list.layout_info
return None
def adjust_caption_timing(self, offset=0, rate_skew=1.0):
"""
Adjust the timing according to offset and rate_skew.
Skew is applied first, then offset.
e.g. if skew == 1.1, and offset is 5, a caption originally
displayed from 10-11 seconds would instead be at 16-17.1
"""
for lang in self.get_languages():
captions = self.get_captions(lang)
out_captions = CaptionList()
for caption in captions:
caption.start = caption.start * rate_skew + offset
caption.end = caption.end * rate_skew + offset
if caption.start >= 0:
out_captions.append(caption)
self.set_captions(lang, out_captions)
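
A sketch of the adjustment described in the docstring above, restated in microseconds, the unit captions actually store; the skew is applied before the offset:

from pycaption import Caption, CaptionList, CaptionNode, CaptionSet

caption = Caption(10_000_000, 11_000_000, [CaptionNode.create_text('Hi')])  # 10s-11s
caption_set = CaptionSet({'en-US': CaptionList([caption])})

caption_set.adjust_caption_timing(offset=5_000_000, rate_skew=1.1)
adjusted = caption_set.get_captions('en-US')[0]
# adjusted.start is ~16_000_000 (16s) and adjusted.end is ~17_100_000 (17.1s)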
# Functions
def merge_concurrent_captions(caption_set):
"""Merge captions that have the same start and end times"""
for lang in caption_set.get_languages():
captions = caption_set.get_captions(lang)
last_caption = None
concurrent_captions = CaptionList()
merged_captions = CaptionList()
for caption in captions:
if last_caption:
last_timespan = last_caption.start, last_caption.end
current_timespan = caption.start, caption.end
if current_timespan == last_timespan:
concurrent_captions.append(caption)
last_caption = caption
continue
else:
merged_captions.append(merge(concurrent_captions))
concurrent_captions = [caption]
last_caption = caption
if concurrent_captions:
merged_captions.append(merge(concurrent_captions))
if merged_captions:
caption_set.set_captions(lang, merged_captions)
return caption_set
def merge(captions):
"""
Merge list of captions into one caption. The start/end times from the first
caption are kept.
"""
new_nodes = []
for caption in captions:
if new_nodes:
new_nodes.append(CaptionNode.create_break())
for node in caption.nodes:
new_nodes.append(node)
caption = Caption(
captions[0].start, captions[0].end, new_nodes, captions[0].style)
return caption
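
A sketch of the intended effect of merge_concurrent_captions: two captions sharing the same start and end collapse into one caption whose node lists are joined by a break node (module path assumed as pycaption.base):

from pycaption import Caption, CaptionList, CaptionNode, CaptionSet
from pycaption.base import merge_concurrent_captions  # assumed module path

first = Caption(1_000_000, 2_000_000, [CaptionNode.create_text('Hello')])
second = Caption(1_000_000, 2_000_000, [CaptionNode.create_text('world')])
caption_set = CaptionSet({'en-US': CaptionList([first, second])})

merged = merge_concurrent_captions(caption_set).get_captions('en-US')
# Expected: len(merged) == 1 and merged[0].get_text() == 'Hello\nworld'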


@@ -0,0 +1,2 @@
from .base import *
from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter

File diff suppressed because it is too large.


@@ -0,0 +1,248 @@
# We thought about making pycaption.base objects immutable. This would be nice
# in a lot of cases, but since the transformations on them could be quite
# complex, the deepcopy method is good enough sometimes.
from copy import deepcopy
from .base import DFXPWriter, DFXP_DEFAULT_REGION
from ..base import BaseWriter, CaptionNode, merge_concurrent_captions
from xml.sax.saxutils import escape
from bs4 import BeautifulSoup
LEGACY_DFXP_BASE_MARKUP = '''
<tt xmlns="http://www.w3.org/ns/ttml"
xmlns:tts="http://www.w3.org/ns/ttml#styling">
<head>
<styling/>
<layout/>
</head>
<body/>
</tt>
'''
LEGACY_DFXP_DEFAULT_STYLE = {
'color': 'white',
'font-family': 'monospace',
'font-size': '1c',
}
LEGACY_DFXP_DEFAULT_STYLE_ID = 'default'
LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom'
LEGACY_DFXP_DEFAULT_REGION = {
'text-align': 'center',
'display-align': 'after'
}
class SinglePositioningDFXPWriter(DFXPWriter):
"""A dfxp writer, that ignores all positioning, using a single provided value
"""
def __init__(self, default_positioning=DFXP_DEFAULT_REGION,
*args, **kwargs):
super(SinglePositioningDFXPWriter, self).__init__(*args, **kwargs)
self.default_positioning = default_positioning
def write(self, captions_set, force=''):
"""Writes a DFXP file using the positioning provided in the initializer
:type captions_set: pycaption.base.CaptionSet
:param force: only write this language, if available in the CaptionSet
:rtype: unicode
"""
captions_set = self._create_single_positioning_caption_set(
captions_set, self.default_positioning)
return super(SinglePositioningDFXPWriter, self).write(captions_set, force) # noqa
@staticmethod
def _create_single_positioning_caption_set(caption_set, positioning):
"""Return a caption where all the positioning information was
replaced from positioning
:type caption_set: pycaption.base.CaptionSet
:rtype: pycaption.base.CaptionSet
"""
# If SinglePositioningDFXPWriter would modify the state of the caption
# set, any writer using the same caption_set thereafter would be
# affected. At the moment we know we don't use any other writers, but
# this is important and mustn't be neglected
caption_set = deepcopy(caption_set)
caption_set = merge_concurrent_captions(caption_set)
caption_set.layout_info = positioning
for lang in caption_set.get_languages():
caption_set.set_layout_info(lang, positioning)
caption_list = caption_set.get_captions(lang)
for caption in caption_list:
caption.layout_info = positioning
for node in caption.nodes:
if hasattr(node, 'layout_info'):
node.layout_info = positioning
for _, style in caption_set.get_styles():
if 'text-align' in style:
style.pop('text-align')
return caption_set
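
A hedged usage sketch for the writer above: every cue in the caption set is forced into a single region before delegating to the regular DFXPWriter. The package name and the SRT sample are assumptions; DFXP_DEFAULT_REGION is defined in the dfxp base module, whose diff is not shown here:

from pycaption import SRTReader
from pycaption.dfxp import SinglePositioningDFXPWriter

srt_text = """1
00:00:01,000 --> 00:00:03,000
Hello, world!
"""

caption_set = SRTReader().read(srt_text)
writer = SinglePositioningDFXPWriter()  # defaults to DFXP_DEFAULT_REGION
print(writer.write(caption_set))        # every cue ends up in the same region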
class LegacyDFXPWriter(BaseWriter):
"""Ported the legacy DFXPWriter from 0.4.5"""
def __init__(self, *args, **kw):
self.p_style = False
self.open_span = False
def write(self, caption_set, force=''):
caption_set = deepcopy(caption_set)
caption_set = merge_concurrent_captions(caption_set)
dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml')
dfxp.find('tt')['xml:lang'] = "en"
for style_id, style in caption_set.get_styles():
if style != {}:
dfxp = self._recreate_styling_tag(style_id, style, dfxp)
if not caption_set.get_styles():
dfxp = self._recreate_styling_tag(
LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp)
# XXX For now we will always use this default region. In the future if
# regions are provided, they will be kept
dfxp = self._recreate_region_tag(
LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp)
body = dfxp.find('body')
if force:
langs = [self._force_language(force, caption_set.get_languages())]
else:
langs = caption_set.get_languages()
for lang in langs:
div = dfxp.new_tag('div')
div['xml:lang'] = '%s' % lang
for caption in caption_set.get_captions(lang):
if caption.style:
caption_style = caption.style
caption_style.update({'region': LEGACY_DFXP_DEFAULT_REGION_ID})
else:
caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID,
'region': LEGACY_DFXP_DEFAULT_REGION_ID}
p = self._recreate_p_tag(caption, caption_style, dfxp)
div.append(p)
body.append(div)
caption_content = dfxp.prettify(formatter=None)
return caption_content
# force the DFXP to only have one language, trying to match on "force"
def _force_language(self, force, langs):
for lang in langs:
if force == lang:
return lang
return langs[-1]
def _recreate_region_tag(self, region_id, styling, dfxp):
dfxp_region = dfxp.new_tag('region')
dfxp_region.attrs.update({'xml:id': region_id})
attributes = self._recreate_style(styling, dfxp)
dfxp_region.attrs.update(attributes)
new_tag = dfxp.new_tag('region')
new_tag.attrs.update({'xml:id': region_id})
if dfxp_region != new_tag:
dfxp.find('layout').append(dfxp_region)
return dfxp
def _recreate_styling_tag(self, style, content, dfxp):
dfxp_style = dfxp.new_tag('style')
dfxp_style.attrs.update({'xml:id': style})
attributes = self._recreate_style(content, dfxp)
dfxp_style.attrs.update(attributes)
new_tag = dfxp.new_tag('style')
new_tag.attrs.update({'xml:id': style})
if dfxp_style != new_tag:
dfxp.find('styling').append(dfxp_style)
return dfxp
def _recreate_p_tag(self, caption, caption_style, dfxp):
start = caption.format_start()
end = caption.format_end()
p = dfxp.new_tag("p", begin=start, end=end)
p.string = self._recreate_text(caption, dfxp)
if dfxp.find("style", {"xml:id": "p"}):
p['style'] = 'p'
p.attrs.update(self._recreate_style(caption_style, dfxp))
return p
def _recreate_text(self, caption, dfxp):
line = ''
for node in caption.nodes:
if node.type_ == CaptionNode.TEXT:
line += escape(node.content) + ' '
elif node.type_ == CaptionNode.BREAK:
line = line.rstrip() + '<br/>\n '
elif node.type_ == CaptionNode.STYLE:
line = self._recreate_span(line, node, dfxp)
return line.rstrip()
def _recreate_span(self, line, node, dfxp):
if node.start:
styles = ''
content_with_style = self._recreate_style(node.content, dfxp)
for style, value in list(content_with_style.items()):
styles += ' %s="%s"' % (style, value)
if styles:
if self.open_span:
line = line.rstrip() + '</span> '
line += '<span%s>' % styles
self.open_span = True
elif self.open_span:
line = line.rstrip() + '</span> '
self.open_span = False
return line
def _recreate_style(self, content, dfxp):
dfxp_style = {}
if 'region' in content:
if dfxp.find('region', {'xml:id': content['region']}):
dfxp_style['region'] = content['region']
if 'class' in content:
if dfxp.find("style", {"xml:id": content['class']}):
dfxp_style['style'] = content['class']
if 'text-align' in content:
dfxp_style['tts:textAlign'] = content['text-align']
if 'italics' in content:
dfxp_style['tts:fontStyle'] = 'italic'
if 'font-family' in content:
dfxp_style['tts:fontFamily'] = content['font-family']
if 'font-size' in content:
dfxp_style['tts:fontSize'] = content['font-size']
if 'color' in content:
dfxp_style['tts:color'] = content['color']
if 'display-align' in content:
dfxp_style['tts:displayAlign'] = content['display-align']
return dfxp_style
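
An illustrative call showing how _recreate_style maps the internal CSS-like keys onto TTML attributes; passing None for the dfxp soup is acceptable here because no 'region' or 'class' lookup is involved (a sketch against a private helper, not public API):

from pycaption.dfxp import LegacyDFXPWriter

writer = LegacyDFXPWriter()
writer._recreate_style(
    {'text-align': 'center', 'italics': True, 'color': 'white'}, None)
# -> {'tts:textAlign': 'center', 'tts:fontStyle': 'italic', 'tts:color': 'white'}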

File diff suppressed because it is too large.


@@ -0,0 +1,40 @@
class CaptionReadError(Exception):
"""
Generic error raised when the reading of the caption file failed.
"""
def __str__(self):
return "%s(%s)" % (self.__class__.__name__, self.args)
class CaptionReadNoCaptions(CaptionReadError):
"""
    Error raised when the provided caption file did not contain any
    actual captions.
"""
class CaptionReadSyntaxError(CaptionReadError):
"""
Error raised when the provided caption file has syntax errors and could
not be parsed.
"""
class CaptionReadTimingError(CaptionReadError):
"""
Error raised when a Caption is initialized with invalid timings.
"""
class RelativizationError(Exception):
"""
Error raised when absolute positioning cannot be converted to
percentage
"""
class InvalidInputError(RuntimeError):
""" Error raised when the input is invalid (i.e. a unicode string)
"""


@@ -0,0 +1,916 @@
"""
This module implements the classes used to represent positioning information.
CONVENTIONS:
* None of the methods should modify the state of the objects on which they're
called. If the values of an object need to be recalculated, the method
responsible for the recalculation should return a new object with the
necessary modifications.
"""
import six
from enum import Enum
from .exceptions import RelativizationError
class UnitEnum(Enum):
"""Enumeration-like object, specifying the units of measure for length
Usage:
unit = UnitEnum.PIXEL
unit = UnitEnum.EM
if unit == UnitEnum.CELL :
...
"""
PIXEL = 'px'
EM = 'em'
PERCENT = '%'
CELL = 'c'
PT = 'pt'
class VerticalAlignmentEnum(Enum):
"""Enumeration object, specifying the allowed vertical alignment options
Usage:
alignment = VerticalAlignmentEnum.TOP
if alignment == VerticalAlignmentEnum.BOTTOM:
...
"""
TOP = 'top'
CENTER = 'center'
BOTTOM = 'bottom'
class HorizontalAlignmentEnum(Enum):
"""Enumeration object specifying the horizontal alignment preferences
"""
LEFT = 'left'
CENTER = 'center'
RIGHT = 'right'
START = 'start'
END = 'end'
class Alignment(object):
def __init__(self, horizontal, vertical):
"""
:type horizontal: HorizontalAlignmentEnum
:param horizontal: HorizontalAlignmentEnum member
:type vertical: VerticalAlignmentEnum
:param vertical: VerticalAlignmentEnum member
"""
self.horizontal = horizontal
self.vertical = vertical
def __hash__(self):
return hash(
hash(self.horizontal) * 83 +
hash(self.vertical) * 89 +
97
)
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.horizontal == other.horizontal and
self.vertical == other.vertical
)
def __repr__(self):
return "<Alignment ({horizontal} {vertical})>".format(
horizontal=self.horizontal, vertical=self.vertical
)
def serialized(self):
"""Returns a tuple of the useful information regarding this object
"""
return self.horizontal, self.vertical
@classmethod
def from_horizontal_and_vertical_align(cls, text_align=None,
display_align=None):
horizontal_obj = None
vertical_obj = None
if text_align == 'left':
horizontal_obj = HorizontalAlignmentEnum.LEFT
if text_align == 'start':
horizontal_obj = HorizontalAlignmentEnum.START
if text_align == 'center':
horizontal_obj = HorizontalAlignmentEnum.CENTER
if text_align == 'right':
horizontal_obj = HorizontalAlignmentEnum.RIGHT
if text_align == 'end':
horizontal_obj = HorizontalAlignmentEnum.END
if display_align == 'before':
vertical_obj = VerticalAlignmentEnum.TOP
if display_align == 'center':
vertical_obj = VerticalAlignmentEnum.CENTER
if display_align == 'after':
vertical_obj = VerticalAlignmentEnum.BOTTOM
if not any([horizontal_obj, vertical_obj]):
return None
return cls(horizontal_obj, vertical_obj)
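
A small sketch of the factory above, mapping DFXP-style textAlign/displayAlign values onto the enums (module path assumed as pycaption.geometry):

from pycaption.geometry import (
    Alignment, HorizontalAlignmentEnum, VerticalAlignmentEnum)

alignment = Alignment.from_horizontal_and_vertical_align(
    text_align='center', display_align='after')
# alignment.horizontal is HorizontalAlignmentEnum.CENTER
# alignment.vertical is VerticalAlignmentEnum.BOTTOM

Alignment.from_horizontal_and_vertical_align()  # neither value given -> returns None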
class TwoDimensionalObject(object):
"""Adds a couple useful methods to its subclasses, nothing fancy.
"""
@classmethod
# TODO - highly cachable. Should use WeakValueDictionary here to return
# flyweights, not new objects.
def from_xml_attribute(cls, attribute):
"""Instantiate the class from a value of the type "4px" or "5%"
or any number concatenated with a measuring unit (member of UnitEnum)
:type attribute: unicode
"""
horizontal, vertical = six.text_type(attribute).split(' ')
horizontal = Size.from_string(horizontal)
vertical = Size.from_string(vertical)
return cls(horizontal, vertical)
class Stretch(TwoDimensionalObject):
"""Used for specifying the extent of a rectangle (how much it stretches),
or the padding in a rectangle (how much space should be left empty until
text can be displayed)
"""
def __init__(self, horizontal, vertical):
"""Use the .from_xxx methods. They know what's best for you.
:type horizontal: Size
:type vertical: Size
"""
for parameter in [horizontal, vertical]:
if not isinstance(parameter, Size):
raise ValueError("Stretch must be initialized with two valid "
"Size objects.")
self.horizontal = horizontal
self.vertical = vertical
def is_measured_in(self, measure_unit):
"""Whether the stretch is only measured in the provided units
:param measure_unit: a UnitEnum member
:return: True/False
"""
return (
self.horizontal.unit == measure_unit and
self.vertical.unit == measure_unit
)
def __repr__(self):
return '<Stretch ({horizontal}, {vertical})>'.format(
horizontal=self.horizontal, vertical=self.vertical
)
def serialized(self):
"""Returns a tuple of the useful attributes of this object"""
return (
None if not self.horizontal else self.horizontal.serialized(),
None if not self.vertical else self.vertical.serialized()
)
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.horizontal == other.horizontal and
self.vertical == other.vertical
)
def __hash__(self):
return hash(
hash(self.horizontal) * 59 +
hash(self.vertical) * 61 +
67
)
def __bool__(self):
return True if self.horizontal or self.vertical else False
def to_xml_attribute(self, **kwargs):
"""Returns a unicode representation of this object as an xml attribute
"""
return '{horizontal} {vertical}'.format(
horizontal=self.horizontal.to_xml_attribute(),
vertical=self.vertical.to_xml_attribute()
)
def is_relative(self):
"""
Returns True if all dimensions are expressed as percentages,
False otherwise.
"""
is_relative = True
if self.horizontal:
is_relative &= self.horizontal.is_relative()
if self.vertical:
is_relative &= self.vertical.is_relative()
return is_relative
def as_percentage_of(self, video_width, video_height):
"""
Converts absolute units (e.g. px, pt etc) to percentage
"""
return Stretch(
self.horizontal.as_percentage_of(video_width=video_width),
self.vertical.as_percentage_of(video_height=video_height)
)
class Region(object):
"""Represents the spatial coordinates of a rectangle
    Don't instantiate by hand. Use Region.from_points or Region.from_extent.
"""
@classmethod
def from_points(cls, p1, p2):
"""Create a rectangle, knowing 2 points on the plane.
We assume that p1 is in the upper left (closer to the origin)
:param p1: Point instance
:param p2: Point instance
        :return: a Region instance
"""
inst = cls()
inst._p1 = p1
inst._p2 = p2
return inst
@classmethod
def from_extent(cls, extent, origin):
"""Create a rectangle, knowing its upper left origin, and
spatial extension
:type extent: Stretch
:type origin: Point
        :return: a Region instance
"""
inst = cls()
inst._extent = extent
inst._origin = origin
return inst
@property
def extent(self):
"""How wide this rectangle stretches (horizontally and vertically)
"""
if hasattr(self, '_extent'):
return self._extent
else:
return self._p1 - self._p2
@property
def origin(self):
"""Out of its 4 points, returns the one closest to the origin
"""
if hasattr(self, '_origin'):
return self._origin
else:
return Point.align_from_origin(self._p1, self._p2)[0]
upper_left_point = origin
@property
def lower_right_point(self):
"""The point furthest from the origin from the rectangle's 4 points
"""
if hasattr(self, '_p2'):
return Point.align_from_origin(self._p1, self._p2)[1]
else:
return self.origin.add_extent(self.extent)
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.extent == other.extent and
self.origin == other.origin
)
def __hash__(self):
return hash(
hash(self.origin) * 71 +
hash(self.extent) * 73 +
79
)
class Point(TwoDimensionalObject):
"""Represent a point in 2d space.
"""
def __init__(self, x, y):
"""
:type x: Size
:type y: Size
"""
for parameter in [x, y]:
if not isinstance(parameter, Size):
raise ValueError("Point must be initialized with two valid "
"Size objects.")
self.x = x
self.y = y
def __sub__(self, other):
"""Returns an Stretch object, if the other point's units are compatible
"""
return Stretch(abs(self.x - other.x), abs(self.y - other.y))
def add_stretch(self, stretch):
"""Returns another Point instance, whose coordinates are the sum of the
current Point's, and the Stretch instance's.
"""
return Point(self.x + stretch.horizontal, self.y + stretch.vertical)
def is_relative(self):
"""
Returns True if all dimensions are expressed as percentages,
False otherwise.
"""
is_relative = True
if self.x:
is_relative &= self.x.is_relative()
if self.y:
is_relative &= self.y.is_relative()
return is_relative
def as_percentage_of(self, video_width, video_height):
"""
Converts absolute units (e.g. px, pt etc) to percentage
"""
return Point(
self.x.as_percentage_of(video_width=video_width),
self.y.as_percentage_of(video_height=video_height)
)
@classmethod
def align_from_origin(cls, p1, p2):
"""Returns a tuple of 2 points. The first is closest to the origin
on both axes than the second.
If the 2 points fulfill this condition, returns them (ordered), if not,
creates 2 new points.
"""
if p1.x <= p2.x and p1.y <= p2.y:
return p1
if p1.x >= p2.x and p1.y >= p2.y:
return p2
else:
return (Point(min(p1.x, p2.x), min(p1.y, p2.y)),
Point(max(p1.x, p2.x), max(p1.y, p2.y)))
def __repr__(self):
return '<Point ({x}, {y})>'.format(
x=self.x, y=self.y
)
def serialized(self):
"""Returns the "useful" values of this object.
"""
return (
None if not self.x else self.x.serialized(),
None if not self.y else self.y.serialized()
)
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.x == other.x and
self.y == other.y
)
def __hash__(self):
return hash(
hash(self.x) * 51 +
hash(self.y) * 53 +
57
)
def __bool__(self):
return True if self.x or self.y else False
def to_xml_attribute(self, **kwargs):
"""Returns a unicode representation of this object as an xml attribute
"""
return '{x} {y}'.format(
x=self.x.to_xml_attribute(), y=self.y.to_xml_attribute())
@six.python_2_unicode_compatible
class Size(object):
"""Ties together a number with a unit, to represent a size.
Use as value objects! (don't change after creation)
"""
def __init__(self, value, unit):
"""
:param value: A number (float or int will do)
:param unit: A UnitEnum member
"""
if value is None:
raise ValueError("Size must be initialized with a value.")
        if not isinstance(unit, UnitEnum):
raise ValueError("Size must be initialized with a valid unit.")
self.value = float(value)
self.unit = unit
def __sub__(self, other):
if self.unit == other.unit:
return Size(self.value - other.value, self.unit)
else:
raise ValueError("The sizes should have the same measure units.")
def __abs__(self):
return Size(abs(self.value), self.unit)
def __cmp__(self, other):
if self.unit == other.unit:
# python3 does not have cmp
return (self.value > other.value) - (self.value < other.value)
else:
raise ValueError("The sizes should have the same measure units.")
def __lt__(self, other):
return self.value < other.value
def __add__(self, other):
if self.unit == other.unit:
return Size(self.value + other.value, self.unit)
else:
raise ValueError("The sizes should have the same measure units.")
def is_relative(self):
"""
Returns True if value is expressed as percentage, False otherwise.
"""
return self.unit == UnitEnum.PERCENT
def as_percentage_of(self, video_width=None, video_height=None):
"""
:param video_width: An integer representing a width in pixels
:param video_height: An integer representing a height in pixels
"""
value = self.value
unit = self.unit
if unit == UnitEnum.PERCENT:
return self # Nothing to do here
# The input must be valid so that any conversion can be done
if not (video_width or video_height):
raise RelativizationError(
"Either video width or height must be given as a reference")
elif video_width and video_height:
raise RelativizationError(
"Only video width or height can be given as reference")
if unit == UnitEnum.EM:
# TODO: Implement proper conversion of em in function of font-size
# The em unit is relative to the font-size, to which we currently
# have no access. As a workaround, we presume the font-size is 16px,
# which is a common default value but not guaranteed.
value *= 16
unit = UnitEnum.PIXEL
if unit == UnitEnum.PT:
# XXX: we will convert first to "px" and from "px" this will be
# converted to percent. we don't take into consideration the
# font-size
value = value / 72.0 * 96.0
unit = UnitEnum.PIXEL
if unit == UnitEnum.PIXEL:
value = value * 100.0 / (video_width or video_height)
unit = UnitEnum.PERCENT
if unit == UnitEnum.CELL:
# TODO: Implement proper cell resolution
# (w3.org/TR/ttaf1-dfxp/#parameter-attribute-cellResolution)
# For now we will use the default values (32 columns and 15 rows)
cell_reference = 32 if video_width else 15
value = value * 100.0 / cell_reference
unit = UnitEnum.PERCENT
return Size(value, unit)
@classmethod
# TODO - this also looks highly cachable. Should use a WeakValueDict here
# to return flyweights
def from_string(cls, string):
"""Given a string of the form "46px" or "5%" etc., returns the proper
size object
:param string: a number concatenated to any of the UnitEnum members.
:type string: unicode
:rtype: Size
"""
raw_number = string
for unit in list(UnitEnum):
if raw_number.endswith(unit.value):
raw_number = raw_number.rstrip(unit.value)
break
else:
unit = None
if unit is not None:
value = None
try:
value = float(raw_number)
value = int(raw_number)
except ValueError:
pass
if value is None:
raise ValueError(
"""Couldn't recognize the value "{value}" as a number"""
.format(value=raw_number)
)
instance = cls(value, unit)
return instance
else:
raise ValueError(
"The specified value is not valid because its unit "
"is not recognized: {value}. "
"The only supported units are: {supported}"
.format(value=raw_number, supported=', '.join(UnitEnum._member_map_))
)
def __repr__(self):
return '<Size ({value} {unit})>'.format(
value=self.value, unit=self.unit.value
)
def __str__(self):
value = round(self.value, 2)
if value.is_integer():
s = "{}".format(int(value))
else:
s = "{:.2f}".format(value).rstrip('0').rstrip('.')
return "{}{}".format(s, self.unit.value)
def to_xml_attribute(self, **kwargs):
"""Returns a unicode representation of this object, as an xml attribute
"""
return six.text_type(self)
def serialized(self):
"""Returns the "useful" values of this object"""
return self.value, self.unit
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.value == other.value and
self.unit == other.unit
)
def __hash__(self):
return hash(
hash(self.value) * 41 +
hash(self.unit) * 43 +
47
)
def __bool__(self):
return self.unit in UnitEnum and self.value is not None
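
A worked example for Size, following the conversion rules above: from_string attaches a UnitEnum member, and as_percentage_of requires exactly one of the two reference dimensions (module path assumed):

from pycaption.geometry import Size, UnitEnum

size = Size.from_string('32px')   # Size(32.0, UnitEnum.PIXEL)
str(size)                         # '32px'

relative = size.as_percentage_of(video_width=640)
str(relative)                     # '5%' (32 * 100 / 640)

Size.from_string('5%').as_percentage_of(video_width=640)  # already relative, unchanged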
class Padding(object):
"""Represents padding information. Consists of 4 Size objects, representing
padding from (in this order): before (up), after (down), start (left) and
end (right).
    A valid Padding object must always have all paddings set and different
    from None. If this is not true, writers may fail, because they rely on
    this assumption.
"""
def __init__(self, before=None, after=None, start=None, end=None):
"""
:type before: Size
:type after: Size
:type start: Size
:type end: Size
"""
self.before = before # top
self.after = after # bottom
self.start = start # left
self.end = end # right
for attr in ['before', 'after', 'start', 'end']:
# Ensure that a Padding object always explicitly defines all
# four possible paddings
if not isinstance(getattr(self, attr), Size):
# Sets default padding (0%)
setattr(self, attr, Size(0, UnitEnum.PERCENT))
@classmethod
def from_xml_attribute(cls, attribute):
"""As per the docs, the style attribute can contain 1,2,3 or 4 values.
If 1 value: apply to all edges
If 2: first applies to before and after, second to start and end
If 3: first applies to before, second to start and end, third to after
If 4: before, end, after, start;
http://www.w3.org/TR/ttaf1-dfxp/#style-attribute-padding
:param attribute: a string like object, representing a dfxp attr. value
:return: a Padding object
"""
values_list = six.text_type(attribute).split(' ')
sizes = []
for value in values_list:
sizes.append(Size.from_string(value))
if len(sizes) == 1:
return cls(sizes[0], sizes[0], sizes[0], sizes[0])
elif len(sizes) == 2:
return cls(sizes[0], sizes[0], sizes[1], sizes[1])
elif len(sizes) == 3:
return cls(sizes[0], sizes[2], sizes[1], sizes[1])
elif len(sizes) == 4:
return cls(sizes[0], sizes[2], sizes[3], sizes[1])
else:
raise ValueError('The provided value "{value}" could not be '
"parsed into the a padding. Check out "
"http://www.w3.org/TR/ttaf1-dfxp/"
"#style-attribute-padding for the definition "
"and examples".format(value=attribute))
def __repr__(self):
return (
"<Padding (before: {before}, after: {after}, start: {start}, "
"end: {end})>".format(
before=self.before, after=self.after, start=self.start,
end=self.end
)
)
def serialized(self):
"""Returns a tuple containing the useful values of this object
"""
return (
None if not self.before else self.before.serialized(),
None if not self.after else self.after.serialized(),
None if not self.start else self.start.serialized(),
None if not self.end else self.end.serialized()
)
def __eq__(self, other):
return (
other and
type(self) == type(other) and
self.before == other.before and
self.after == other.after and
self.start == other.start and
self.end == other.end
)
def __hash__(self):
return hash(
hash(self.before) * 19 +
hash(self.after) * 23 +
hash(self.start) * 29 +
hash(self.end) * 31 +
37
)
def to_xml_attribute(
self, attribute_order=('before', 'end', 'after', 'start'),
**kwargs):
"""Returns a unicode representation of this object as an xml attribute
TODO - should extend the attribute_order tuple to contain 4 tuples,
so we can reduce the output length to 3, 2 or 1 element.
:type attribute_order: tuple
:param attribute_order: the order that the attributes should be
serialized
"""
try:
string_list = []
for attrib in attribute_order:
if hasattr(self, attrib):
string_list.append(
getattr(self, attrib).to_xml_attribute())
except AttributeError:
# A Padding object with attributes set to None is considered
# invalid. All four possible paddings must be set. If one of them
# is not, this error is raised.
raise ValueError("The attribute order specified is invalid.")
return ' '.join(string_list)
def as_percentage_of(self, video_width, video_height):
return Padding(
self.before.as_percentage_of(video_height=video_height),
self.after.as_percentage_of(video_height=video_height),
self.start.as_percentage_of(video_width=video_width),
self.end.as_percentage_of(video_width=video_width)
)
def is_relative(self):
is_relative = True
if self.before:
is_relative &= self.before.is_relative()
if self.after:
is_relative &= self.after.is_relative()
if self.start:
is_relative &= self.start.is_relative()
if self.end:
is_relative &= self.end.is_relative()
return is_relative
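
A short example of the shorthand expansion implemented by from_xml_attribute, mirroring the rules in its docstring (module path assumed):

from pycaption.geometry import Padding

padding = Padding.from_xml_attribute('10% 5%')
# Two values: the first applies to before/after, the second to start/end,
# so padding.before == padding.after == 10% and padding.start == padding.end == 5%

padding.to_xml_attribute()  # '10% 5% 10% 5%' (order: before, end, after, start)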
class Layout(object):
"""Should encapsulate all the information needed to determine (as correctly
as possible) the layout (positioning) of elements on the screen.
Inheritance of this property, from the CaptionSet to its children is
specific for each caption type.
"""
def __init__(self, origin=None, extent=None, padding=None, alignment=None,
webvtt_positioning=None, inherit_from=None):
"""
:type origin: Point
:param origin: The point on the screen which is the top left vertex
of a rectangular region where the captions should be placed
:type extent: Stretch
:param extent: The width and height of the rectangle where the caption
should be placed on the screen.
:type padding: Padding
:param padding: The padding of the text inside the region described
by the origin and the extent
:type alignment: Alignment
:type webvtt_positioning: unicode
:param webvtt_positioning: A string with the raw WebVTT cue settings.
This is used so that WebVTT positioning isn't lost on conversion
from WebVTT to WebVTT. It is needed only because pycaption
currently doesn't support reading positioning from WebVTT.
:type inherit_from: Layout
:param inherit_from: A Layout with the positioning parameters to be
used if not specified by the positioning arguments,
"""
self.origin = origin
self.extent = extent
self.padding = padding
self.alignment = alignment
self.webvtt_positioning = webvtt_positioning
if inherit_from:
for attr_name in ['origin', 'extent', 'padding', 'alignment']:
attr = getattr(self, attr_name)
if not attr:
setattr(self, attr_name, getattr(inherit_from, attr_name))
def __bool__(self):
return any([
self.origin, self.extent, self.padding, self.alignment,
self.webvtt_positioning
])
def __repr__(self):
return (
"<Layout (origin: {origin}, extent: {extent}, "
"padding: {padding}, alignment: {alignment})>".format(
origin=self.origin, extent=self.extent, padding=self.padding,
alignment=self.alignment
)
)
def serialized(self):
"""Returns nested tuple containing the "useful" values of this object
"""
return (
None if not self.origin else self.origin.serialized(),
None if not self.extent else self.extent.serialized(),
None if not self.padding else self.padding.serialized(),
None if not self.alignment else self.alignment.serialized()
)
def __eq__(self, other):
return (
type(self) == type(other) and
self.origin == other.origin and
self.extent == other.extent and
self.padding == other.padding and
self.alignment == other.alignment
)
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash(
hash(self.origin) * 7
+ hash(self.extent) * 11
+ hash(self.padding) * 13
+ hash(self.alignment) * 5
+ 17
)
def is_relative(self):
"""
Returns True if all positioning values are expressed as percentages,
False otherwise.
"""
is_relative = True
if self.origin:
is_relative &= self.origin.is_relative()
if self.extent:
is_relative &= self.extent.is_relative()
if self.padding:
is_relative &= self.padding.is_relative()
return is_relative
def as_percentage_of(self, video_width, video_height):
params = {'alignment': self.alignment}
# We don't need to preserve webvtt_positioning on Layout
# transformations because, if it is set, the WebVTT writer
# returns as soon as it's found and the transformations are
# never triggered.
for attr_name in ['origin', 'extent', 'padding']:
attr = getattr(self, attr_name)
if attr:
params[attr_name] = attr.as_percentage_of(video_width,
video_height)
return Layout(**params)
def fit_to_screen(self):
"""
If extent is not set or if origin + extent > 100%, (re)calculate it
based on origin. It is a pycaption fix for caption files that are
technically valid but contain inconsistent settings that may cause
long captions to be cut out of the screen.
ATTENTION: This must be called on relativized objects (such as the one
returned by as_percentage_of). All units are presumed to be percentages.
"""
if self.origin:
# Calculated values to be used if replacement is needed
diff_horizontal = Size(100 - self.origin.x.value, UnitEnum.PERCENT)
diff_vertical = Size(100 - self.origin.y.value, UnitEnum.PERCENT)
if not self.extent:
# Extent is not set, use the calculated values
new_extent = Stretch(diff_horizontal, diff_vertical)
else:
# Extent is set but may have inconsistent values,
# e.g. origin="35% 25%" extent="80% 80%", which would cause
# captions to end horizontally at 115% and vertically at 105%,
# which would result in them being cut out of the screen.
# In this case, the horizontal and vertical values are
# corrected so that origin + extent = 100%.
bottom_right = self.origin.add_stretch(self.extent)
found_absolute_unit = False
if bottom_right.x.unit != UnitEnum.PERCENT:
found_absolute_unit = True
                elif bottom_right.y.unit != UnitEnum.PERCENT:
found_absolute_unit = True
if found_absolute_unit:
raise ValueError("Units must be relativized before extent "
"can be calculated based on origin.")
new_horizontal = self.extent.horizontal
new_vertical = self.extent.vertical
# If extent is set but it's inconsistent, replace with
# calculated values
if bottom_right.x.value > 100:
new_horizontal = diff_horizontal
if bottom_right.y.value > 100:
new_vertical = diff_vertical
new_extent = Stretch(new_horizontal, new_vertical)
return Layout(
origin=self.origin,
extent=new_extent,
padding=self.padding,
alignment=self.alignment
# We don't need to preserve webvtt_positioning on Layout
# transformations because, if it is set, the WebVTT writer
# returns as soon as it's found and the transformations are
# never triggered.
)
return self
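
A sketch of the fit_to_screen correction using the inconsistent values mentioned in the comments above: origin 35%/25% with extent 80%/80% would overflow the screen, so the extent is clamped to 65%/75% (module path assumed):

from pycaption.geometry import Layout, Point, Size, Stretch, UnitEnum

def pct(value):
    return Size(value, UnitEnum.PERCENT)

layout = Layout(origin=Point(pct(35), pct(25)),
                extent=Stretch(pct(80), pct(80)))

fixed = layout.fit_to_screen()
# fixed.extent == Stretch(pct(65), pct(75)), i.e. origin + extent == 100% on both axes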


@@ -0,0 +1,805 @@
"""
The classes in this module handle SAMI reading and writing. It supports several
CSS attributes, some of which are handled as positioning settings (and applied
to Layout objects) and others as simple styling (applied to legacy style nodes).
The following attributes are handled as positioning:
'text-align' # Converted to Alignment
'margin-top'
'margin-right'
'margin-bottom'
'margin-left'
OBS:
* Margins are converted to Padding
* Margins defined inline are not supported
TODO: Add support for inline margins
Any other CSS the BeautifulSoup library manages to parse is handled as simple
styling and applied to style nodes. However, apparently only these are actually
used by writers on conversion:
'font-family'
'font-size'
'font-style'
'color'
OBS:
* Other parameters are preserved, but not if they're specified inline.
TODO:
Make this less confusing. Confirm whether these really are the only
supported styling attributes and make it more clear, perhaps by listing
them in constants in the beginning of the file and using them to filter
out unneeded attributes either everywhere in the code or not at all, but
most importantly regardless of whether they're defined inline or not,
because this is irrelevant.
"""
import re
import six
from logging import FATAL
from collections import deque
from copy import deepcopy
from future.backports.html.parser import HTMLParseError
from html.parser import HTMLParser
from html.entities import name2codepoint
from xml.sax.saxutils import escape
from bs4 import BeautifulSoup, NavigableString
from cssutils import parseString, log, css as cssutils_css
from .base import (
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
DEFAULT_LANGUAGE_CODE)
from .exceptions import (
CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError)
from .geometry import Layout, Alignment, Padding, Size
# change cssutils default logging
log.setLevel(FATAL)
SAMI_BASE_MARKUP = '''
<sami>
<head>
<style type="text/css"/>
</head>
<body/>
</sami>'''
class SAMIReader(BaseReader):
def __init__(self, *args, **kw):
super(SAMIReader, self).__init__(*args, **kw)
self.line = []
self.first_alignment = None
def detect(self, content):
if '<sami' in content.lower():
return True
else:
return False
def read(self, content):
if type(content) != six.text_type:
raise InvalidInputError('The content is not a unicode string.')
content, doc_styles, doc_langs = (
self._get_sami_parser_class()().feed(content))
sami_soup = self._get_xml_parser_class()(content)
# Get the global layout that applies to all <p> tags
global_layout = self._build_layout(doc_styles.get('p', {}))
caption_dict = {}
for language in doc_langs:
lang_layout = None
for target, styling in list(doc_styles.items()):
if target not in ['p', 'sync', 'span']:
if styling.get('lang', None) == language:
lang_layout = self._build_layout(
doc_styles.get(target, {}),
inherit_from=global_layout
)
break
lang_layout = lang_layout or global_layout
lang_captions = self._translate_lang(
language, sami_soup, lang_layout)
caption_dict[language] = lang_captions
caption_set = CaptionSet(
caption_dict,
layout_info=global_layout
)
        # Convert styles from CSS to the internal representation.
        # _translate_parsed_style mutates each style dict in place, so
        # doc_styles already contains the translated rules afterwards.
        for style in list(doc_styles.items()):
            self._translate_parsed_style(style[1])
        caption_set.set_styles(doc_styles)
if caption_set.is_empty():
raise CaptionReadNoCaptions("empty caption file")
return caption_set
@staticmethod
def _get_sami_parser_class():
"""Hook method for providing custom SAMIParser classes"""
return SAMIParser
@staticmethod
def _get_xml_parser_class():
"""Hook method for providing a custom XML parser class"""
return BeautifulSoup
def _build_layout(self, styles, inherit_from=None):
"""
:type styles: dict
:param styles: a dictionary with CSS-like styling rules
:type inherit_from: Layout
:param inherit_from: The Layout with values to be used in case the
positioning settings in the styles parameter don't specify
something.
"""
alignment = Alignment.from_horizontal_and_vertical_align(
text_align=styles.get('text-align')
)
return self._get_layout_class()(
origin=None,
extent=None,
padding=self._get_padding(styles),
alignment=alignment,
inherit_from=inherit_from
)
@staticmethod
def _get_layout_class():
"""Hook method for providing a custom Layout class"""
return Layout
def _get_padding(self, styles):
margin_before = self._get_size(styles, 'margin-top')
margin_after = self._get_size(styles, 'margin-bottom')
margin_start = self._get_size(styles, 'margin-left')
margin_end = self._get_size(styles, 'margin-right')
if not any([margin_before, margin_after, margin_start, margin_end]):
return None
return Padding(
before=margin_before, # top
after=margin_after, # bottom
start=margin_start, # left
end=margin_end # right
)
def _get_size(self, styles, style_label):
value_from_style = styles.get(style_label, None)
if not value_from_style:
return None
return Size.from_string(value_from_style)
def _translate_lang(self, language, sami_soup, parent_layout):
"""
For a given language, translate the SAMI XML to internal list of
captions.
:rtype: list
"""
captions = CaptionList(layout_info=parent_layout)
milliseconds = 0
for p in sami_soup.select('p[lang|=%s]' % language):
milliseconds = int(float(p.parent['start']))
start = milliseconds * 1000
end = 0
if captions != [] and captions[-1].end == 0:
captions[-1].end = milliseconds * 1000
if p.get_text().strip():
self.first_alignment = None
styles = self._translate_attrs(p)
layout_info = self._build_layout(styles,
inherit_from=parent_layout)
self.line = []
self._translate_tag(p, layout_info)
caption_layout = self._get_layout_class()(
alignment=self.first_alignment,
inherit_from=layout_info
)
for node in self.line:
node.layout_info = Layout(
alignment=self.first_alignment,
inherit_from=node.layout_info
)
self.first_alignment = None
caption = Caption(start, end, self.line, styles, caption_layout)
captions.append(caption)
if captions and captions[-1].end == 0:
# Arbitrarily make this last 4 seconds. Not ideal...
captions[-1].end = (milliseconds + 4000) * 1000
return captions
def _get_style_name_from_tag(self, tag):
if tag == 'i':
return 'italics'
elif tag == 'b':
return 'bold'
elif tag == 'u':
return 'underline'
else:
raise RuntimeError("Unknown style tag")
def _translate_tag(self, tag, inherit_from=None):
"""
:param inherit_from: A Layout object extracted from an ancestor tag
to be attached to leaf nodes
"""
# convert text
if isinstance(tag, NavigableString):
# BeautifulSoup apparently handles unescaping character codes
# (e.g. &amp;) automatically. The following variable, therefore,
# should contain a plain unicode string.
# strips indentation whitespace only
            pattern = re.compile(r"^(?:[\n\r]+\s*)?(.+)")
result = pattern.search(tag)
if not result:
return
tag_text = result.groups()[0]
self.line.append(CaptionNode.create_text(tag_text, inherit_from))
# convert line breaks
elif tag.name == 'br':
self.line.append(CaptionNode.create_break(inherit_from))
# convert italics, bold, and underline
elif tag.name == 'i' or tag.name == 'b' or tag.name == 'u':
style_name = self._get_style_name_from_tag(tag.name)
self.line.append(
CaptionNode.create_style(True, {style_name: True})
)
# recursively call function for any children elements
for a in tag.contents:
self._translate_tag(a, inherit_from)
self.line.append(
CaptionNode.create_style(False, {style_name: True}))
elif tag.name == 'span':
self._translate_span(tag, inherit_from)
else:
# recursively call function for any children elements
for a in tag.contents:
self._translate_tag(a, inherit_from)
def _translate_span(self, tag, inherit_from=None):
# convert tag attributes
args = self._translate_attrs(tag)
# only include span tag if attributes returned
if args:
layout_info = self._build_layout(args, inherit_from)
# OLD: Create legacy style node
# NEW: But pass new layout object
node = CaptionNode.create_style(True, args, layout_info)
self.line.append(node)
# recursively call function for any children elements
for a in tag.contents:
# NEW: Pass the layout along so that it's eventually attached
# to leaf nodes (e.g. text or break)
self._translate_tag(a, layout_info)
node = CaptionNode.create_style(False, args, layout_info)
self.line.append(node)
else:
for a in tag.contents:
self._translate_tag(a, inherit_from)
def _translate_attrs(self, tag):
attrs = {}
css_attrs = tag.attrs
if 'class' in css_attrs:
attrs['class'] = css_attrs['class'][0].lower()
if 'id' in css_attrs:
attrs['class'] = css_attrs['id'].lower()
if 'style' in css_attrs:
styles = css_attrs['style'].split(';')
attrs.update(self._translate_style(attrs, styles))
return attrs
# convert attributes from inline CSS
def _translate_style(self, attrs, styles):
for style in styles:
style = style.split(':')
if len(style) == 2:
css_property, value = style
else:
continue
if css_property == 'text-align':
self._save_first_alignment(value.strip())
else:
self._translate_css_property(attrs, css_property, value)
return attrs
def _translate_parsed_style(self, styles):
# Keep unknown styles by default
attrs = styles
for css_property in list(styles.keys()):
value = styles[css_property]
self._translate_css_property(attrs, css_property, value)
return attrs
def _translate_css_property(self, attrs, css_property, value):
if css_property == 'font-family':
attrs['font-family'] = value.strip()
elif css_property == 'font-size':
attrs['font-size'] = value.strip()
elif css_property == 'font-style' and value.strip() == 'italic':
attrs['italics'] = True
elif css_property == 'text-decoration' and value.strip() == 'underline':
attrs['underline'] = True
elif css_property == 'font-weight' and value.strip() == 'bold':
attrs['bold'] = True
elif css_property == 'lang':
attrs['lang'] = value.strip()
elif css_property == 'color':
attrs['color'] = value.strip()
def _save_first_alignment(self, align):
"""
Unlike the other inline CSS attributes parsed in _translate_styles, the
'text-align' setting must be applied to a Layout and not to a style
because it affects positioning. This Layout must be assigned to the
Caption object, and not a Node, because it doesn't make sense to have
spans in the same caption with different alignments. Even though the
SAMI format seems to in principle accept it, pycaption normalizes to
something it can make sense of internally and convert to other formats.
If there are multiple elements (span, div, etc) in the same line with
different alignments, only the first alignment is taken into account.
If the root element of the caption (sync's first child) has an inline
text-align, it is preserved and any children alignment is ignored.
:param align: A unicode string representing a CSS text-align value
"""
if not self.first_alignment:
self.first_alignment = Alignment.from_horizontal_and_vertical_align( # noqa
text_align=align
)
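
A minimal reading sketch for SAMIReader. The markup is an assumption of well-formed SAMI input (the SAMIParser it relies on is not shown in this hunk), and times come back in microseconds:

from pycaption import SAMIReader

SAMI_SAMPLE = """<SAMI><HEAD>
<STYLE TYPE="text/css"><!--
.ENUSCC { Name: English; lang: en-US; }
--></STYLE></HEAD><BODY>
<SYNC Start="1000"><P Class="ENUSCC">Hello, world!</P></SYNC>
<SYNC Start="3000"><P Class="ENUSCC">&nbsp;</P></SYNC>
</BODY></SAMI>"""

caption_set = SAMIReader().read(SAMI_SAMPLE)
caption_set.get_languages()                    # e.g. ['en-US']
caption = caption_set.get_captions('en-US')[0]
caption.start, caption.end                     # e.g. (1000000, 3000000), i.e. 1s to 3s
caption.get_text()                             # 'Hello, world!'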
class SAMIWriter(BaseWriter):
def __init__(self, *args, **kwargs):
super(SAMIWriter, self).__init__(*args, **kwargs)
self.open_span = False
self.last_time = None
def write(self, caption_set):
caption_set = deepcopy(caption_set)
sami = BeautifulSoup(SAMI_BASE_MARKUP, "lxml-xml")
caption_set.layout_info = self._relativize_and_fit_to_screen(
caption_set.layout_info)
primary = None
for lang in caption_set.get_languages():
self.last_time = None
if primary is None:
primary = lang
caption_set.set_layout_info(
lang,
self._relativize_and_fit_to_screen(
caption_set.get_layout_info(lang))
)
for caption in caption_set.get_captions(lang):
# Loop through all captions/nodes and apply transformations to
# layout in function of the provided or default settings
caption.layout_info = self._relativize_and_fit_to_screen(
caption.layout_info)
for node in caption.nodes:
node.layout_info = self._relativize_and_fit_to_screen(
node.layout_info)
sami = self._recreate_p_tag(
caption, sami, lang, primary, caption_set)
stylesheet = self._recreate_stylesheet(caption_set)
sami.find('style').append(stylesheet)
a = sami.prettify(formatter=None).split('\n')
caption_content = '\n'.join(a[1:])
return caption_content
def _recreate_p_tag(self, caption, sami, lang, primary, captions):
"""
        Creates a p tag for the given caption, attaches it to the sami object
        and returns it.
:type caption: Caption
:type sami: BeautifulSoup
:type lang: unicode
:type primary: unicode
:type captions: CaptionSet
:rtype: BeautifulSoup
"""
time = caption.start / 1000
if self.last_time and time != self.last_time:
sami = self._recreate_blank_tag(
sami, caption, lang, primary, captions)
self.last_time = caption.end / 1000
sami, sync = self._recreate_sync(sami, lang, primary, time)
p = sami.new_tag("p")
p_style = ''
for attr, value in list(self._recreate_style(caption.style).items()):
p_style += '%s:%s;' % (attr, value)
if p_style:
p['p_style'] = p_style
p['class'] = self._recreate_p_lang(caption, lang, captions)
p.string = self._recreate_text(caption.nodes)
sync.append(p)
return sami
def _recreate_sync(self, sami, lang, primary, time):
"""
Creates a sync tag for a given language and timing (if it doesn't
already exist), attaches it to the sami body and returns the sami
BeautifulSoup object.
:type sami: BeautifulSoup
:type lang: unicode
:type primary: unicode
:type time: int
:rtype: BeautifulSoup
"""
if lang == primary:
sync = sami.new_tag("sync", start="%d" % time)
sami.body.append(sync)
else:
sync = sami.find("sync", start="%d" % time)
if sync is None:
sami, sync = self._find_closest_sync(sami, time)
return sami, sync
def _find_closest_sync(self, sami, time):
sync = sami.new_tag("sync", start="%d" % time)
earlier = sami.find_all("sync", start=lambda x: int(x) < time)
if earlier:
last_sync = earlier[-1]
last_sync.insert_after(sync)
else:
def later_syncs(start):
return int(start) > time
later = sami.find_all("sync", start=later_syncs)
if later:
last_sync = later[0]
last_sync.insert_before(sync)
return sami, sync
def _recreate_blank_tag(self, sami, caption, lang, primary, captions):
sami, sync = self._recreate_sync(sami, lang, primary, self.last_time)
p = sami.new_tag("p")
p['class'] = self._recreate_p_lang(caption, lang, captions)
p.string = '&nbsp;'
sync.append(p)
return sami
def _recreate_p_lang(self, caption, lang, captions):
try:
if 'lang' in captions.get_style(caption.style['class']):
return caption.style['class']
except KeyError:
pass
return lang
def _recreate_stylesheet(self, caption_set):
stylesheet = '<!--'
for attr, value in caption_set.get_styles():
if value != {}:
stylesheet += self._recreate_style_block(
attr, value, caption_set.layout_info)
for lang in caption_set.get_languages():
lang_string = 'lang: {}'.format(lang)
if lang_string not in stylesheet:
stylesheet += self._recreate_style_block(
lang, {'lang': lang}, caption_set.get_layout_info(lang))
return stylesheet + ' -->'
def _recreate_style_block(self, target, rules, layout_info):
"""
:param target: A unicode string representing the target of the styling
rules.
:param rules: A dictionary with CSS-like styling rules.
:param layout_info: A Layout object providing positioning information
to be converted to CSS
"""
if target not in ['p', 'sync', 'span']:
# If it's not a valid SAMI element, then it's a custom class name
selector = '.{}'.format(target)
else:
selector = target
sami_style = '\n {} {{\n '.format(selector)
if layout_info and layout_info.padding:
rules.update({
'margin-top': six.text_type(layout_info.padding.before),
'margin-right': six.text_type(layout_info.padding.end),
'margin-bottom': six.text_type(layout_info.padding.after),
'margin-left': six.text_type(layout_info.padding.start),
})
for attr, value in sorted(self._recreate_style(rules).items()):
sami_style += ' {}: {};\n '.format(attr, value)
return sami_style + '}\n'
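# Illustrative output only (the exact whitespace follows the format strings
# above): calling _recreate_style_block('en-US', {'lang': 'en-US'}, None)
# would append a block roughly like
#
#     .en-US {
#      lang: en-US;
#     }
#
# to the stylesheet comment, while a valid SAMI element such as 'p' is used
# as the selector directly, without the leading dot.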
def _recreate_text(self, caption):
line = ''
for node in caption:
if node.type_ == CaptionNode.TEXT:
line += self._encode(node.content) + ' '
elif node.type_ == CaptionNode.BREAK:
line = line.rstrip() + '<br/>\n '
elif node.type_ == CaptionNode.STYLE:
line = self._recreate_line_style(line, node)
return line.rstrip()
def _recreate_line_style(self, line, node):
if node.start:
if self.open_span:
line = line.rstrip() + '</span> '
line = self._recreate_span(line, node.content)
else:
if self.open_span:
line = line.rstrip() + '</span> '
self.open_span = False
return line
def _recreate_span(self, line, content):
style = ''
klass = ''
if 'class' in content:
klass += ' class="%s"' % content['class']
for attr, value in list(self._recreate_style(content).items()):
style += '%s:%s;' % (attr, value)
if style or klass:
if style:
style = ' style="%s"' % style
line += '<span%s%s>' % (klass, style)
self.open_span = True
return line
def _recreate_style(self, rules):
"""
:param rules: A dictionary with CSS-like styling rules
"""
sami_style = {}
for key, value in list(rules.items()):
# Recreate original CSS rules from internal style
if key == 'italics' and value == True:
sami_style['font-style'] = 'italic'
elif key == 'bold' and value == True:
sami_style['font-weight'] = 'bold'
elif key == 'underline' and value == True:
sami_style['text-decoration'] = 'underline'
else:
sami_style[key] = value
return sami_style
def _encode(self, s):
"""
Encodes a plain unicode string for a SAMI file, escaping special
characters in case they appear in the string.
:type s: unicode
"""
return escape(s)
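# Minimal usage sketch (illustrative only -- it assumes the SAMIReader class
# defined earlier in this module and unicode SAMI markup in sami_text):
#
#     caption_set = SAMIReader().read(sami_text)
#     output = SAMIWriter().write(caption_set)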
class SAMIParser(HTMLParser):
def __init__(self, *args, **kw):
HTMLParser.__init__(self, *args, **kw)
self.sami = ''
self.line = ''
self.styles = {}
self.queue = deque()
self.langs = set()
self.last_element = ''
self.name2codepoint = name2codepoint.copy()
self.name2codepoint['apos'] = 0x0027
self.convert_charrefs = False
def handle_starttag(self, tag, attrs):
"""
Override the parser's handling of starttags
:param tag: unicode string indicating the tag type (e.g. "head" or "p")
:param attrs: list of attribute tuples of type (u'name', u'value')
"""
self.last_element = tag
# treat divs as spans
if tag == 'div':
tag = 'span'
# figure out the caption language of P tags
if tag == 'p':
lang = self._find_lang(attrs)
# if no language detected, set it as the default
lang = lang or DEFAULT_LANGUAGE_CODE
attrs.append(('lang', lang))
self.langs.add(lang)
# clean-up line breaks
if tag == 'br':
self.sami += "<br/>"
# add tag to queue
else:
# if already in queue, first close tags off in LIFO order
while tag in self.queue:
closer = self.queue.pop()
self.sami += "</%s>" % closer
# open new tag in queue
self.queue.append(tag)
# add tag with attributes
for attr, value in attrs:
tag += ' %s="%s"' % (attr.lower(), value)
self.sami += "<%s>" % tag
# override the parser's handling of endtags
def handle_endtag(self, tag):
# treat divs as spans
if tag == 'div':
tag = 'span'
# handle incorrectly formatted sync/p tags
if tag in ['p', 'sync'] and tag == self.last_element:
return
# close off tags in LIFO order, if matching starting tag in queue
while tag in self.queue:
closing_tag = self.queue.pop()
self.sami += "</%s>" % closing_tag
def handle_entityref(self, name):
if name in ['gt', 'lt']:
self.sami += '&%s;' % name
else:
try:
self.sami += chr(self.name2codepoint[name])
except (KeyError, ValueError):
self.sami += '&%s' % name
self.last_element = ''
def handle_charref(self, name):
if name[0] == 'x':
self.sami += chr(int(name[1:], 16))
else:
self.sami += chr(int(name))
# override the parser's handling of data
def handle_data(self, data):
self.sami += data
self.last_element = ''
# override the parser's feed function
def feed(self, data):
"""
:param data: Raw SAMI unicode string
:returns: tuple (unicode, dict, set)
"""
no_cc = 'no closed captioning available'
if '<html' in data.lower():
raise CaptionReadSyntaxError(
'SAMI File seems to be an HTML file.')
elif no_cc in data.lower():
raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)
# try to find style tag in SAMI
try:
# prevent BS4 error with huge SAMI files with unclosed tags
index = data.lower().find("</head>")
self.styles = self._css_parse(
BeautifulSoup(data[:index], "lxml").find('style').get_text())
except AttributeError:
self.styles = {}
# fix erroneous italics tags
data = data.replace('<i/>', '<i>')
# fix awkward tags found in some SAMIs
data = data.replace(';>', '>')
try:
HTMLParser.feed(self, data)
except HTMLParseError as e:
raise CaptionReadSyntaxError(e)
# close any tags that remain in the queue
while self.queue != deque([]):
closing_tag = self.queue.pop()
self.sami += "</%s>" % closing_tag
return self.sami, self.styles, self.langs
# parse the SAMI's stylesheet
def _css_parse(self, css):
"""
Parse styling via cssutils modules
:rtype: dict
"""
sheet = parseString(css)
style_sheet = {}
for rule in sheet:
new_style = {}
selector = rule.selectorText.lower()
if selector[0] in ['#', '.']:
selector = selector[1:]
# keep any style attributes that are needed
for prop in rule.style:
if prop.name == 'color':
cv = cssutils_css.ColorValue(prop.value)
# Code for RGB to hex conversion comes from
# http://bit.ly/1kwfBnQ
new_style['color'] = "#%02x%02x%02x" % (
cv.red, cv.green, cv.blue)
else:
new_style[prop.name] = prop.value
if new_style:
style_sheet[selector] = new_style
return style_sheet
def _find_lang(self, attrs):
for attr, value in attrs:
# if lang is an attribute of the tag
if attr.lower() == 'lang':
return value[:2]
# if the P tag has a class, try and find the language
if attr.lower() == 'class':
try:
return self.styles[value.lower()]['lang']
except KeyError:
pass
return None

View File

@@ -0,0 +1,696 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
3 types of SCC captions:
Roll-Up
Paint-On
Pop-On
Commands:
94ae - [ENM] - Erase Non-displayed(buffer) Memory
942c - [EDM] - Erase Displayed Memory
9420 - [RCL] - Resume Caption Loading
9429 - [RDC] - Resume Direct Captioning
9425, 9426, 94a7 - [RU2], [RU3], [RU4] (roll up captions 2,3 or 4 rows)
- these commands set the number of expected lines
94ad - (in CEA-608-E: 142d) - [CR] carriage return.
- This actually rolls the captions up as many rows as specified by
[RU2], [RU3], or [RU4]
80 - no-op char. Doesn't do anything, but must be used with other
characters, to make a 2 byte word
97a1, 97a2, 9723 - [TO] move 1, 2 or 3 columns - Tab Over command
- this moves the positioning 1, 2, or 3 columns to the right
- Nothing regarding this is implemented.
942f - [EOC] - display the buffer on the screen - End Of Caption
... - [PAC] - Preamble address code (can set positioning and style)
- All the PACs are specified by the first and second byte combined
from pycaption.scc.constants.PAC_BYTES_TO_POSITIONING_MAP
9429 - [RDC] - Resume Direct Captioning
94a4 - (in CEA-608-E: 1424) - [DER] Delete to End of Row
Pop-On:
The commands should usually appear in this order. This is not strict,
though, and the commands don't necessarily have to be on the same row.
1. 94ae [ENM] (erase non displayed memory)
2. 9420 [RCL] (resume caption loading => this command here means we're using Pop-On captions)
2.1? [ENM] - if step 1 was skipped?
3. [PAC] Positioning/ styling command (can position on columns divisible by 4)
This control code is called a Preamble Address Code [PAC].
4. If positioning needs to be on columns not divisible by 4, use a [TO] command
5. text
6. 942c [EDM] - optionally, erase the currently displayed caption
7. 942f [EOC] display the caption
Roll-Up:
1. [RU2], [RU3] or [RU4] - sets Roll-Up style and depth
- these set the Roll-Up style: (characteristic command)
2. [CR] to roll the display up 1 row
3. [PAC] - sets the indent of the base row
Paint-On:
1. [RDC] - sets the Paint-On style (characteristic command)
2. [PAC]
3. text
4. [PAC]
5. text or [DER]
There are some rules regarding the parity of the commands.
This resource:
http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML
specifies that there are interpreters which only work if the commands have an
odd parity. This however is not consistent, and we might not handle these
cases well. Odd parity of a command means that converting the word into
binary results in an odd number of '1's. The PAC commands obey this rule,
but some commands do not; some of those are found in the COMMANDS
dictionary. This is legacy logic that I didn't know how to handle, and
just carried over when implementing positioning.
"""
import re
import math
import textwrap
from copy import deepcopy
import six
from pycaption.base import (
BaseReader, BaseWriter, CaptionSet, CaptionNode,
)
from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError
from .constants import (
HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS,
MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
)
from .specialized_collections import (
TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
InstructionNodeCreator)
from .state_machines import DefaultProvidingPositionTracker
class NodeCreatorFactory(object):
"""Will return instances of the given node_creator.
This is used as a means of creating new InstructionNodeCreator instances,
because these need to share state that outlives any single instance, but
storing the information at the class level is not good either, because
this information must be erased after the reader's .read() operation
completes.
"""
def __init__(self, position_tracker,
node_creator=InstructionNodeCreator):
self.position_tracker = position_tracker
self.node_creator = node_creator
def new_creator(self):
"""Returns a new instance of self.node_creator, initialized with
the same position_tracker
"""
return self.node_creator(position_tracker=self.position_tracker)
def from_list(self, roll_rows):
"""Wraps the node_creator's method with the same name
:param roll_rows: list of node_creator instances
:return: a node_creator instance
"""
return self.node_creator.from_list(
roll_rows,
position_tracker=self.position_tracker
)
def get_corrected_end_time(caption):
"""If the last caption was never explicitly ended, set its end time to
start + 4 seconds
:param Caption caption: the last caption
:rtype: int
"""
if caption.end:
return caption.end
return caption.start + 4 * 1000 * 1000
class SCCReader(BaseReader):
"""Converts a given unicode string to a CaptionSet.
This can then be used for converting into any of the other supported formats
"""
def __init__(self, *args, **kw):
self.caption_stash = CaptionCreator()
self.time_translator = _SccTimeTranslator()
self.node_creator_factory = NodeCreatorFactory(
DefaultProvidingPositionTracker()
)
self.last_command = ''
self.buffer_dict = NotifyingDict()
self.buffer_dict['pop'] = self.node_creator_factory.new_creator()
self.buffer_dict['paint'] = self.node_creator_factory.new_creator()
self.buffer_dict['roll'] = self.node_creator_factory.new_creator()
# Call this method when the active key changes
self.buffer_dict.add_change_observer(self._flush_implicit_buffers)
self.buffer_dict.set_active('pop')
self.roll_rows = []
self.roll_rows_expected = 0
self.simulate_roll_up = False
self.time = 0
def detect(self, content):
"""Checks whether the given content is a proper SCC file
:type content: unicode
:rtype: bool
"""
lines = content.splitlines()
if lines[0] == HEADER:
return True
else:
return False
def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
"""Converts the unicode string into a CaptionSet
:type content: six.text_type
:param content: The SCC content to be converted to a CaptionSet
:type lang: six.text_type
:param lang: The language of the caption
:type simulate_roll_up: bool
:param simulate_roll_up: If True, when converting to other formats,
the resulting captions will contain all the rows that were visible
on the screen when the captions were rolling up.
:type offset: int
:param offset:
:rtype: CaptionSet
"""
if type(content) != six.text_type:
raise InvalidInputError('The content is not a unicode string.')
self.simulate_roll_up = simulate_roll_up
self.time_translator.offset = offset * 1000000
# split lines
lines = content.splitlines()
# loop through each line except the first
for line in lines[1:]:
self._translate_line(line)
self._flush_implicit_buffers()
captions = CaptionSet({lang: self.caption_stash.get_all()})
# check captions for incorrect lengths
for cap in captions.get_captions(lang):
# if a caption has an end time and the difference between start and
# end is less than .05s, raise an error (this is likely caused by a
# standalone EOC marker in the SCC file)
if 0 < cap.end - cap.start < 50000:
raise ValueError('unsupported length found in SCC input file: ' + str(cap))
if captions.is_empty():
raise CaptionReadNoCaptions("empty caption file")
else:
last_caption = captions.get_captions(lang)[-1]
last_caption.end = get_corrected_end_time(last_caption)
return captions
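# A short call sketch (values are illustrative): SCCReader().read(scc_text,
# lang='en-US', simulate_roll_up=True, offset=2) keeps the whole visible
# roll-up window in each caption and shifts every timestamp back by
# 2 seconds (the offset is multiplied by 1,000,000 above, i.e. it is given
# in seconds while the internal timings are in microseconds).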
def _fix_last_timing(self, timing):
"""HACK HACK: Certain Paint-On captions don't specify the 942f [EOC]
(End Of Caption) command on the same line.
If this is a 942f line, also simulate a 942c (Erase Displayed Memory)
to properly set the timing on the last caption.
This method needs some serious attention, because it proves the timing
calculation is not done well for Pop-On captions
"""
# Calculate the end time from the current line
time_translator = _SccTimeTranslator()
time_translator.start_at(timing)
time_translator.offset = self.time_translator.offset
# But use the current time translator for the start time
self.caption_stash.create_and_store(
self.buffer, self.time_translator.get_time())
self.caption_stash.correct_last_timing(time_translator.get_time())
self.buffer = self.node_creator_factory.new_creator()
def _flush_implicit_buffers(self, old_key=None, *args):
"""Convert to Captions those buffers whose behavior is implicit.
The Pop-On buffer is explicit: new captions are created from it
with the 'End Of Caption' [EOC] command, '942f'.
The other 2 buffers, Roll-Up and Paint-On, we treat as "more" implicit,
meaning that they can be displayed by a command on the next row.
If they're on the last row however, or if the caption type is changing,
we make sure to convert the buffers to text, so we don't lose any info.
"""
if old_key == 'pop':
return
elif old_key is None or old_key == 'roll':
if not self.buffer.is_empty():
self._roll_up()
elif old_key is None or old_key == 'paint':
# xxx - perhaps the self.buffer property is sufficient
if not self.buffer_dict['paint'].is_empty():
self.caption_stash.create_and_store(
self.buffer_dict['paint'], self.time)
def _translate_line(self, line):
# ignore blank lines
if line.strip() == '':
return
# split line in timestamp and words
r = re.compile(r"([0-9:;]*)([\s\t]*)((.)*)")
parts = r.findall(line.lower())
# XXX!!!!!! THESE 2 LINES ARE A HACK
if parts[0][2].strip() == '942f':
self._fix_last_timing(timing=parts[0][0])
self.time_translator.start_at(parts[0][0])
# loop through each word
for word in parts[0][2].split(' '):
# ignore empty results
if word.strip() != '':
self._translate_word(word)
def _translate_word(self, word):
# count frames for timing
self.time_translator.increment_frames()
# first check if word is a command
# TODO - check that all the positioning commands are here, or use
# some other strategy to determine if the word is a command.
if word in COMMANDS or _is_pac_command(word):
self._translate_command(word)
# second, check if word is a special character
elif word in SPECIAL_CHARS:
self._translate_special_char(word)
elif word in EXTENDED_CHARS:
self._translate_extended_char(word)
# third, try to convert word into 2 characters
else:
self._translate_characters(word)
def _handle_double_command(self, word):
# ensure we don't accidentally use the same command twice
if word == self.last_command:
self.last_command = ''
return True
else:
self.last_command = word
return False
def _translate_special_char(self, word):
# XXX - this looks highly buggy. Why should special chars be ignored
# when printed 2 times one after another?
if self._handle_double_command(word):
return
self.buffer.add_chars(SPECIAL_CHARS[word])
def _translate_extended_char(self, word):
# XXX - this looks highly buggy. Why would a special char be ignored
# if it's printed 2 times one after another?
if self._handle_double_command(word):
return
# add to buffer
self.buffer.add_chars(EXTENDED_CHARS[word])
def _translate_command(self, word):
if self._handle_double_command(word):
return
# if command is pop_up
if word == '9420':
self.buffer_dict.set_active('pop')
# command is paint_on [Resume Direct Captioning]
elif word == '9429':
self.buffer_dict.set_active('paint')
self.roll_rows_expected = 1
if not self.buffer.is_empty():
self.caption_stash.create_and_store(
self.buffer, self.time
)
self.buffer = self.node_creator_factory.new_creator()
self.time = self.time_translator.get_time()
# if command is roll_up 2, 3 or 4 rows
elif word in ('9425', '9426', '94a7'):
self.buffer_dict.set_active('roll')
# count how many lines are expected
if word == '9425':
self.roll_rows_expected = 2
elif word == '9426':
self.roll_rows_expected = 3
elif word == '94a7':
self.roll_rows_expected = 4
# if content is in the queue, turn it into a caption
if not self.buffer.is_empty():
self.caption_stash.create_and_store(
self.buffer, self.time)
self.buffer = self.node_creator_factory.new_creator()
# set rows to empty, configure start time for caption
self.roll_rows = []
self.time = self.time_translator.get_time()
# clear pop_on buffer
elif word == '94ae':
self.buffer = self.node_creator_factory.new_creator()
# display pop_on buffer [End Of Caption]
elif word == '942f':
self.time = self.time_translator.get_time()
self.caption_stash.create_and_store(self.buffer, self.time)
self.buffer = self.node_creator_factory.new_creator()
# roll up captions [Carriage Return]
elif word == '94ad':
# display roll-up buffer
if not self.buffer.is_empty():
self._roll_up()
# clear screen
elif word == '942c':
self.roll_rows = []
# XXX - The 942c command has nothing to do with paint-ons
# This however is legacy code, and will break lots of tests if
# the proper buffer (self.buffer) is used.
# Most likely using `self.buffer` instead of the paint buffer
# is the right thing to do, but this needs some further attention.
if not self.buffer_dict['paint'].is_empty():
self.caption_stash.create_and_store(
self.buffer_dict['paint'], self.time)
self.buffer = self.node_creator_factory.new_creator()
# attempt to add proper end time to last caption(s)
self.caption_stash.correct_last_timing(
self.time_translator.get_time())
# if command not one of the aforementioned, add to buffer
else:
self.buffer.interpret_command(word)
def _translate_characters(self, word):
# split word into the 2 bytes
byte1 = word[:2]
byte2 = word[2:]
# check to see if the bytes are recognized characters
if byte1 not in CHARACTERS or byte2 not in CHARACTERS:
return
self.buffer.add_chars(CHARACTERS[byte1], CHARACTERS[byte2])
@property
def buffer(self):
"""Returns the currently active buffer
"""
return self.buffer_dict.get_active()
@buffer.setter
def buffer(self, value):
"""Sets a new value to the active key
:param value: any object
"""
try:
key = self.buffer_dict.active_key
self.buffer_dict[key] = value
except TypeError:
pass
def _roll_up(self):
# We expect the active buffer to be the roll buffer
if self.simulate_roll_up:
if self.roll_rows_expected > 1:
if len(self.roll_rows) >= self.roll_rows_expected:
self.roll_rows.pop(0)
self.roll_rows.append(self.buffer)
self.buffer = self.node_creator_factory.from_list(
self.roll_rows)
# convert buffer and empty
self.caption_stash.create_and_store(self.buffer, self.time)
self.buffer = self.node_creator_factory.new_creator()
# configure time
self.time = self.time_translator.get_time()
# try to insert the proper ending time for the previous caption
self.caption_stash.correct_last_timing(self.time, force=True)
class SCCWriter(BaseWriter):
def __init__(self, *args, **kw):
super(SCCWriter, self).__init__(*args, **kw)
def write(self, caption_set):
output = HEADER + '\n\n'
if caption_set.is_empty():
return output
caption_set = deepcopy(caption_set)
# Only support one language.
lang = list(caption_set.get_languages())[0]
captions = caption_set.get_captions(lang)
# PASS 1: compute codes for each caption
codes = [(self._text_to_code(caption), caption.start, caption.end)
for caption in captions]
# PASS 2:
# Advance start times so as to have time to write to the pop-on
# buffer; possibly remove the previous clear-screen command
for index, (code, start, end) in enumerate(codes):
code_words = len(code) / 5 + 8
code_time_microseconds = code_words * MICROSECONDS_PER_CODEWORD
code_start = start - code_time_microseconds
if index == 0:
continue
previous_code, previous_start, previous_end = codes[index-1]
if previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start:
codes[index-1] = (previous_code, previous_start, None)
codes[index] = (code, code_start, end)
# PASS 3:
# Write captions.
for (code, start, end) in codes:
output += ('%s\t' % self._format_timestamp(start))
output += '94ae 94ae 9420 9420 '
output += code
output += '942c 942c 942f 942f\n\n'
if end is not None:
output += '%s\t942c 942c\n\n' % self._format_timestamp(end)
return output
# Wrap lines at 32 chars
@staticmethod
def _layout_line(caption):
def caption_node_to_text(caption_node):
if caption_node.type_ == CaptionNode.TEXT:
return six.text_type(caption_node.content)
elif caption_node.type_ == CaptionNode.BREAK:
return '\n'
caption_text = ''.join(
[caption_node_to_text(node) for node in caption.nodes])
inner_lines = caption_text.split('\n')
inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines]
return '\n'.join(inner_lines_laid_out)
@staticmethod
def _maybe_align(code):
# Finish a half-word with a no-op so we can move to a full word
if len(code) % 5 == 2:
code += '80 '
return code
@staticmethod
def _maybe_space(code):
if len(code) % 5 == 4:
code += ' '
return code
def _print_character(self, code, char):
try:
char_code = CHARACTER_TO_CODE[char]
except KeyError:
try:
char_code = SPECIAL_OR_EXTENDED_CHAR_TO_CODE[char]
except KeyError:
char_code = '91b6' # Use £ as "unknown character" symbol
if len(char_code) == 2:
return code + char_code
elif len(char_code) == 4:
return self._maybe_align(code) + char_code
else:
# This should not happen!
return code
def _text_to_code(self, s):
code = ''
lines = self._layout_line(s).split('\n')
for row, line in enumerate(lines):
row += 16 - len(lines)
# Move cursor to column 0 of the destination row
for _ in range(2):
code += ('%s%s ' % (PAC_HIGH_BYTE_BY_ROW[row],
PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]))
# Print the line using the SCC encoding
for char in line:
code = self._print_character(code, char)
code = self._maybe_space(code)
code = self._maybe_align(code)
return code
@staticmethod
def _format_timestamp(microseconds):
seconds_float = microseconds / 1000.0 / 1000.0
# Convert to non-drop-frame timecode
seconds_float *= 1000.0 / 1001.0
hours = math.floor(seconds_float / 3600)
seconds_float -= hours * 3600
minutes = math.floor(seconds_float / 60)
seconds_float -= minutes * 60
seconds = math.floor(seconds_float)
seconds_float -= seconds
frames = math.floor(seconds_float * 30)
return '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frames)
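# Worked example for _format_timestamp (a sanity check, not exhaustive):
# 10,000,000 microseconds -> 10.0 s of wall clock -> 10.0 * 1000/1001 ~= 9.99 s
# of non-drop-frame timecode -> '00:00:09:29' (the remaining 0.99 s gives
# floor(0.99 * 30) = 29 frames).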
class _SccTimeTranslator(object):
"""Converts SCC time to microseconds, keeping track of frames passed
"""
def __init__(self):
self._time = '00:00:00;00'
# microseconds. The offset from which we begin the time calculation
self.offset = 0
self._frames = 0
def get_time(self):
"""Returns the time, in microseconds. Takes into account the number of
frames passed, and the offset
:rtype: int
"""
return self._translate_time(
self._time[:-2] + six.text_type(int(self._time[-2:]) + self._frames),
self.offset
)
@staticmethod
def _translate_time(stamp, offset):
"""
:param stamp:
:type offset: int
:param offset: Subtract this many microseconds from the calculated time
Helpful for when the captions are off by some time interval.
:rtype: int
"""
if ';' in stamp:
# Drop-frame timebase runs at the same rate as wall clock
seconds_per_timestamp_second = 1.0
else:
# Non-drop-frame timebase runs "slow"
# 1 second of timecode is longer than an actual second (1.001s)
seconds_per_timestamp_second = 1001.0 / 1000.0
time_split = stamp.replace(';', ':').split(':')
timestamp_seconds = (int(time_split[0]) * 3600 +
int(time_split[1]) * 60 +
int(time_split[2]) +
int(time_split[3]) / 30.0)
seconds = timestamp_seconds * seconds_per_timestamp_second
microseconds = seconds * 1000 * 1000 - offset
if microseconds < 0:
microseconds = 0
return microseconds
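# Rough examples for _translate_time (illustrative): the drop-frame stamp
# '00:01:00;02' yields 60 + 2/30 ~= 60.067 s ~= 60,066,667 microseconds,
# while the non-drop-frame stamp '00:01:00:02' is additionally scaled by
# 1001/1000, giving roughly 60,126,733 microseconds (before the offset is
# subtracted).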
def start_at(self, timespec):
"""Reset the counter to the given time
:type timespec: unicode
"""
self._time = timespec
self._frames = 0
def increment_frames(self):
"""After a command was processed, we'd increment the number of frames
"""
self._frames += 1
def _is_pac_command(word):
"""Checks whether the given word is a Preamble Address Code [PAC] command
:type word: unicode
:param word: 4 letter unicode command
:rtype: bool
"""
if not word or len(word) != 4:
return False
byte1, byte2 = word[:2], word[2:]
try:
PAC_BYTES_TO_POSITIONING_MAP[byte1][byte2]
except KeyError:
return False
else:
return True
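# Quick illustration (derived from PAC_BYTES_TO_POSITIONING_MAP):
# _is_pac_command('9470') is True, since '94'/'70' maps to row 15, column 0,
# whereas _is_pac_command('9420') is False -- 9420 is the RCL control command
# and has no entry in the positioning map.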

View File

@@ -0,0 +1,984 @@
# -*- coding: utf-8 -*-
from itertools import product
from future.utils import viewitems
COMMANDS = {
'9420': '',
'9429': '',
'9425': '',
'9426': '',
'94a7': '',
'942a': '',
'94ab': '',
'942c': '',
'94ae': '',
'942f': '',
'9779': '<$>{break}<$>',
'9775': '<$>{break}<$>',
'9776': '<$>{break}<$>',
'9770': '<$>{break}<$>',
'9773': '<$>{break}<$>',
'10c8': '<$>{break}<$>',
'10c2': '<$>{break}<$>',
'166e': '<$>{break}<$>{italic}<$>',
'166d': '<$>{break}<$>',
'166b': '<$>{break}<$>',
'10c4': '<$>{break}<$>',
'9473': '<$>{break}<$>',
'977f': '<$>{break}<$>',
'977a': '<$>{break}<$>',
'1668': '<$>{break}<$>',
'1667': '<$>{break}<$>',
'1664': '<$>{break}<$>',
'1661': '<$>{break}<$>',
'10ce': '<$>{break}<$>{italic}<$>',
'94c8': '<$>{break}<$>',
'94c7': '<$>{break}<$>',
'94c4': '<$>{break}<$>',
'94c2': '<$>{break}<$>',
'94c1': '<$>{break}<$>',
'915e': '<$>{break}<$>',
'915d': '<$>{break}<$>',
'915b': '<$>{break}<$>',
'925d': '<$>{break}<$>',
'925e': '<$>{break}<$>',
'925b': '<$>{break}<$>',
'97e6': '<$>{break}<$>',
'97e5': '<$>{break}<$>',
'97e3': '<$>{break}<$>',
'97e0': '<$>{break}<$>',
'97e9': '<$>{break}<$>',
'9154': '<$>{break}<$>',
'9157': '<$>{break}<$>',
'9151': '<$>{break}<$>',
'9258': '<$>{break}<$>',
'9152': '<$>{break}<$>',
'9257': '<$>{break}<$>',
'9254': '<$>{break}<$>',
'9252': '<$>{break}<$>',
'9158': '<$>{break}<$>',
'9251': '<$>{break}<$>',
'94cd': '<$>{break}<$>',
'94ce': '<$>{break}<$>{italic}<$>',
'94cb': '<$>{break}<$>',
'97ef': '<$>{break}<$>{italic}<$>',
'1373': '<$>{break}<$>',
'97ec': '<$>{break}<$>',
'97ea': '<$>{break}<$>',
'15c7': '<$>{break}<$>',
'974f': '<$>{break}<$>{italic}<$>',
'10c1': '<$>{break}<$>',
'974a': '<$>{break}<$>',
'974c': '<$>{break}<$>',
'10c7': '<$>{break}<$>',
'976d': '<$>{break}<$>',
'15d6': '<$>{break}<$>',
'15d5': '<$>{break}<$>',
'15d3': '<$>{break}<$>',
'15d0': '<$>{break}<$>',
'15d9': '<$>{break}<$>',
'9745': '<$>{break}<$>',
'9746': '<$>{break}<$>',
'9740': '<$>{break}<$>',
'9743': '<$>{break}<$>',
'9749': '<$>{break}<$>',
'15df': '<$>{break}<$>',
'15dc': '<$>{break}<$>',
'15da': '<$>{break}<$>',
'15f8': '<$>{break}<$>',
'94fe': '<$>{break}<$>',
'94fd': '<$>{break}<$>',
'94fc': '<$>{break}<$>',
'94fb': '<$>{break}<$>',
'944f': '<$>{break}<$>{italic}<$>',
'944c': '<$>{break}<$>',
'944a': '<$>{break}<$>',
'92fc': '<$>{break}<$>',
'1051': '<$>{break}<$>',
'1052': '<$>{break}<$>',
'1054': '<$>{break}<$>',
'92fe': '<$>{break}<$>',
'92fd': '<$>{break}<$>',
'1058': '<$>{break}<$>',
'157a': '<$>{break}<$>',
'157f': '<$>{break}<$>',
'9279': '<$>{break}<$>',
'94f4': '<$>{break}<$>',
'94f7': '<$>{break}<$>',
'94f1': '<$>{break}<$>',
'9449': '<$>{break}<$>',
'92fb': '<$>{break}<$>',
'9446': '<$>{break}<$>',
'9445': '<$>{break}<$>',
'9443': '<$>{break}<$>',
'94f8': '<$>{break}<$>',
'9440': '<$>{break}<$>',
'1057': '<$>{break}<$>',
'9245': '<$>{break}<$>',
'92f2': '<$>{break}<$>',
'1579': '<$>{break}<$>',
'92f7': '<$>{break}<$>',
'105e': '<$>{break}<$>',
'92f4': '<$>{break}<$>',
'1573': '<$>{break}<$>',
'1570': '<$>{break}<$>',
'1576': '<$>{break}<$>',
'1575': '<$>{break}<$>',
'16c1': '<$>{break}<$>',
'16c2': '<$>{break}<$>',
'9168': '<$>{break}<$>',
'16c7': '<$>{break}<$>',
'9164': '<$>{break}<$>',
'9167': '<$>{break}<$>',
'9161': '<$>{break}<$>',
'9162': '<$>{break}<$>',
'947f': '<$>{break}<$>',
'91c2': '<$>{break}<$>',
'91c1': '<$>{break}<$>',
'91c7': '<$>{break}<$>',
'91c4': '<$>{break}<$>',
'13e3': '<$>{break}<$>',
'91c8': '<$>{break}<$>',
'91d0': '<$>{break}<$>',
'13e5': '<$>{break}<$>',
'13c8': '<$>{break}<$>',
'16cb': '<$>{break}<$>',
'16cd': '<$>{break}<$>',
'16ce': '<$>{break}<$>{italic}<$>',
'916d': '<$>{break}<$>',
'916e': '<$>{break}<$>{italic}<$>',
'916b': '<$>{break}<$>',
'91d5': '<$>{break}<$>',
'137a': '<$>{break}<$>',
'91cb': '<$>{break}<$>',
'91ce': '<$>{break}<$>{italic}<$>',
'91cd': '<$>{break}<$>',
'13ec': '<$>{break}<$>',
'13c1': '<$>{break}<$>',
'13ea': '<$>{break}<$>',
'13ef': '<$>{break}<$>{italic}<$>',
'94f2': '<$>{break}<$>',
'97fb': '<$>{break}<$>',
'97fc': '<$>{break}<$>',
'1658': '<$>{break}<$>',
'97fd': '<$>{break}<$>',
'97fe': '<$>{break}<$>',
'1652': '<$>{break}<$>',
'1651': '<$>{break}<$>',
'1657': '<$>{break}<$>',
'1654': '<$>{break}<$>',
'10cb': '<$>{break}<$>',
'97f2': '<$>{break}<$>',
'97f1': '<$>{break}<$>',
'97f7': '<$>{break}<$>',
'97f4': '<$>{break}<$>',
'165b': '<$>{break}<$>',
'97f8': '<$>{break}<$>',
'165d': '<$>{break}<$>',
'165e': '<$>{break}<$>',
'15cd': '<$>{break}<$>',
'10cd': '<$>{break}<$>',
'9767': '<$>{break}<$>',
'9249': '<$>{break}<$>',
'1349': '<$>{break}<$>',
'91d9': '<$>{break}<$>',
'1340': '<$>{break}<$>',
'91d3': '<$>{break}<$>',
'9243': '<$>{break}<$>',
'1343': '<$>{break}<$>',
'91d6': '<$>{break}<$>',
'1345': '<$>{break}<$>',
'1346': '<$>{break}<$>',
'9246': '<$>{break}<$>',
'94e9': '<$>{break}<$>',
'94e5': '<$>{break}<$>',
'94e6': '<$>{break}<$>',
'94e0': '<$>{break}<$>',
'94e3': '<$>{break}<$>',
'15ea': '<$>{break}<$>',
'15ec': '<$>{break}<$>',
'15ef': '<$>{break}<$>{italic}<$>',
'16fe': '<$>{break}<$>',
'16fd': '<$>{break}<$>',
'16fc': '<$>{break}<$>',
'16fb': '<$>{break}<$>',
'1367': '<$>{break}<$>',
'94ef': '<$>{break}<$>{italic}<$>',
'94ea': '<$>{break}<$>',
'94ec': '<$>{break}<$>',
'924a': '<$>{break}<$>',
'91dc': '<$>{break}<$>',
'924c': '<$>{break}<$>',
'91da': '<$>{break}<$>',
'91df': '<$>{break}<$>',
'134f': '<$>{break}<$>{italic}<$>',
'924f': '<$>{break}<$>{italic}<$>',
'16f8': '<$>{break}<$>',
'16f7': '<$>{break}<$>',
'16f4': '<$>{break}<$>',
'16f2': '<$>{break}<$>',
'16f1': '<$>{break}<$>',
'15e0': '<$>{break}<$>',
'15e3': '<$>{break}<$>',
'15e5': '<$>{break}<$>',
'15e6': '<$>{break}<$>',
'15e9': '<$>{break}<$>',
'9757': '<$>{break}<$>',
'9754': '<$>{break}<$>',
'9752': '<$>{break}<$>',
'9751': '<$>{break}<$>',
'9758': '<$>{break}<$>',
'92f1': '<$>{break}<$>',
'104c': '<$>{break}<$>',
'104a': '<$>{break}<$>',
'104f': '<$>{break}<$>{italic}<$>',
'105d': '<$>{break}<$>',
'92f8': '<$>{break}<$>',
'975e': '<$>{break}<$>',
'975d': '<$>{break}<$>',
'975b': '<$>{break}<$>',
'1043': '<$>{break}<$>',
'1040': '<$>{break}<$>',
'1046': '<$>{break}<$>',
'1045': '<$>{break}<$>',
'1049': '<$>{break}<$>',
'9479': '<$>{break}<$>',
'917f': '<$>{break}<$>',
'9470': '<$>{break}<$>',
'9476': '<$>{break}<$>',
'917a': '<$>{break}<$>',
'9475': '<$>{break}<$>',
'927a': '<$>{break}<$>',
'927f': '<$>{break}<$>',
'134a': '<$>{break}<$>',
'15fb': '<$>{break}<$>',
'15fc': '<$>{break}<$>',
'15fd': '<$>{break}<$>',
'15fe': '<$>{break}<$>',
'1546': '<$>{break}<$>',
'1545': '<$>{break}<$>',
'1543': '<$>{break}<$>',
'1540': '<$>{break}<$>',
'1549': '<$>{break}<$>',
'13fd': '<$>{break}<$>',
'13fe': '<$>{break}<$>',
'13fb': '<$>{break}<$>',
'13fc': '<$>{break}<$>',
'92e9': '<$>{break}<$>',
'92e6': '<$>{break}<$>',
'9458': '<$>{break}<$>',
'92e5': '<$>{break}<$>',
'92e3': '<$>{break}<$>',
'92e0': '<$>{break}<$>',
'9270': '<$>{break}<$>',
'9273': '<$>{break}<$>',
'9275': '<$>{break}<$>',
'9276': '<$>{break}<$>',
'15f1': '<$>{break}<$>',
'15f2': '<$>{break}<$>',
'15f4': '<$>{break}<$>',
'15f7': '<$>{break}<$>',
'9179': '<$>{break}<$>',
'9176': '<$>{break}<$>',
'9175': '<$>{break}<$>',
'947a': '<$>{break}<$>',
'9173': '<$>{break}<$>',
'9170': '<$>{break}<$>',
'13f7': '<$>{break}<$>',
'13f4': '<$>{break}<$>',
'13f2': '<$>{break}<$>',
'13f1': '<$>{break}<$>',
'92ef': '<$>{break}<$>{italic}<$>',
'92ec': '<$>{break}<$>',
'13f8': '<$>{break}<$>',
'92ea': '<$>{break}<$>',
'154f': '<$>{break}<$>{italic}<$>',
'154c': '<$>{break}<$>',
'154a': '<$>{break}<$>',
'16c4': '<$>{break}<$>',
'16c8': '<$>{break}<$>',
'97c8': '<$>{break}<$>',
'164f': '<$>{break}<$>{italic}<$>',
'164a': '<$>{break}<$>',
'164c': '<$>{break}<$>',
'1645': '<$>{break}<$>',
'1646': '<$>{break}<$>',
'1640': '<$>{break}<$>',
'1643': '<$>{break}<$>',
'1649': '<$>{break}<$>',
'94df': '<$>{break}<$>',
'94dc': '<$>{break}<$>',
'94da': '<$>{break}<$>',
'135b': '<$>{break}<$>',
'135e': '<$>{break}<$>',
'135d': '<$>{break}<$>',
'1370': '<$>{break}<$>',
'9240': '<$>{break}<$>',
'13e9': '<$>{break}<$>',
'1375': '<$>{break}<$>',
'1679': '<$>{break}<$>',
'1358': '<$>{break}<$>',
'1352': '<$>{break}<$>',
'1351': '<$>{break}<$>',
'1376': '<$>{break}<$>',
'1357': '<$>{break}<$>',
'1354': '<$>{break}<$>',
'1379': '<$>{break}<$>',
'94d9': '<$>{break}<$>',
'94d6': '<$>{break}<$>',
'94d5': '<$>{break}<$>',
'15462': '<$>{break}<$>',
'94d3': '<$>{break}<$>',
'94d0': '<$>{break}<$>',
'13e0': '<$>{break}<$>',
'13e6': '<$>{break}<$>',
'976b': '<$>{break}<$>',
'15c4': '<$>{break}<$>',
'15c2': '<$>{break}<$>',
'15c1': '<$>{break}<$>',
'976e': '<$>{break}<$>{italic}<$>',
'134c': '<$>{break}<$>',
'15c8': '<$>{break}<$>',
'92c8': '<$>{break}<$>',
'16e9': '<$>{break}<$>',
'16e3': '<$>{break}<$>',
'16e0': '<$>{break}<$>',
'16e6': '<$>{break}<$>',
'16e5': '<$>{break}<$>',
'91e5': '<$>{break}<$>',
'91e6': '<$>{break}<$>',
'91e0': '<$>{break}<$>',
'91e3': '<$>{break}<$>',
'13c4': '<$>{break}<$>',
'13c7': '<$>{break}<$>',
'91e9': '<$>{break}<$>',
'13c2': '<$>{break}<$>',
'9762': '<$>{break}<$>',
'15ce': '<$>{break}<$>{italic}<$>',
'9761': '<$>{break}<$>',
'15cb': '<$>{break}<$>',
'9764': '<$>{break}<$>',
'9768': '<$>{break}<$>',
'91ef': '<$>{break}<$>{italic}<$>',
'91ea': '<$>{break}<$>',
'91ec': '<$>{break}<$>',
'13ce': '<$>{break}<$>{italic}<$>',
'13cd': '<$>{break}<$>',
'97da': '<$>{break}<$>',
'13cb': '<$>{break}<$>',
'13462': '<$>{break}<$>',
'16ec': '<$>{break}<$>',
'16ea': '<$>{break}<$>',
'16ef': '<$>{break}<$>{italic}<$>',
'97c1': '<$>{break}<$>',
'97c2': '<$>{break}<$>',
'97c4': '<$>{break}<$>',
'97c7': '<$>{break}<$>',
'92cd': '<$>{break}<$>',
'92ce': '<$>{break}<$>{italic}<$>',
'92cb': '<$>{break}<$>',
'92da': '<$>{break}<$>',
'92dc': '<$>{break}<$>',
'92df': '<$>{break}<$>',
'97df': '<$>{break}<$>',
'155b': '<$>{break}<$>',
'155e': '<$>{break}<$>',
'155d': '<$>{break}<$>',
'97dc': '<$>{break}<$>',
'1675': '<$>{break}<$>',
'1676': '<$>{break}<$>',
'1670': '<$>{break}<$>',
'1673': '<$>{break}<$>',
'16462': '<$>{break}<$>',
'97cb': '<$>{break}<$>',
'97ce': '<$>{break}<$>{italic}<$>',
'97cd': '<$>{break}<$>',
'92c4': '<$>{break}<$>',
'92c7': '<$>{break}<$>',
'92c1': '<$>{break}<$>',
'92c2': '<$>{break}<$>',
'1551': '<$>{break}<$>',
'97d5': '<$>{break}<$>',
'97d6': '<$>{break}<$>',
'1552': '<$>{break}<$>',
'97d0': '<$>{break}<$>',
'1554': '<$>{break}<$>',
'1557': '<$>{break}<$>',
'97d3': '<$>{break}<$>',
'1558': '<$>{break}<$>',
'167f': '<$>{break}<$>',
'137f': '<$>{break}<$>',
'167a': '<$>{break}<$>',
'92d9': '<$>{break}<$>',
'92d0': '<$>{break}<$>',
'92d3': '<$>{break}<$>',
'92d5': '<$>{break}<$>',
'92d6': '<$>{break}<$>',
'10dc': '<$>{break}<$>',
'9262': '<$>{break}<$>',
'9261': '<$>{break}<$>',
'91f8': '<$>{break}<$>',
'10df': '<$>{break}<$>',
'9264': '<$>{break}<$>',
'91f4': '<$>{break}<$>',
'91f7': '<$>{break}<$>',
'91f1': '<$>{break}<$>',
'91f2': '<$>{break}<$>',
'97d9': '<$>{break}<$>',
'9149': '<$>{break}<$>',
'9143': '<$>{break}<$>',
'9140': '<$>{break}<$>',
'9146': '<$>{break}<$>',
'9145': '<$>{break}<$>',
'9464': '<$>{break}<$>',
'9467': '<$>{break}<$>',
'9461': '<$>{break}<$>',
'9462': '<$>{break}<$>',
'9468': '<$>{break}<$>',
'914c': '<$>{break}<$>',
'914a': '<$>{break}<$>',
'914f': '<$>{break}<$>{italic}<$>',
'10d3': '<$>{break}<$>',
'926b': '<$>{break}<$>',
'10d0': '<$>{break}<$>',
'10d6': '<$>{break}<$>',
'926e': '<$>{break}<$>{italic}<$>',
'926d': '<$>{break}<$>',
'91fd': '<$>{break}<$>',
'91fe': '<$>{break}<$>',
'10d9': '<$>{break}<$>',
'91fb': '<$>{break}<$>',
'91fc': '<$>{break}<$>',
'946e': '<$>{break}<$>{italic}<$>',
'946d': '<$>{break}<$>',
'946b': '<$>{break}<$>',
'10da': '<$>{break}<$>',
'10d5': '<$>{break}<$>',
'9267': '<$>{break}<$>',
'9268': '<$>{break}<$>',
'16df': '<$>{break}<$>',
'16da': '<$>{break}<$>',
'16dc': '<$>{break}<$>',
'9454': '<$>{break}<$>',
'9457': '<$>{break}<$>',
'9451': '<$>{break}<$>',
'9452': '<$>{break}<$>',
'136d': '<$>{break}<$>',
'136e': '<$>{break}<$>{italic}<$>',
'136b': '<$>{break}<$>',
'13d9': '<$>{break}<$>',
'13da': '<$>{break}<$>',
'13dc': '<$>{break}<$>',
'13df': '<$>{break}<$>',
'1568': '<$>{break}<$>',
'1561': '<$>{break}<$>',
'1564': '<$>{break}<$>',
'1567': '<$>{break}<$>',
'16d5': '<$>{break}<$>',
'16d6': '<$>{break}<$>',
'16d0': '<$>{break}<$>',
'16d3': '<$>{break}<$>',
'945d': '<$>{break}<$>',
'945e': '<$>{break}<$>',
'16d9': '<$>{break}<$>',
'945b': '<$>{break}<$>',
'156b': '<$>{break}<$>',
'156d': '<$>{break}<$>',
'156e': '<$>{break}<$>{italic}<$>',
'105b': '<$>{break}<$>',
'1364': '<$>{break}<$>',
'1368': '<$>{break}<$>',
'1361': '<$>{break}<$>',
'13d0': '<$>{break}<$>',
'13d3': '<$>{break}<$>',
'13d5': '<$>{break}<$>',
'13d6': '<$>{break}<$>',
'97a1': '',
'97a2': '',
'9723': '',
'94a1': '',
'94a4': '',
'94ad': '',
'1020': '',
'10a1': '',
'10a2': '',
'1023': '',
'10a4': '',
'1025': '',
'1026': '',
'10a7': '',
'10a8': '',
'1029': '',
'102a': '',
'10ab': '',
'102c': '',
'10ad': '',
'10ae': '',
'102f': '',
'97ad': '',
'97a4': '',
'9725': '',
'9726': '',
'97a7': '',
'97a8': '',
'9729': '',
'972a': '',
'9120': '<$>{end-italic}<$>',
'91a1': '',
'91a2': '',
'9123': '',
'91a4': '',
'9125': '',
'9126': '',
'91a7': '',
'91a8': '',
'9129': '',
'912a': '',
'91ab': '',
'912c': '',
'91ad': '',
'97ae': '',
'972f': '',
'91ae': '<$>{italic}<$>',
'912f': '<$>{italic}<$>',
'94a8': '',
'9423': '',
'94a2': '',
}
CHARACTERS = {
'20': ' ',
'a1': '!',
'a2': '"',
'23': '#',
'a4': '$',
'25': '%',
'26': '&',
'a7': '\'',
'a8': '(',
'29': ')',
'2a': 'á',
'ab': '+',
'2c': ',',
'ad': '-',
'ae': '.',
'2f': '/',
'b0': '0',
'31': '1',
'32': '2',
'b3': '3',
'34': '4',
'b5': '5',
'b6': '6',
'37': '7',
'38': '8',
'b9': '9',
'ba': ':',
'3b': ';',
'bc': '<',
'3d': '=',
'3e': '>',
'bf': '?',
'40': '@',
'c1': 'A',
'c2': 'B',
'43': 'C',
'c4': 'D',
'45': 'E',
'46': 'F',
'c7': 'G',
'c8': 'H',
'49': 'I',
'4a': 'J',
'cb': 'K',
'4c': 'L',
'cd': 'M',
'ce': 'N',
'4f': 'O',
'd0': 'P',
'51': 'Q',
'52': 'R',
'd3': 'S',
'54': 'T',
'd5': 'U',
'd6': 'V',
'57': 'W',
'58': 'X',
'd9': 'Y',
'da': 'Z',
'5b': '[',
'dc': 'é',
'5d': ']',
'5e': 'í',
'df': 'ó',
'e0': 'ú',
'61': 'a',
'62': 'b',
'e3': 'c',
'64': 'd',
'e5': 'e',
'e6': 'f',
'67': 'g',
'68': 'h',
'e9': 'i',
'ea': 'j',
'6b': 'k',
'ec': 'l',
'6d': 'm',
'6e': 'n',
'ef': 'o',
'70': 'p',
'f1': 'q',
'f2': 'r',
'73': 's',
'f4': 't',
'75': 'u',
'76': 'v',
'f7': 'w',
'f8': 'x',
'79': 'y',
'7a': 'z',
'fb': 'ç',
'7c': '÷',
'fd': 'Ñ',
'fe': 'ñ',
'7f': '',
'80': ''
}
SPECIAL_CHARS = {
'91b0': '®',
'9131': '°',
'9132': '½',
'91b3': '¿',
'91b4': '™',
'91b5': '¢',
'91b6': '£',
'9137': '♪',
'9138': 'à',
'91b9': ' ',
'91ba': 'è',
'913b': 'â',
'91bc': 'ê',
'913d': 'î',
'913e': 'ô',
'91bf': 'û'
}
EXTENDED_CHARS = {
'9220': 'Á',
'92a1': 'É',
'92a2': 'Ó',
'9223': 'Ú',
'92a4': 'Ü',
'9225': 'ü',
'9226': '',
'92a7': '¡',
'92a8': '*',
'9229': '',
'922a': '',
'92ab': '©',
'922c': '',
'92ad': '',
'92ae': '',
'922f': '',
'92b0': 'À',
'9231': 'Â',
'9232': 'Ç',
'92b3': 'È',
'9234': 'Ê',
'92b5': 'Ë',
'92b6': 'ë',
'9237': 'Î',
'9238': 'Ï',
'92b9': 'ï',
'92ba': 'Ô',
'923b': 'Ù',
'92bc': 'ù',
'923d': 'Û',
'923e': '«',
'92bf': '»',
'1320': 'Ã',
'13a1': 'ã',
'13a2': 'Í',
'1323': 'Ì',
'13a4': 'ì',
'1325': 'Ò',
'1326': 'ò',
'13a7': 'Õ',
'13a8': 'õ',
'1329': '{',
'132a': '}',
'13ab': '\\',
'132c': '^',
'13ad': '_',
'13ae': '¦',
'132f': '~',
'13b0': 'Ä',
'1331': 'ä',
'1332': 'Ö',
'13b3': 'ö',
'1334': 'ß',
'13b5': '¥',
'13b6': '¤',
'1337': '|',
'1338': 'Å',
'13b9': 'å',
'13ba': 'Ø',
'133b': 'ø',
'13bc': '',
'133d': '',
'133e': '',
'13bf': '',
}
# Cursor positioning codes
PAC_HIGH_BYTE_BY_ROW = [
'xx',
'91',
'91',
'92',
'92',
'15',
'15',
'16',
'16',
'97',
'97',
'10',
'13',
'13',
'94',
'94'
]
PAC_LOW_BYTE_BY_ROW_RESTRICTED = [
'xx',
'd0',
'70',
'd0',
'70',
'd0',
'70',
'd0',
'70',
'd0',
'70',
'd0',
'd0',
'70',
'd0',
'70'
]
# High order bytes come first, then each key contains a list of low bytes.
# Any of the values in that list, coupled with the high order byte will
# map to the (row, column) tuple.
# This particular dictionary will get transformed to a more suitable form for
# usage like PAC_BYTES_TO_POSITIONING_MAP[u'91'][u'd6'] = (1, 12)
PAC_BYTES_TO_POSITIONING_MAP = {
'91': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (1, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (2, 0), # noqa
('52', 'd3'): (1, 4),
('54', 'd5'): (1, 8),
('d6', '57'): (1, 12),
('58', 'd9'): (1, 16),
('da', '5b'): (1, 20),
('dc', '5d'): (1, 24),
('5e', 'df'): (1, 28),
('f2', '73'): (2, 4),
('f4', '75'): (2, 8),
('76', 'f7'): (2, 12),
('f8', '79'): (2, 16),
('7a', 'fb'): (2, 20),
('7c', 'fd'): (2, 24),
('fe', '7f'): (2, 28)
},
'92': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (3, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (4, 0), # noqa
('52', 'd3'): (3, 4),
('54', 'd5'): (3, 8),
('d6', '57'): (3, 12),
('58', 'd9'): (3, 16),
('da', '5b'): (3, 20),
('dc', '5d'): (3, 24),
('5e', 'df'): (3, 28),
('f2', '73'): (4, 4),
('f4', '75'): (4, 8),
('76', 'f7'): (4, 12),
('f8', '79'): (4, 16),
('7a', 'fb'): (4, 20),
('7c', 'fd'): (4, 24),
('fe', '7f'): (4, 28)
},
'15': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (5, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (6, 0), # noqa
('52', 'd3'): (5, 4),
('54', 'd5'): (5, 8),
('d6', '57'): (5, 12),
('58', 'd9'): (5, 16),
('da', '5b'): (5, 20),
('dc', '5d'): (5, 24),
('5e', 'df'): (5, 28),
('f2', '73'): (6, 4),
('f4', '75'): (6, 8),
('76', 'f7'): (6, 12),
('f8', '79'): (6, 16),
('7a', 'fb'): (6, 20),
('7c', 'fd'): (6, 24),
('fe', '7f'): (6, 28)
},
'16': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (7, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (8, 0), # noqa
('52', 'd3'): (7, 4),
('54', 'd5'): (7, 8),
('d6', '57'): (7, 12),
('58', 'd9'): (7, 16),
('da', '5b'): (7, 20),
('dc', '5d'): (7, 24),
('5e', 'df'): (7, 28),
('f2', '73'): (8, 4),
('f4', '75'): (8, 8),
('76', 'f7'): (8, 12),
('f8', '79'): (8, 16),
('7a', 'fb'): (8, 20),
('7c', 'fd'): (8, 24),
('fe', '7f'): (8, 28)
},
'97': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (9, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (10, 0), # noqa
('52', 'd3'): (9, 4),
('54', 'd5'): (9, 8),
('d6', '57'): (9, 12),
('58', 'd9'): (9, 16),
('da', '5b'): (9, 20),
('dc', '5d'): (9, 24),
('5e', 'df'): (9, 28),
('f2', '73'): (10, 4),
('f4', '75'): (10, 8),
('76', 'f7'): (10, 12),
('f8', '79'): (10, 16),
('7a', 'fb'): (10, 20),
('7c', 'fd'): (10, 24),
('fe', '7f'): (10, 28)
},
'10': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (11, 0), # noqa
('52', 'd3'): (11, 4),
('54', 'd5'): (11, 8),
('d6', '57'): (11, 12),
('58', 'd9'): (11, 16),
('da', '5b'): (11, 20),
('dc', '5d'): (11, 24),
('5e', 'df'): (11, 28),
},
'13': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (12, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (13, 0), # noqa
('52', 'd3'): (12, 4),
('54', 'd5'): (12, 8),
('d6', '57'): (12, 12),
('58', 'd9'): (12, 16),
('da', '5b'): (12, 20),
('dc', '5d'): (12, 24),
('5e', 'df'): (12, 28),
('f2', '73'): (13, 4),
('f4', '75'): (13, 8),
('76', 'f7'): (13, 12),
('f8', '79'): (13, 16),
('7a', 'fb'): (13, 20),
('7c', 'fd'): (13, 24),
('fe', '7f'): (13, 28)
},
'94': {
('d0', '51', 'c2', '43', 'c4', '45', '46', 'c7', 'c8', '49', '4a', 'cb', '4c', 'cd'): (14, 0), # noqa
('70', 'f1', '62', 'e3', '64', 'e5', 'e6', '67', '68', 'e9', 'ea', '6b', 'ec', '6d'): (15, 0), # noqa
('52', 'd3'): (14, 4),
('54', 'd5'): (14, 8),
('d6', '57'): (14, 12),
('58', 'd9'): (14, 16),
('da', '5b'): (14, 20),
('dc', '5d'): (14, 24),
('5e', 'df'): (14, 28),
('f2', '73'): (15, 4),
('f4', '75'): (15, 8),
('76', 'f7'): (15, 12),
('f8', '79'): (15, 16),
('7a', 'fb'): (15, 20),
('7c', 'fd'): (15, 24),
('fe', '7f'): (15, 28)
}
}
def _create_position_to_bytes_map(bytes_to_pos):
result = {}
for high_byte, low_byte_dict in list(bytes_to_pos.items()):
# must contain mappings to column, to the tuple of possible values
for low_byte_list in list(low_byte_dict.keys()):
column = bytes_to_pos[high_byte][low_byte_list][1]
row = bytes_to_pos[high_byte][low_byte_list][0]
if row not in result:
result[row] = {}
result[row][column] = (
tuple(product([high_byte], low_byte_list)))
return result
# (Almost) the reverse of PAC_BYTES_TO_POSITIONING_MAP. Call with arguments
# like for example [15][4] to get the tuple ((u'94', u'f2'), (u'94', u'73'))
POSITIONING_TO_PAC_MAP = _create_position_to_bytes_map(
PAC_BYTES_TO_POSITIONING_MAP
)
def _restructure_bytes_to_position_map(byte_to_pos_map):
return {
k_: {
low_byte: byte_to_pos_map[k_][low_byte_list]
for low_byte_list in list(v_.keys()) for low_byte in low_byte_list
}
for k_, v_ in list(byte_to_pos_map.items())
}
# Now use the dict with arguments like [u'91'][u'75'] directly.
PAC_BYTES_TO_POSITIONING_MAP = _restructure_bytes_to_position_map(
PAC_BYTES_TO_POSITIONING_MAP)
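# A couple of concrete lookups after the restructuring (for orientation):
# PAC_BYTES_TO_POSITIONING_MAP['91']['d6'] == (1, 12) and
# PAC_BYTES_TO_POSITIONING_MAP['91']['70'] == (2, 0).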
# Inverted character lookup
CHARACTER_TO_CODE = {
character: code
for code, character in viewitems(CHARACTERS)
}
SPECIAL_OR_EXTENDED_CHAR_TO_CODE = {
character: code for code, character in viewitems(EXTENDED_CHARS)
}
SPECIAL_OR_EXTENDED_CHAR_TO_CODE.update(
{character: code for code, character in viewitems(SPECIAL_CHARS)}
)
# Time to transmit a single codeword = 1 second / 29.97
MICROSECONDS_PER_CODEWORD = 1000.0 * 1000.0 / (30.0 * 1000.0 / 1001.0)
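# i.e. roughly 33,366.7 microseconds per two-byte code word at 29.97 words per second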
HEADER = 'Scenarist_SCC V1.0'

View File

@@ -0,0 +1,823 @@
from ..base import CaptionList, Caption, CaptionNode
from ..geometry import (UnitEnum, Size, Layout, Point, Alignment,
VerticalAlignmentEnum, HorizontalAlignmentEnum)
from .constants import PAC_BYTES_TO_POSITIONING_MAP, COMMANDS
import collections
class PreCaption(object):
"""
The Caption class has been refactored and now its instances must be used as
immutable objects. Some of the code in this module, however, relied on the
fact that Caption instances were mutable. For backwards compatibility,
therefore, this class was created to work as a mutable caption data holder
used to eventually instantiate an actual Caption object.
"""
def __init__(self, start=0, end=0):
self.start = start
self.end = end
self.nodes = []
self.style = {}
self.layout_info = None
def to_real_caption(self):
return Caption(
self.start, self.end, self.nodes, self.style, self.layout_info
)
class TimingCorrectingCaptionList(list):
"""List of captions. When appending new elements, it will correct the end time
of the last ones, so they end when the new caption gets added.
"last ones" could mean the last caption `append`ed or all of the last
captions with which this list was `extended`
Also, doesn't allow Nones or empty captions
"""
def __init__(self, *args, **kwargs):
super(TimingCorrectingCaptionList, self).__init__(*args, **kwargs)
self._last_batch = ()
def append(self, p_object):
"""When appending a new caption to the list, make sure the last one
has an end. Also, don't add empty captions
:type p_object: Caption | None
"""
if p_object is None or not p_object.nodes:
return
self._update_last_batch(self._last_batch, p_object)
self._last_batch = (p_object,)
super(TimingCorrectingCaptionList, self).append(p_object)
def extend(self, iterable):
"""Adds the elements in the iterable to the list, regarding the first
caption's start time as the end time for the previously added
caption(s)
:param iterable: an iterable of Caption instances
"""
appendable_items = [item for item in iterable if item and item.nodes]
self._update_last_batch(self._last_batch, *appendable_items)
self._last_batch = tuple(appendable_items)
super(TimingCorrectingCaptionList, self).extend(appendable_items)
@staticmethod
def _update_last_batch(batch, *new_captions):
"""Given a batch of captions, sets their end time equal to the start
time of the first caption in *new_captions
The start time of the first caption in new_captions should never be 0;
if it is, the SCC file is invalid.
:type batch: tuple[Caption]
:type new_captions: tuple[Caption]
"""
if not new_captions:
return
if not new_captions[0]:
return
if not new_captions[0].nodes:
return
new_caption = new_captions[0]
if batch and batch[-1].end == 0:
for caption in batch:
caption.end = new_caption.start
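# Behavior sketch (names assumed for illustration): if caption_a was appended
# with end == 0 and caption_b, starting at 2,000,000 microseconds, is appended
# next, caption_a.end becomes 2,000,000 before caption_b is stored; None and
# node-less captions are silently dropped.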
class NotifyingDict(dict):
"""Dictionary-like object, that treats one key as 'active',
and notifies observers if the active key changed
"""
# Need an unhashable object as initial value for the active key.
# That way we're sure this was never a key in the dict.
_guard = {}
def __init__(self, *args, **kwargs):
super(NotifyingDict, self).__init__(*args, **kwargs)
self.active_key = self._guard
self.observers = []
def set_active(self, key):
"""Sets the active key
:param key: any hashable object
"""
if key not in self:
raise ValueError('No such key present')
# Notify observers of the change
if key != self.active_key:
for observer in self.observers:
observer(self.active_key, key)
self.active_key = key
def get_active(self):
"""Returns the value corresponding to the active key
"""
if self.active_key is self._guard:
raise KeyError('No active key set')
return self[self.active_key]
def add_change_observer(self, observer):
"""Receives a callable function, which it will call if the active
element changes.
The observer will receive 2 positional arguments: the old and new key
:param observer: any callable that can be called with 2 positional
arguments
"""
if not callable(observer):
raise TypeError('The observer should be callable')
self.observers.append(observer)
class CaptionCreator(object):
"""Creates and maintains a collection of Captions
"""
def __init__(self):
self._collection = TimingCorrectingCaptionList()
# subset of self._collection;
# captions here will be susceptible to time corrections
self._still_editing = []
def correct_last_timing(self, end_time, force=False):
"""Called to set the time on the last Caption(s) stored with no end
time
:type force: bool
:param force: Set the end time even if there's already an end time
:type end_time: float
:param end_time: microseconds; the end of the caption;
"""
if not self._still_editing:
return
if force:
# Select all last captions
captions_to_correct = self._still_editing
elif self._still_editing[-1].end == 0:
# Only select the last captions if they haven't gotten their
# end time set yet
captions_to_correct = self._still_editing
else:
return
for caption in captions_to_correct:
caption.end = end_time
def create_and_store(self, node_buffer, start):
"""Interpreter method, will convert the buffer into one or more Caption
objects, storing them internally.
This method relies on the InstructionNodeCreator's ability to generate
InstructionNodes properly, so at this point we can convert
_InstructionNodes nodes almost 1:1 to CaptionNodes
:type node_buffer: InstructionNodeCreator
:type start: float
:param start: the start time in microseconds
"""
if node_buffer.is_empty():
return
caption = PreCaption()
caption.start = start
caption.end = 0 # Not yet known; filled in later
self._still_editing = [caption]
for instruction in node_buffer:
# skip empty elements
if instruction.is_empty():
continue
elif instruction.requires_repositioning():
caption = PreCaption()
caption.start = start
caption.end = 0
self._still_editing.append(caption)
# handle line breaks
elif instruction.is_explicit_break():
caption.nodes.append(CaptionNode.create_break(
layout_info=_get_layout_from_tuple(instruction.position)
))
# handle open italics
elif instruction.sets_italics_on():
caption.nodes.append(
CaptionNode.create_style(
True, {'italics': True},
layout_info=_get_layout_from_tuple(
instruction.position
))
)
# handle closing italics
elif instruction.sets_italics_off():
caption.nodes.append(
CaptionNode.create_style(
False, {'italics': True},
layout_info=_get_layout_from_tuple(
instruction.position)
))
# handle text
elif instruction.is_text_node():
layout_info = _get_layout_from_tuple(instruction.position)
caption.nodes.append(
CaptionNode.create_text(
instruction.get_text(), layout_info=layout_info),
)
caption.layout_info = layout_info
self._collection.extend(self._still_editing)
def get_all(self):
"""Returns the Caption collection as a CaptionList
:rtype: CaptionList
"""
caption_list = CaptionList()
for precap in self._collection:
caption_list.append(precap.to_real_caption())
return caption_list
class InstructionNodeCreator(object):
"""Creates _InstructionNode instances from characters and commands, storing
them internally
"""
def __init__(self, collection=None, position_tracker=None):
"""
:param collection: an optional collection of nodes
:param position_tracker:
:return:
"""
if not collection:
self._collection = []
else:
self._collection = collection
self._position_tracer = position_tracker
def is_empty(self):
"""Whether any text was added to the buffer
"""
return not any(element.text for element in self._collection)
def add_chars(self, *chars):
"""Adds characters to a text node (last text node, or a new one)
:param chars: tuple containing text (unicode)
"""
if not chars:
return
current_position = self._position_tracer.get_current_position()
# get or create a usable node
if (self._collection and self._collection[-1].is_text_node()
and not self._position_tracer.is_repositioning_required()):
node = self._collection[-1]
else:
# create a new node
node = _InstructionNode(position=current_position)
self._collection.append(node)
# handle a simple line break
if self._position_tracer.is_linebreak_required():
# must insert a line break here
self._collection.append(_InstructionNode.create_break(
position=current_position))
node = _InstructionNode.create_text(current_position)
self._collection.append(node)
self._position_tracer.acknowledge_linebreak_consumed()
# handle completely new positioning
elif self._position_tracer.is_repositioning_required():
self._collection.append(
_InstructionNode.create_repositioning_command(
current_position
)
)
node = _InstructionNode.create_text(current_position)
self._collection.append(node)
self._position_tracer.acknowledge_position_changed()
node.add_chars(*chars)
def interpret_command(self, command):
"""Given a command determines whether tu turn italics on or off,
or to set the positioning
This is mostly used to convert from the legacy-style commands
:type command: unicode
"""
self._update_positioning(command)
text = COMMANDS.get(command, '')
if 'italic' in text:
if 'end' not in text:
self._collection.append(
_InstructionNode.create_italics_style(
self._position_tracer.get_current_position())
)
else:
self._collection.append(
_InstructionNode.create_italics_style(
self._position_tracer.get_current_position(),
turn_on=False
)
)
def _update_positioning(self, command):
"""Sets the positioning information to use for the next nodes
:type command: unicode
"""
if len(command) != 4:
return
first, second = command[:2], command[2:]
try:
positioning = PAC_BYTES_TO_POSITIONING_MAP[first][second]
except KeyError:
pass
else:
self._position_tracer.update_positioning(positioning)
def __iter__(self):
return iter(_format_italics(self._collection))
@classmethod
def from_list(cls, stash_list, position_tracker):
"""Having received a list of instances of this class, creates a new
instance that contains all the nodes of the previous instances
(basically concatenates the many stashes into one)
:type stash_list: list[InstructionNodeCreator]
:param stash_list: a list of instances of this class
:type position_tracker: .state_machines.DefaultProvidingPositionTracker
:param position_tracker: state machine to be interrogated about the
positioning when creating a node
:rtype: InstructionNodeCreator
"""
instance = cls(position_tracker=position_tracker)
new_collection = instance._collection
for idx, stash in enumerate(stash_list):
new_collection.extend(stash._collection)
# use space to separate the stashes, but don't add final space
if idx < len(stash_list) - 1:
try:
instance._collection[-1].add_chars(' ')
except AttributeError:
pass
return instance
def _get_layout_from_tuple(position_tuple):
"""Create a Layout object from the positioning information given
The row can have a value from 1 to 15 inclusive. (vertical positioning)
The column can have a value from 0 to 31 inclusive. (horizontal)
:param position_tuple: a tuple of ints (row, col)
:type position_tuple: tuple
:rtype: Layout
"""
if not position_tuple:
return None
row, column = position_tuple
horizontal = Size(100 * column / 32.0, UnitEnum.PERCENT)
vertical = Size(100 * (row - 1) / 15.0, UnitEnum.PERCENT)
return Layout(origin=Point(horizontal, vertical),
alignment=Alignment(HorizontalAlignmentEnum.LEFT,
VerticalAlignmentEnum.TOP)
)
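# Illustrative example: the tuple (15, 0) maps to an origin of (0%, 93.33%),
# i.e. 100 * 0 / 32 horizontally and 100 * (15 - 1) / 15 vertically,
# anchored top-left.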
class _InstructionNode(object):
"""Value object, that can contain text information, or interpretable
commands (such as explicit line breaks or turning italics on/off).
These nodes will be aggregated into a RepresentableNode, which will then
be easily converted to a CaptionNode.
"""
TEXT = 0
BREAK = 1
ITALICS_ON = 2
ITALICS_OFF = 3
CHANGE_POSITION = 4
def __init__(self, text=None, position=None, type_=0):
"""
:type text: unicode
:param position: a tuple of ints (row, column)
:param type_: self.TEXT | self.BREAK | self.ITALICS_ON | self.ITALICS_OFF | self.CHANGE_POSITION
:type type_: int
"""
self.text = text
self.position = position
self._type = type_
def add_chars(self, *args):
"""This being a text node, add characters to it.
:param args:
:type args: tuple[unicode]
:return:
"""
if self.text is None:
self.text = ''
self.text += ''.join(args)
def is_text_node(self):
"""
:rtype: bool
"""
return self._type == self.TEXT
def is_empty(self):
"""
:rtype: bool
"""
if self._type == self.TEXT:
return not self.text
return False
def is_explicit_break(self):
"""
:rtype: bool
"""
return self._type == self.BREAK
def sets_italics_on(self):
"""
:rtype: bool
"""
return self._type == self.ITALICS_ON
def sets_italics_off(self):
"""
:rtype: bool
"""
return self._type == self.ITALICS_OFF
def is_italics_node(self):
"""
:rtype: bool
"""
return self._type in (self.ITALICS_OFF, self.ITALICS_ON)
def requires_repositioning(self):
"""Whether the node must be interpreted as a change in positioning
:rtype: bool
"""
return self._type == self.CHANGE_POSITION
def get_text(self):
"""A little legacy code.
"""
return ' '.join(self.text.split())
@classmethod
def create_break(cls, position):
"""Create a node, interpretable as an explicit line break
:type position: tuple[int]
:param position: a tuple (row, col) containing the positioning info
:rtype: _InstructionNode
"""
return cls(type_=cls.BREAK, position=position)
@classmethod
def create_text(cls, position, *chars):
"""Create a node interpretable as text
:type position: tuple[int]
:param position: a tuple (row, col) to mark the positioning
:type chars: tuple[unicode]
:param chars: characters to add to the text
:rtype: _InstructionNode
"""
return cls(''.join(chars), position=position)
@classmethod
def create_italics_style(cls, position, turn_on=True):
"""Create a node, interpretable as a command to switch italics on/off
:type position: tuple[int]
:param position: a tuple (row, col) to mark the positioning
:type turn_on: bool
:param turn_on: whether to turn the italics on or off
:rtype: _InstructionNode
"""
return cls(
position=position,
type_=cls.ITALICS_ON if turn_on else cls.ITALICS_OFF
)
@classmethod
def create_repositioning_command(cls, position=None):
"""Create node interpretable as a command to change the current
position
:type position: tuple[int]
"""
return cls(type_=cls.CHANGE_POSITION, position=position)
def __repr__(self): # pragma: no cover
if self._type == self.BREAK:
extra = 'BR'
elif self._type == self.TEXT:
extra = '"{}"'.format(self.text)
elif self._type in (self.ITALICS_ON, self.ITALICS_OFF):
extra = 'italics {}'.format(
'on' if self._type == self.ITALICS_ON else 'off'
)
else:
extra = 'change position'
return '<INode: {extra} >'.format(extra=extra)
def _format_italics(collection):
"""Given a raw list of _InstructionNodes, returns a new equivalent list
where all the italics nodes properly close and open.
The list is equivalent in the sense that the SCC commands that would have
generated the output list would have had the exact same visual effect,
as far as italics are concerned, as the ones that generated the input.
This is useful because the raw commands read from the SCC can't be used
the way they are by the writers for the other formats. Those other writers
require the list of CaptionNodes to be formatted in a certain way.
Note: Using state machines to manage the italics didn't work well because
we're using state machines already to track the position, and their
interactions got crazy.
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = _skip_initial_italics_off_nodes(collection)
new_collection = _skip_empty_text_nodes(new_collection)
# after this step we're guaranteed a proper ordering of the nodes
new_collection = _skip_redundant_italics_nodes(new_collection)
# after this, we're guaranteed that the italics are properly contained
# within their context
new_collection = _close_italics_before_repositioning(new_collection)
# all nodes will be closed after this step
new_collection = _ensure_final_italics_node_closes(new_collection)
# removes pairs of italics nodes that don't do anything noticeable
new_collection = _remove_noop_italics(new_collection)
return new_collection
def _remove_noop_on_off_italics(collection):
"""Return an equivalent list to `collection`. It removes the italics node
pairs that don't surround text nodes, if those nodes are in the order:
on, off
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = []
to_commit = None
for node in collection:
if node.is_italics_node() and node.sets_italics_on():
to_commit = node
continue
elif node.is_italics_node() and node.sets_italics_off():
if to_commit:
to_commit = None
continue
else:
if to_commit:
new_collection.append(to_commit)
to_commit = None
new_collection.append(node)
return new_collection
def _remove_noop_off_on_italics(collection):
"""Removes pairs of off-on italics nodes that don't surround any other
node
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = []
to_commit = None
for node in collection:
if node.is_italics_node() and node.sets_italics_off():
to_commit = node
continue
elif node.is_italics_node() and node.sets_italics_on():
if to_commit:
to_commit = None
continue
else:
if to_commit:
new_collection.append(to_commit)
to_commit = None
new_collection.append(node)
if to_commit:
new_collection.append(to_commit)
return new_collection
def _remove_noop_italics(collection):
"""Return an equivalent list to `collection`. It removes the italics node
pairs that don't surround text nodes
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = _remove_noop_on_off_italics(collection)
new_collection = _remove_noop_off_on_italics(new_collection)
return new_collection
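# Illustrative examples of the no-op removal above:
#   [italics ON, italics OFF, text]        -> [text]
#   [text, italics OFF, italics ON, text2] -> [text, text2]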
def _skip_initial_italics_off_nodes(collection):
"""Return a collection like the one given, but without the
initial <Italics OFF> nodes
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = []
can_add_italics_off_nodes = False
for node in collection:
if node.is_italics_node():
if node.sets_italics_on():
can_add_italics_off_nodes = True
new_collection.append(node)
elif can_add_italics_off_nodes:
new_collection.append(node)
else:
new_collection.append(node)
return new_collection
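# Illustrative example:
#   [italics OFF, text, italics ON, text2, italics OFF]
#   -> [text, italics ON, text2, italics OFF]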
def _skip_empty_text_nodes(collection):
"""Return an iterable containing all the nodes in the previous
collection except for the empty text nodes
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
return [node for node in collection
if not (node.is_text_node() and node.is_empty())]
def _skip_redundant_italics_nodes(collection):
"""Return a list where the <Italics On> nodes only appear after
<Italics OFF>, and vice versa. This ignores the other node types, and
only removes redundant italic nodes
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = []
state = None
for node in collection:
if node.is_italics_node():
if state is None:
state = node.sets_italics_on()
new_collection.append(node)
continue
# skip the nodes that are like the previous
if node.sets_italics_on() is state:
continue
else:
state = node.sets_italics_on()
new_collection.append(node)
return new_collection
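# Illustrative example:
#   [italics ON, italics ON, text, italics OFF, italics OFF]
#   -> [italics ON, text, italics OFF]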
def _close_italics_before_repositioning(collection):
"""Make sure that for every opened italic node, there's a corresponding
closing node.
Will insert a closing italic node, before each repositioning node
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = []
italics_on = False
last_italics_on_node = None
for idx, node in enumerate(collection):
if node.is_italics_node() and node.sets_italics_on():
italics_on = True
last_italics_on_node = node
if node.is_italics_node() and node.sets_italics_off():
italics_on = False
if node.requires_repositioning() and italics_on:
# Append an italics closing node before the position change
new_collection.append(
_InstructionNode.create_italics_style(
# The position info of this new node should be the same as
# that of the italics node it closes
position=last_italics_on_node.position,
turn_on=False
)
)
new_collection.append(node)
# Append an italics opening node after the positioning change
new_collection.append(
_InstructionNode.create_italics_style(
position=node.position
)
)
continue
new_collection.append(node)
return new_collection
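# Illustrative example:
#   [italics ON, text, reposition, text2]
#   -> [italics ON, text, italics OFF, reposition, italics ON, text2]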
def _ensure_final_italics_node_closes(collection):
"""The final italics command needs to be closed
:type collection: list[_InstructionNode]
:rtype: list[_InstructionNode]
"""
new_collection = list(collection)
italics_on = False
last_italics_on_node = None
for node in collection:
if node.is_italics_node() and node.sets_italics_on():
italics_on = True
last_italics_on_node = node
if node.is_italics_node() and node.sets_italics_off():
italics_on = False
if italics_on:
new_collection.append(
_InstructionNode.create_italics_style(
position=last_italics_on_node.position,
turn_on=False
)
)
return new_collection
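# Illustrative example: [italics ON, text] -> [italics ON, text, italics OFF]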

View File

@@ -0,0 +1,128 @@
from ..exceptions import CaptionReadSyntaxError
class _PositioningTracker(object):
"""Helps determine the positioning of a node, having kept track of
positioning-related commands.
"""
def __init__(self, positioning=None):
"""
:param positioning: positioning information (row, column)
:type positioning: tuple[int]
"""
self._positions = [positioning]
self._break_required = False
self._repositioning_required = False
def update_positioning(self, positioning):
"""Being notified of a position change, updates the internal state,
so as to be able to tell whether it was a trivial change (a simple line
break) or not.
:type positioning: tuple[int]
:param positioning: a tuple (row, col)
"""
current = self._positions[-1]
if not current:
if positioning:
# set the positioning for the first time
self._positions = [positioning]
return
row, col = current
new_row, _ = positioning
# is the new position simply one line below?
if new_row == row + 1:
self._positions.append((new_row, col))
self._break_required = True
else:
# reset the "current" position altogether.
self._positions = [positioning]
self._repositioning_required = True
def get_current_position(self):
"""Returns the current usable position
:rtype: tuple[int]
:raise: CaptionReadSyntaxError
"""
if not any(self._positions):
raise CaptionReadSyntaxError(
'No Preamble Address Code [PAC] was provided'
)
else:
return self._positions[0]
def is_repositioning_required(self):
"""Determines whether the current positioning has changed non-trivially
Trivial would mean that a line break suffices.
:rtype: bool
"""
return self._repositioning_required
def acknowledge_position_changed(self):
"""Acknowledge the position tracer that the position was changed
"""
self._repositioning_required = False
def is_linebreak_required(self):
"""If the current position is simply one line below the previous.
:rtype: bool
"""
return self._break_required
def acknowledge_linebreak_consumed(self):
"""Call to acknowledge that the line required was consumed
"""
self._break_required = False
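# Illustrative behaviour: after update_positioning((5, 10)) followed by
# update_positioning((6, 4)), is_linebreak_required() returns True and the
# original column is kept (the stack becomes [(5, 10), (6, 10)]); a jump to
# a non-adjacent row such as (9, 0) instead resets the tracked position and
# makes is_repositioning_required() return True.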
class DefaultProvidingPositionTracker(_PositioningTracker):
"""A _PositioningTracker that provides if needed a default value (14, 0), or
uses the last positioning value set anywhere in the document
"""
default = (14, 0)
def __init__(self, positioning=None, default=None):
"""
:type positioning: tuple[int]
:param positioning: a tuple of ints (row, column)
:type default: tuple[int]
:param default: a tuple of ints (row, column) to use as fallback
"""
super(DefaultProvidingPositionTracker, self).__init__(positioning)
if default:
self.default = default
def get_current_position(self):
"""Returns the currently tracked positioning, the last positioning that
was set (anywhere), or the default it was initialized with
:rtype: tuple[int]
"""
try:
return (
super(DefaultProvidingPositionTracker, self).
get_current_position()
)
except CaptionReadSyntaxError:
return self.default
def update_positioning(self, positioning):
"""If called, sets this positioning as the default, then delegates
to the super class.
:param positioning: a tuple of ints (row, col)
:type positioning: tuple[int]
"""
if positioning:
self.default = positioning
super(DefaultProvidingPositionTracker, self).update_positioning(
positioning)
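# Illustrative behaviour: before any PAC has been seen,
# get_current_position() returns the (14, 0) default instead of raising
# CaptionReadSyntaxError; after update_positioning((5, 10)), (5, 10) also
# becomes the tracker's new fallback value.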

View File

@@ -0,0 +1,155 @@
from copy import deepcopy
import six
from .base import (
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode)
from .exceptions import CaptionReadNoCaptions, InvalidInputError
class SRTReader(BaseReader):
def detect(self, content):
lines = content.splitlines()
if lines[0].isdigit() and '-->' in lines[1]:
return True
else:
return False
def read(self, content, lang='en-US'):
if type(content) != six.text_type:
raise InvalidInputError('The content is not a unicode string.')
lines = content.splitlines()
start_line = 0
captions = CaptionList()
while start_line < len(lines):
if not lines[start_line].isdigit():
break
end_line = self._find_text_line(start_line, lines)
timing = lines[start_line + 1].split('-->')
start = self._srttomicro(timing[0].strip(' \r\n'))
end = self._srttomicro(timing[1].strip(' \r\n'))
nodes = []
for line in lines[start_line + 2:end_line - 1]:
# skip extra blank lines
if not nodes or line != '':
nodes.append(CaptionNode.create_text(line))
nodes.append(CaptionNode.create_break())
if len(nodes):
# remove last line break from end of caption list
nodes.pop()
caption = Caption(start, end, nodes)
captions.append(caption)
start_line = end_line
caption_set = CaptionSet({lang: captions})
if caption_set.is_empty():
raise CaptionReadNoCaptions("empty caption file")
return caption_set
def _srttomicro(self, stamp):
timesplit = stamp.split(':')
if ',' not in timesplit[2]:
timesplit[2] += ',000'
secsplit = timesplit[2].split(',')
microseconds = (int(timesplit[0]) * 3600000000 +
int(timesplit[1]) * 60000000 +
int(secsplit[0]) * 1000000 +
int(secsplit[1]) * 1000)
return microseconds
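# e.g. _srttomicro('00:01:02,500') returns 62500000; since a missing
# milliseconds part defaults to ',000', _srttomicro('00:01:02') returns 62000000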
def _find_text_line(self, start_line, lines):
end_line = start_line
found = False
while end_line < len(lines):
if lines[end_line].strip() == "":
found = True
elif found is True:
end_line -= 1
break
end_line += 1
return end_line + 1
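# e.g. for lines == ['1', '00:00:01,000 --> 00:00:02,000', 'Hello', '', '2']
# and start_line == 0, _find_text_line returns 4: the index of the next
# cue's counter line, one past the blank separator.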
class SRTWriter(BaseWriter):
def write(self, caption_set):
caption_set = deepcopy(caption_set)
srt_captions = []
for lang in caption_set.get_languages():
srt_captions.append(
self._recreate_lang(caption_set.get_captions(lang))
)
caption_content = 'MULTI-LANGUAGE SRT\n'.join(srt_captions)
return caption_content
def _recreate_lang(self, captions):
# Merge captions that share the exact same timestamps; otherwise some
# renderers play them in reversed order (notably libass, which is widely
# used, including by VLC and MPV).
# Fixes #189 - https://github.com/pbs/pycaption/issues/189
new_captions = []
i = 0
while len(captions) > i:
# if there's a caption after this, and they have the same timestamps
if len(captions) > i+1 and captions[i].start == captions[i+1].start and captions[i].end == captions[i+1].end:
# merge them together as a new caption
new_caption = Caption(start=captions[i].start, end=captions[i].end, nodes=captions[i].nodes + captions[i+1].nodes)
# delete the caption after this as we merged them to the current one
del captions[i]
else:
# don't do anything different
new_caption = captions[i]
# add final caption to new list
new_captions.append(new_caption)
# increment index
i += 1
captions = new_captions
srt = ''
count = 1
for caption in captions:
srt += '%s\n' % count
start = caption.format_start(msec_separator=',')
end = caption.format_end(msec_separator=',')
timestamp = '%s --> %s\n' % (start[:12], end[:12])
srt += timestamp.replace('.', ',')
new_content = ''
for node in caption.nodes:
new_content = self._recreate_line(new_content, node)
# Eliminate excessive line breaks
new_content = new_content.strip()
while '\n\n' in new_content:
new_content = new_content.replace('\n\n', '\n')
srt += "%s%s" % (new_content, '\n\n')
count += 1
return srt[:-1] # remove unwanted newline at end of file
def _recreate_line(self, srt, line):
if line.type_ == CaptionNode.TEXT:
return srt + '%s ' % line.content
elif line.type_ == CaptionNode.BREAK:
return srt + '\n'
else:
return srt
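# Illustrative SRT output for a single caption running from 1s to 2s with
# the text "Hello" (a sketch of the expected formatting, not taken from a
# real run):
#
# 1
# 00:00:01,000 --> 00:00:02,000
# Hello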

View File

@@ -0,0 +1,33 @@
import os
try:
import nltk.data
except ImportError:
raise ImportError('You must install nltk==2.0.4 and numpy==1.7.1 to be able to use this.')
from pycaption.base import BaseWriter, CaptionNode
class TranscriptWriter(BaseWriter):
def __init__(self, *args, **kw):
self.nltk = nltk.data.load('file:%s/english.pickle' %
os.path.dirname(__file__))
def write(self, captions):
transcripts = []
for lang in captions.get_languages():
lang_transcript = '* %s Transcript *\n' % lang.upper()
for caption in captions.get_captions(lang):
lang_transcript = self._strip_text(caption.nodes, lang_transcript)
lang_transcript = '\n'.join(self.nltk.tokenize(lang_transcript))
transcripts.append(lang_transcript)
return '\n'.join(transcripts)
def _strip_text(self, elements, lang_transcript):
for el in elements:
if el.type_ == CaptionNode.TEXT:
lang_transcript += el.content
return lang_transcript

View File

@@ -0,0 +1,10 @@
def is_leaf(element):
"""
Return True if the element is a leaf, False otherwise. The element is
considered a leaf if it is either NavigableString or the "br" tag
:param element: A BeautifulSoup tag or NavigableString
"""
name = getattr(element, 'name', None)
if not name or name == 'br':
return True
return False
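# e.g. a NavigableString or a <br> tag counts as a leaf; any other named
# tag (such as <span>) does not.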

View File

@@ -0,0 +1,470 @@
import re
import six
import sys
import datetime
from copy import deepcopy
from .base import (
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode
)
from .geometry import Layout
from .exceptions import (
CaptionReadError, CaptionReadSyntaxError, CaptionReadNoCaptions,
InvalidInputError
)
# A WebVTT timing line has both start/end times and layout related settings
# (referred to as 'cue settings' in the documentation)
# The following pattern captures [start], [end] and [cue settings] if existent
from pycaption.geometry import HorizontalAlignmentEnum
TIMING_LINE_PATTERN = re.compile(r'^(\S+)\s+-->\s+(\S+)(?:\s+(.*?))?\s*$')
TIMESTAMP_PATTERN = re.compile(r'^(\d+):(\d{2})(:\d{2})?\.(\d{3})')
VOICE_SPAN_PATTERN = re.compile('<v(\\.\\w+)* ([^>]*)>')
OTHER_SPAN_PATTERN = (
re.compile(
r'</?([cibuv]|ruby|rt|lang|(\d+):(\d{2})(:\d{2})?\.(\d{3})).*?>'
)
) # These WebVTT tags are stripped off the cues on conversion
WEBVTT_VERSION_OF = {
HorizontalAlignmentEnum.LEFT: 'left',
HorizontalAlignmentEnum.CENTER: 'middle',
HorizontalAlignmentEnum.RIGHT: 'right',
HorizontalAlignmentEnum.START: 'start',
HorizontalAlignmentEnum.END: 'end'
}
DEFAULT_ALIGNMENT = 'middle'
def microseconds(h, m, s, f):
"""
Returns an integer representing a number of microseconds
:rtype: int
"""
return (int(h) * 3600 + int(m) * 60 + int(s)) * 1000000 + int(f) * 1000
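# e.g. microseconds(0, 1, 2, 500) returns 62500000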
class WebVTTReader(BaseReader):
def __init__(self, ignore_timing_errors=True, *args, **kwargs):
"""
:param ignore_timing_errors: Whether to ignore timing checks
"""
self.ignore_timing_errors = ignore_timing_errors
def detect(self, content):
return 'WEBVTT' in content
def read(self, content, lang='en-US'):
if type(content) != six.text_type:
raise InvalidInputError('The content is not a unicode string.')
caption_set = CaptionSet({lang: self._parse(content.splitlines())})
if caption_set.is_empty():
raise CaptionReadNoCaptions("empty caption file")
return caption_set
def _parse(self, lines):
captions = CaptionList()
start = None
end = None
nodes = []
layout_info = None
found_timing = False
for i, line in enumerate(lines):
if '-->' in line:
found_timing = True
timing_line = i
last_start_time = captions[-1].start if captions else 0
try:
start, end, layout_info = self._parse_timing_line(
line, last_start_time)
except CaptionReadError as e:
new_message = '%s (line %d)' % (e.args[0], timing_line)
six.reraise(type(e), type(e)(new_message), sys.exc_info()[2])
elif '' == line:
if found_timing:
if not nodes:
raise CaptionReadSyntaxError(
'Cue without content. (line %d)' % timing_line)
else:
found_timing = False
caption = Caption(
start, end, nodes, layout_info=layout_info)
captions.append(caption)
nodes = []
else:
if found_timing:
if nodes:
nodes.append(CaptionNode.create_break())
nodes.append(CaptionNode.create_text(
self._decode(line)))
else:
# it's a comment or some metadata; ignore it
pass
# Add a last caption if there are remaining nodes
if nodes:
caption = Caption(start, end, nodes, layout_info=layout_info)
captions.append(caption)
return captions
def _remove_styles(self, line):
partial_result = VOICE_SPAN_PATTERN.sub('\\2: ', line)
return OTHER_SPAN_PATTERN.sub('', partial_result)
def _validate_timings(self, start, end, last_start_time):
if start is None:
raise CaptionReadSyntaxError(
'Invalid cue start timestamp.')
if end is None:
raise CaptionReadSyntaxError('Invalid cue end timestamp.')
if start > end:
raise CaptionReadError(
'End timestamp is not greater than start timestamp.')
if start < last_start_time:
raise CaptionReadError(
'Start timestamp is not greater than or equal '
'to the start timestamp of the previous cue.')
def _parse_timing_line(self, line, last_start_time):
"""
:returns: Tuple (int, int, Layout)
"""
m = TIMING_LINE_PATTERN.search(line)
if not m:
raise CaptionReadSyntaxError(
'Invalid timing format.')
start = self._parse_timestamp(m.group(1))
end = self._parse_timestamp(m.group(2))
cue_settings = m.group(3)
if not self.ignore_timing_errors:
self._validate_timings(start, end, last_start_time)
layout_info = None
if cue_settings:
layout_info = Layout(webvtt_positioning=cue_settings)
return start, end, layout_info
def _parse_timestamp(self, timestamp):
"""Returns an integer representing a number of microseconds
:rtype: int
"""
m = TIMESTAMP_PATTERN.search(timestamp)
if not m:
raise CaptionReadSyntaxError(
'Invalid timing format.')
m = m.groups()
if m[2]:
# Timestamp takes the form of [hours]:[minutes]:[seconds].[milliseconds]
return microseconds(m[0], m[1], m[2].replace(":", ""), m[3])
else:
# Timestamp takes the form of [minutes]:[seconds].[milliseconds]
return microseconds(0, m[0], m[1], m[3])
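# e.g. _parse_timestamp('01:02.500') returns 62500000 (mm:ss.mmm form) and
# _parse_timestamp('01:02:03.250') returns 3723250000 (hh:mm:ss.mmm form)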
def _decode(self, s):
"""
Convert cue text from WebVTT XML-like format to plain unicode.
:type s: unicode
"""
s = s.strip()
# Convert voice span
s = VOICE_SPAN_PATTERN.sub('\\2: ', s)
# TODO: Add support for other WebVTT tags. For now just strip them
# off the text.
s = OTHER_SPAN_PATTERN.sub('', s)
# Replace WebVTT special XML codes with plain unicode values
s = s.replace('&lt;', '<')
s = s.replace('&gt;', '>')
s = s.replace('&lrm;', '\u200e')
s = s.replace('&rlm;', '\u200f')
s = s.replace('&nbsp;', '\u00a0')
# Must do ampersand last
s = s.replace('&amp;', '&')
return s
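# e.g. _decode('<v Fred>Hi &amp; bye') returns 'Fred: Hi & bye'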
class WebVTTWriter(BaseWriter):
HEADER = 'WEBVTT\n\n'
global_layout = None
video_width = None
video_height = None
def write(self, caption_set):
"""
:type caption_set: CaptionSet
"""
output = self.HEADER
if caption_set.is_empty():
return output
caption_set = deepcopy(caption_set)
# TODO: styles. These go into a separate CSS file, which doesn't really
# fit the API here. Figure that out. Though some style stuff can be
# done in-line. This format is a little bit crazy.
# WebVTT's language support seems to be a bit crazy, so let's just
# support a single one for now.
lang = list(caption_set.get_languages())[0]
self.global_layout = caption_set.get_layout_info(lang)
captions = caption_set.get_captions(lang)
return output + '\n'.join(
[self._write_caption(caption_set, caption) for caption in captions])
def _timestamp(self, ts):
td = datetime.timedelta(microseconds=ts)
mm, ss = divmod(td.seconds, 60)
hh, mm = divmod(mm, 60)
s = "%02d:%02d.%03d" % (mm, ss, td.microseconds/1000)
if hh:
s = "%d:%s" % (hh, s)
return s
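# e.g. _timestamp(62500000) returns '01:02.500'; with an hour or more the
# hours are prepended: _timestamp(3723250000) returns '1:02:03.250'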
def _tags_for_style(self, style):
if style == 'italics':
return ['<i>', '</i>']
elif style == 'underline':
return ['<u>', '</u>']
elif style == 'bold':
return ['<b>', '</b>']
else:
return ['', '']
def _calculate_resulting_style(self, style, caption_set):
resulting_style = {}
style_classes = []
if 'classes' in style:
style_classes = style['classes']
elif 'class' in style:
style_classes = [style['class']]
for style_class in style_classes:
sub_style = caption_set.get_style(style_class).copy()
# Recursively resolve class attributes and calculate style
resulting_style.update(self._calculate_resulting_style(sub_style, caption_set))
resulting_style.update(style)
return resulting_style
def _write_caption(self, caption_set, caption):
"""
:type caption: Caption
"""
layout_groups = self._layout_groups(caption.nodes, caption_set)
start = self._timestamp(caption.start)
end = self._timestamp(caption.end)
timespan = "{} --> {}".format(start, end)
output = ''
cue_style_tags = ['', '']
style = self._calculate_resulting_style(caption.style, caption_set)
for key, value in sorted(style.items()):
if value:
tags = self._tags_for_style(key)
# print "tags: " + str(tags) + "\n"
cue_style_tags[0] += tags[0]
cue_style_tags[1] = tags[1] + cue_style_tags[1]
for cue_text, layout in layout_groups:
if not layout:
layout = caption.layout_info or self.global_layout
cue_settings = self._cue_settings_from(layout)
output += timespan + cue_settings + '\n'
output += cue_style_tags[0] + cue_text + cue_style_tags[1] + '\n'
return output
def _cue_settings_from(self, layout):
"""
Return WebVTT cue settings string based on layout info
:type layout: Layout
:rtype: unicode
"""
if not layout:
return ''
# If it's converting from WebVTT to WebVTT, keep positioning info
# unchanged
if layout.webvtt_positioning:
return ' {}'.format(layout.webvtt_positioning)
left_offset = None
top_offset = None
cue_width = None
alignment = None
already_relative = False
if not self.relativize:
if layout.is_relative():
already_relative = True
else:
# There are absolute positioning values for this cue but the
# Writer is explicitly configured not to do any relativization.
# Ignore all positioning for this cue.
return ''
# Ensure that all positioning values are measured using percentage.
# This may raise an exception if layout.is_relative() == False
# If you want to avoid it, you have to turn off relativization by
# initializing this Writer with relativize=False.
if not already_relative:
layout = layout.as_percentage_of(
self.video_width, self.video_height)
# Ensure that when there's a left offset the caption is not pushed out
# of the screen. If the execution got this far it means origin and
# extent are already relative by now.
if self.fit_to_screen:
layout = layout.fit_to_screen()
if layout.origin:
left_offset = layout.origin.x
top_offset = layout.origin.y
if layout.extent:
cue_width = layout.extent.horizontal
if layout.padding:
if layout.padding.start and left_offset:
# Since there is no padding in WebVTT, the left padding is
# added to the total left offset (if it is defined and not
# relative),
if left_offset:
left_offset += layout.padding.start
# and removed from the total cue width
if cue_width:
cue_width -= layout.padding.start
# the right padding is cut out of the total cue width,
if layout.padding.end and cue_width:
cue_width -= layout.padding.end
# the top padding is added to the top offset
# (if it is defined and not relative)
if layout.padding.before and top_offset:
top_offset += layout.padding.before
# and the bottom padding is ignored because the cue box is only as
# long vertically as the text it contains and nothing can be cut
# out
try:
alignment = WEBVTT_VERSION_OF[layout.alignment.horizontal]
except (AttributeError, KeyError):
pass
cue_settings = ''
if alignment and alignment != 'middle':
cue_settings += " align:" + alignment
if left_offset:
cue_settings += " position:{},start".format(six.text_type(left_offset))
if top_offset:
cue_settings += " line:" + six.text_type(top_offset)
if cue_width:
cue_settings += " size:" + six.text_type(cue_width)
return cue_settings
def _layout_groups(self, nodes, caption_set):
"""
Convert a Caption's nodes to WebVTT cue or cues (depending on
whether they have the same positioning or not).
"""
if not nodes:
return []
current_layout = None
# A list with layout groups. Since WebVTT only support positioning
# for different cues, each layout group has to be represented in a
# new cue with the same timing but different positioning settings.
layout_groups = []
# A properly encoded WebVTT string (plain unicode must be properly
# escaped before being appended to this string)
s = ''
for i, node in enumerate(nodes):
if node.type_ == CaptionNode.TEXT:
if s and current_layout and node.layout_info != current_layout:
# If the positioning changes from one text node to
# another, a new WebVTT cue has to be created.
layout_groups.append((s, current_layout))
s = ''
# ATTENTION: This is where the plain unicode node content is
# finally encoded as WebVTT.
s += self._encode(node.content) or '&nbsp;'
current_layout = node.layout_info
elif node.type_ == CaptionNode.STYLE:
resulting_style = self._calculate_resulting_style(node.content, caption_set)
styles = ['italics', 'underline', 'bold']
if not node.start:
styles.reverse()
for style in styles:
if style in resulting_style and resulting_style[style]:
tags = self._tags_for_style(style)
if node.start:
s += tags[0]
else:
s += tags[1]
# TODO: Refactor pycaption and eliminate the concept of a
# "Style node"
elif node.type_ == CaptionNode.BREAK:
if i > 0 and nodes[i - 1].type_ != CaptionNode.TEXT:
s += '&nbsp;'
if i == 0: # cue text starts with a break
s += '&nbsp;'
s += '\n'
if s:
layout_groups.append((s, current_layout))
return layout_groups
def _encode(self, s):
"""
Convert cue text from plain unicode to WebVTT XML-like format
escaping illegal characters. For a list of illegal characters see:
- http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-text-span
:type s: unicode
"""
s = s.replace('&', '&amp;')
s = s.replace('<', '&lt;')
# The substring "-->" is also not allowed according to this:
# - http://dev.w3.org/html5/webvtt/#dfn-webvtt-cue-block
s = s.replace('-->', '--&gt;')
# The following characters have escaping codes for some reason, but
# they're not illegal, so for now I'll leave this commented out so that
# we stay as close as possible to the specification and avoid doing
# extra stuff "just to be safe".
# s = s.replace(u'>', u'&gt;')
# s = s.replace(u'\u200e', u'&lrm;')
# s = s.replace(u'\u200f', u'&rlm;')
# s = s.replace(u'\u00a0', u'&nbsp;')
return s
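# e.g. _encode('AT&T <3 -->') returns 'AT&amp;T &lt;3 --&gt;'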