upload

2021-09-01 02:57:54 +05:00
parent 9df940f1fd
commit bf3c3712dd
222 changed files with 1007430 additions and 0 deletions
@@ -0,0 +1,5 @@
+from .kanji_to_romaji_module import convert_hiragana_to_katakana, translate_to_romaji, translate_soukon, \
+    translate_long_vowel, translate_soukon_ch, kanji_to_romaji
+__all__ = ["load_mappings_dict", "convert_hiragana_to_katakana", "convert_katakana_to_hiragana",
+           "translate_to_romaji", "translate_soukon",
+           "translate_long_vowel", "translate_soukon_ch", "kanji_to_romaji"]
@@ -0,0 +1,154 @@
+{
+  "！": "!",
+  "＂": "\"",
+  "＃": "#",
+  "＄": "$",
+  "％": "%",
+  "＆": "&",
+  "＇": "'",
+
+  "＊": "*",
+  "＋": "+",
+  "，": ",",
+  "－": "-",
+  "．": ".",
+  "／": "\/",
+  "０": "0",
+  "１": "1",
+  "２": "2",
+  "３": "3",
+  "４": "4",
+  "５": "5",
+  "６": "6",
+  "７": "7",
+  "８": "8",
+  "９": "9",
+  "：": ":",
+  "；": ";",
+  "＜": "<",
+  "＝": "=",
+  "＞": ">",
+  "？": "?",
+  "＠": "@",
+  "Ａ": "A",
+  "Ｂ": "B",
+  "Ｃ": "C",
+  "Ｄ": "D",
+  "Ｅ": "E",
+  "Ｆ": "F",
+  "Ｇ": "G",
+  "Ｈ": "H",
+  "Ｉ": "I",
+  "Ｊ": "J",
+  "Ｋ": "K",
+  "Ｌ": "L",
+  "Ｍ": "M",
+  "Ｎ": "N",
+  "И": "N",
+  "Ｏ": "O",
+  "Ｐ": "P",
+  "Ｑ": "Q",
+  "Ｒ": "R",
+  "Ｓ": "S",
+  "Ｔ": "T",
+  "Ｕ": "U",
+  "Ｖ": "V",
+  "Ｗ": "W",
+  "Ｘ": "X",
+  "Ｙ": "Y",
+  "Ｚ": "Z",
+  "［": "[",
+  "＼": "\\",
+  "］": "]",
+  "＾": "^",
+  "＿": "_",
+  "｀": "'",
+  "ａ": "a",
+  "ｂ": "b",
+  "ｃ": "c",
+  "ｄ": "d",
+  "ｅ": "e",
+  "ｆ": "f",
+  "ｇ": "g",
+  "ｈ": "h",
+  "ｉ": "i",
+  "ｊ": "j",
+  "ｋ": "k",
+  "ｌ": "l",
+  "ｍ": "m",
+  "ｎ": "n",
+  "ｏ": "o",
+  "ｐ": "p",
+  "ｑ": "q",
+  "ｒ": "r",
+  "ｓ": "s",
+  "ｔ": "t",
+  "ｕ": "u",
+  "ｖ": "v",
+  "ｗ": "w",
+  "ｘ": "x",
+  "ｙ": "y",
+  "ｚ": "z",
+  "｛": "{",
+  "｜": "|",
+  "｝": "}",
+  "～": "~",
+
+  "Ā": "A",
+  "Ă": "A",
+  "Ą": "A",
+  "â": "a",
+  "ā": "a",
+  "ă": "a",
+  "ą": "a",
+  "Ē": "E",
+  "Ĕ": "E",
+  "Ė": "E",
+  "Ę": "E",
+  "Ě": "E",
+  "ē": "e",
+  "ĕ": "e",
+  "ė": "e",
+  "ę": "e",
+  "ě": "e",
+  "Ī": "I",
+  "Ĭ": "I",
+  "Į": "I",
+  "ī": "i",
+  "ĭ": "i",
+  "į": "i",
+  "Ō": "O",
+  "Ŏ": "O",
+  "Ő": "O",
+  "ō": "o",
+  "ŏ": "o",
+  "ő": "o",
+  "Ũ": "U",
+  "Ū": "U",
+  "Ŭ": "U",
+  "Ů": "U",
+  "Ű": "U",
+  "Ų": "U",
+  "ũ": "u",
+  "ū": "u",
+  "ŭ": "u",
+  "ů": "u",
+  "ű": "u",
+  "ų": "u",
+
+  "Ӓ": "A",
+  "Ӑ": "A",
+  "Ѐ": "E",
+  "Ё": "E",
+  "Ӧ": "O",
+  "ӓ": "a",
+  "ӑ": "a",
+  "ѐ": "e",
+  "ё": "e",
+  "ӧ": "o",
+
+  "ω": "w",
+  "€": "E",
+  "∃": "E",
+  "ϛ": "c"
+}
@@ -0,0 +1,120 @@
+{
+  "ぁ": "a",
+  "あ": "a",
+  "ぃ": "i",
+  "い": "i",
+  "ぅ": "u",
+  "う": "u",
+  "ぇ": "e",
+  "え": "e",
+  "ぉ": "o",
+  "お": "o",
+  "か": "ka",
+  "が": "ga",
+  "き": "ki",
+  "きゃ": "kya",
+  "きゅ": "kyu",
+  "きょ": "kyo",
+  "ぎ": "gi",
+  "ぎゃ": "gya",
+  "ぎゅ": "gyu",
+  "ぎょ": "gyo",
+  "く": "ku",
+  "ぐ": "gu",
+  "け": "ke",
+  "げ": "ge",
+  "こ": "ko",
+  "ご": "go",
+  "さ": "sa",
+  "ざ": "za",
+  "し": "shi",
+  "しゃ": "sha",
+  "しゅ": "shu",
+  "しょ": "sho",
+  "じ": "ji",
+  "じゃ": "ja",
+  "じゅ": "ju",
+  "じょ": "jo",
+  "す": "su",
+  "ず": "zu",
+  "せ": "se",
+  "ぜ": "ze",
+  "そ": "so",
+  "ぞ": "zo",
+  "た": "ta",
+  "だ": "da",
+  "ち": "chi",
+  "ちゃ": "cha",
+  "ちゅ": "chu",
+  "ちょ": "cho",
+  "ぢ": "ji",
+  "つ": "tsu",
+  "づ": "zu",
+  "て": "te",
+  "で": "de",
+  "と": "to",
+  "ど": "do",
+  "な": "na",
+  "に": "ni",
+  "にゃ": "nya",
+  "にゅ": "nyu",
+  "にょ": "nyo",
+  "ぬ": "nu",
+  "ね": "ne",
+  "の": "no",
+  "は": "ha",
+  "ば": "ba",
+  "ぱ": "pa",
+  "ひ": "hi",
+  "ひゃ": "hya",
+  "ひゅ": "hyu",
+  "ひょ": "hyo",
+  "び": "bi",
+  "びゃ": "bya",
+  "びゅ": "byu",
+  "びょ": "byo",
+  "ぴ": "pi",
+  "ぴゃ": "pya",
+  "ぴゅ": "pyu",
+  "ぴょ": "pyo",
+  "ふ": "fu",
+  "ぶ": "bu",
+  "ぷ": "pu",
+  "へ": "he",
+  "べ": "be",
+  "ぺ": "pe",
+  "ほ": "ho",
+  "ぼ": "bo",
+  "ぽ": "po",
+  "ま": "ma",
+  "み": "mi",
+  "みゃ": "mya",
+  "みゅ": "myu",
+  "みょ": "myo",
+  "む": "mu",
+  "め": "me",
+  "も": "mo",
+  "や": "ya",
+  "ゆ": "yu",
+  "よ": "yo",
+  "ら": "ra",
+  "り": "ri",
+  "りゃ": "rya",
+  "りゅ": "ryu",
+  "りょ": "ryo",
+  "る": "ru",
+  "れ": "re",
+  "ろ": "ro",
+  "ゎ": "wa",
+  "わ": "wa",
+  "ゐ": "wi",
+  "ゑ": "we",
+  "を": " wo ",
+  "ん": "n",
+  "ゔ": "vu",
+  "ゕ": "ka",
+  "ゖ": "ke",
+  "ゝ": "iteration_mark",
+  "ゞ": "voiced_iteration_mark",
+  "ゟ": "yori"
+}
@@ -0,0 +1,18 @@
+{
+  "今日": {
+    "w_type": "noun",
+    "romaji": "kyou"
+  },
+  "明日": {
+    "w_type": "noun",
+    "romaji": "ashita"
+  },
+  "本": {
+    "w_type": "noun",
+    "romaji": "hon"
+  },
+  "中": {
+    "w_type": "noun",
+    "romaji": "naka"
+  }
+}
@@ -0,0 +1,78 @@
+{
+  "朝日奈丸佳": {
+    "w_type": "noun",
+    "romaji": "Asahina Madoka"
+  },
+  "高海千歌": {
+    "w_type": "noun",
+    "romaji": "Takami Chika"
+  },
+  "鏡音レン": {
+    "w_type": "noun",
+    "romaji": "Kagamine Len"
+  },
+  "鏡音リン": {
+    "w_type": "noun",
+    "romaji": "Kagamine Rin"
+  },
+  "逢坂大河": {
+    "w_type": "noun",
+    "romaji": "Aisaka Taiga"
+  },
+  "水樹奈々": {
+    "w_type": "noun",
+    "romaji": "Mizuki Nana"
+  },
+  "桜内梨子": {
+    "w_type": "noun",
+    "romaji": "Sakurauchi Riko"
+  },
+  "山吹沙綾": {
+    "w_type": "noun",
+    "romaji": "Yamabuki Saaya"
+  },
+  "初音ミク": {
+    "w_type": "noun",
+    "romaji": "Hatsune Miku"
+  },
+  "渡辺曜": {
+    "w_type": "noun",
+    "romaji": "Watanabe You"
+  },
+  "原由実": {
+    "w_type": "noun",
+    "romaji": "Hara Yumi"
+  },
+  "北宇治": {
+    "w_type": "noun",
+    "romaji": "Kita Uji"
+  },
+  "六本木": {
+    "w_type": "noun",
+    "romaji": "Roppongi"
+  },
+  "久美子": {
+    "w_type": "noun",
+    "romaji": "Kumiko"
+  },
+  "政宗": {
+    "w_type": "noun",
+    "romaji": "Masamune"
+  },
+  "小林": {
+    "w_type": "noun",
+    "romaji": "Kobayashi"
+  },
+  "奥寺": {
+    "w_type": "noun",
+    "romaji": "Okudera"
+  },
+  "佐藤": {
+    "w_type": "noun",
+    "romaji": "Satou"
+  },
+  "玲子": {
+    "w_type": "noun",
+    "romaji": "Reiko"
+  }
+}
@@ -0,0 +1,159 @@
+{
+  "ァ": "a",
+  "ア": "a",
+  "ィ": "i",
+  "イ": "i",
+  "イィ": "yi",
+  "イェ": "ye",
+  "ゥ": "u",
+  "ウ": "u",
+  "ウィ": "wi",
+  "ウェ": "we",
+  "ウォ": "wo",
+  "ェ": "e",
+  "エ": "e",
+  "ォ": "o",
+  "オ": "o",
+  "カ": "ka",
+  "ガ": "ga",
+  "キ": "ki",
+  "キェ": "kye",
+  "キャ": "kya",
+  "キュ": "kyu",
+  "キョ": "kyo",
+  "ギ": "gi",
+  "ギェ": "gye",
+  "ギャ": "gya",
+  "ギュ": "gyu",
+  "ギョ": "gyo",
+  "ク": "ku",
+  "クァ": "kwa",
+  "クィ": "kwi",
+  "クェ": "kwe",
+  "クォ": "kwo",
+  "グ": "gu",
+  "グァ": "gwa",
+  "グィ": "gwi",
+  "グェ": "gwe",
+  "グォ": "gwo",
+  "ケ": "ke",
+  "ゲ": "ge",
+  "コ": "ko",
+  "ゴ": "go",
+  "サ": "sa",
+  "ザ": "za",
+  "シ": "shi",
+  "シェ": "she",
+  "シャ": "sha",
+  "シュ": "shu",
+  "ショ": "sho",
+  "ジ": "ji",
+  "ジェ": "je",
+  "ジャ": "ja",
+  "ジュ": "ju",
+  "ジョ": "jo",
+  "ス": "su",
+  "スィ": "si",
+  "ズ": "zu",
+  "ズィ": "zi",
+  "セ": "se",
+  "ゼ": "ze",
+  "ソ": "so",
+  "ゾ": "zo",
+  "タ": "ta",
+  "ダ": "da",
+  "チ": "chi",
+  "チェ": "che",
+  "チャ": "cha",
+  "チュ": "chu",
+  "チョ": "cho",
+  "ヂ": "ji",
+  "ツ": "tsu",
+  "ツァ": "tsa",
+  "ツィ": "tsi",
+  "ツェ": "tse",
+  "ツォ": "tso",
+  "ヅ": "zu",
+  "テ": "te",
+  "ティ": "ti",
+  "デ": "de",
+  "ディ": "di",
+  "ト": "to",
+  "トゥ": "tu",
+  "ド": "do",
+  "ドゥ": "du",
+  "ナ": "na",
+  "ニ": "ni",
+  "ニャ": "nya",
+  "ニュ": "nyu",
+  "ニョ": "nyo",
+  "ヌ": "nu",
+  "ネ": "ne",
+  "ノ": "no",
+  "ハ": "ha",
+  "バ": "ba",
+  "パ": "pa",
+  "ヒ": "hi",
+  "ヒャ": "hya",
+  "ヒュ": "hyu",
+  "ヒョ": "hyo",
+  "ビ": "bi",
+  "ビャ": "bya",
+  "ビュ": "byu",
+  "ビョ": "byo",
+  "ピ": "pi",
+  "ピャ": "pya",
+  "ピュ": "pyu",
+  "ピョ": "pyo",
+  "フ": "fu",
+  "ファ": "fa",
+  "フィ": "fi",
+  "フェ": "fe",
+  "フォ": "fo",
+  "ブ": "bu",
+  "プ": "pu",
+  "ヘ": "he",
+  "ベ": "be",
+  "ペ": "pe",
+  "ホ": "ho",
+  "ホゥ": "hu",
+  "ボ": "bo",
+  "ポ": "po",
+  "マ": "ma",
+  "ミ": "mi",
+  "ミャ": "mya",
+  "ミュ": "myu",
+  "ミョ": "myo",
+  "ム": "mu",
+  "メ": "me",
+  "モ": "mo",
+  "ヤ": "ya",
+  "ユ": "yu",
+  "ヨ": "yo",
+  "ラ": "ra",
+  "リ": "ri",
+  "リャ": "rya",
+  "リュ": "ryu",
+  "リョ": "ryo",
+  "ル": "ru",
+  "レ": "re",
+  "ロ": "ro",
+  "ヮ": "wa",
+  "ワ": "wa",
+  "ヰ": "wi",
+  "ヱ": "we",
+  "ヲ": "wo",
+  "ン": "n",
+  "ヴ": "vu",
+  "ヴァ": "va",
+  "ヴィ": "vi",
+  "ヴェ": "ve",
+  "ヴォ": "vo",
+  "ヵ": "ka",
+  "ヶ": "ke",
+  "ヺ": "vo",
+  "・": " ",
+  "ヽ": "iteration_mark",
+  "ヾ": "voiced_iteration_mark",
+  "ヿ": "koto"
+}
@@ -0,0 +1,103 @@
+{
+  "\u200b": "",
+
+  "「": "[",
+  "」": "]",
+
+  "『": "[",
+  "』": "]",
+
+  "（": "(",
+  "）": ")",
+
+  "［": "[",
+  "］": "]",
+
+  "｛": "{",
+  "｝": "}",
+
+  "〈": "(",
+  "〉": ")",
+
+  "【": "[",
+  "】": "]",
+
+  "〔": "[",
+  "〕": "]",
+
+  "〖": "[",
+  "〗": "]",
+
+  "〘": "[",
+  "〙": "]",
+
+  "〚": "[",
+  "〛": "]",
+
+  "゠": "--",
+  "〓": "-",
+  "＝": "=",
+
+  "〜": "~",
+  "…": "_",
+
+  "※": "",
+
+  "♪": "",
+  "♫": "",
+  "♬": "",
+  "♩": "",
+
+  "！": "!",
+  "？": "?",
+
+  "、": ",",
+  "♥": " ",
+  "«": "(",
+  "»": ")",
+  "≪": "(",
+  "≫": ")",
+  "∕": "-",
+  "”": "",
+  "“": "",
+
+  "゙": "",
+  "’": "'",
+  "": "",
+  "→": "",
+  "⇒": "",
+  "∞": " ",
+  "☆": " ",
+  "♠": " ",
+  "ᷨ": " ",
+  "ꯑ": " ",
+  "ᤙ": " ",
+  "": " ",
+  "△": "" ,
+  "★": " ",
+  "♡": " ",
+  "。": "",
+  "゚": "",
+  "(": "(",
+  ")": ")",
+  "∀": "a",
+  "ά": "a",
+  "ɪ": "I",
+  "˥": "l",
+  "ﾟ": "",
+  "—": "-",
+  "Я": "",
+  "Ψ": "",
+  "┐": "",
+  "ə": "",
+  "ˈ": "",
+  "×": " x ",
+  "†": "",
+  "≡": " ",
+  "⁄": "",
+  "–": "-",
+  "⇔": " ",
+  "≒": " ",
+  "〆": "shime",
+  "\u3000": " "
+}
@@ -0,0 +1,646 @@
+# coding=utf-8
+import os
+import sys
+from collections import OrderedDict
+
+try:
+    # noinspection PyPackageRequirements
+    import simplejson as json
+except ImportError:
+    import json
+
+from .models import UnicodeRomajiMapping
+from .models import KanjiBlock
+from .models import Particle
+
+# Directory containing this module; the JSON mapping tables live in the
+# "jp_mappings" sub-directory next to it.
+PATH_TO_MODULE = os.path.dirname(__file__)
+JP_MAPPINGS_PATH = os.path.join(PATH_TO_MODULE, "jp_mappings")
+
+# Iteration marks: repeat the previous kana (the voiced variants repeat it
+# with dakuten) or, for the kanji mark, the previous kanji block.
+hiragana_iter_mark = "ゝ"
+hiragana_voiced_iter_mark = "ゞ"
+katakana_iter_mark = "ヽ"
+katakana_voiced_iter_mark = "ヾ"
+kanji_iteration_mark = "々"
+
+# Soukon (small tsu) doubles the following consonant; the long vowel mark
+# repeats the preceding vowel.
+# NOTE: "hirgana" is a misspelling of "hiragana"; the name is kept because
+# other functions in this module reference it.
+hirgana_soukon_unicode_char = "っ"
+katakana_soukon_unicode_char = "ッ"
+katakana_long_vowel_mark = "ー"
+
+
+def load_kana_mappings_dict():
+    kana_romaji_mapping = {}
+    for f in os.listdir(JP_MAPPINGS_PATH):
+        if os.path.splitext(f)[1] == ".json" and "kanji" not in f:
+            with open(os.path.join(JP_MAPPINGS_PATH, f), encoding='utf-8-sig') as data_file:
+                kana_romaji_mapping.update(json.load(data_file))
+    return kana_romaji_mapping
+
+
+def load_kanji_mappings_dict():
+    """
+    read through all json files that contain "kanji" in filename
+    load json data from files to kanji_romaji_mapping dictionary
+    if the key(kanji char) has already been added to kanji_romaji_mapping then create "other_readings" key
+        "other_readings" will consist of w_type for its key and the new romaji reading for it
+        e.g:
+            {u"係り":
+                'w_type': 'noun',
+                'romaji': 'kakari',
+                {'other_readings': {'godan verb stem': 'kakawari'}
+            }
+    :return: dict - kanji to romaji mapping
+    """
+
+    kanji_romaji_mapping = {}
+    f_list = os.listdir(JP_MAPPINGS_PATH)
+    for f in f_list[:]:  # shift all conjugated files to end, lower priority for verb stems
+        if "conjugated" in f:
+            f_list.remove(f)
+            f_list.append(f)
+
+    for f in f_list:
+        if os.path.splitext(f)[1] == ".json" and "kanji" in f:
+            with open(os.path.join(JP_MAPPINGS_PATH, f), encoding='utf-8-sig') as data_file:
+                data_file_dict = json.load(data_file)
+                for k in list(data_file_dict.keys()):
+                    if k in kanji_romaji_mapping and \
+                                    data_file_dict[k]["w_type"] != kanji_romaji_mapping[k]["w_type"]:
+                        # if "other_readings" in kanji_romaji_mapping[k] and \
+                        #                 data_file_dict[k]["w_type"] in kanji_romaji_mapping[k]["other_readings"]:
+                        #     raise
+
+                        if "other_readings" not in kanji_romaji_mapping[k]:
+                            kanji_romaji_mapping[k]["other_readings"] = {}
+
+                        kanji_romaji_mapping[k]["other_readings"][data_file_dict[k]["w_type"]] = \
+                            data_file_dict[k]["romaji"]
+                    else:
+                        kanji_romaji_mapping[k] = data_file_dict[k]
+    return kanji_romaji_mapping
+
+
+def _convert_hira_kata_char(hira_or_kata_char, h_to_k=True):
+    """
+    take second last hex character from unicode and add/subtract 6 hex to it to get hiragana/katakana char
+    e.g hiragana u3041 -> 0x3041 + 0x6 = 0x30A1 -> katakana u30A1
+
+    :param hira_or_kata_char: unicode hiragana character
+    :return: converterd hiragana or katakana depending on h_to_k value
+    """
+    if h_to_k:
+        suffix_offset = 6
+    else:
+        suffix_offset = -6
+    unicode_second_last_char = list(hira_or_kata_char.encode("unicode_escape"))[-2]
+    suffix = hex(int(unicode_second_last_char, 16) + suffix_offset)
+    char_list = list(hira_or_kata_char.encode("unicode_escape"))
+    char_list[-2] = suffix[-1]
+    result_char = "".join(char_list).decode('unicode-escape').encode('utf-8')
+    return result_char
+
+
+def convert_hiragana_to_katakana(hiragana):
+    converted_str = ""
+
+    for c in hiragana:
+        if is_hiragana(c) or c in [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char]:
+            converted_str += _convert_hira_kata_char(c)
+        else:
+            converted_str += c.encode('utf-8')
+    return converted_str.decode("utf-8")
+
+
+def convert_katakana_to_hiragana(katakana):
+    converted_str = ""
+
+    for c in katakana:
+        if is_katakana(c) or c in [katakana_iter_mark, katakana_voiced_iter_mark,
+                                   katakana_soukon_unicode_char]:
+            converted_str += _convert_hira_kata_char(c, h_to_k=False)
+        else:
+            converted_str += c.encode('utf-8')
+    return converted_str.decode("utf-8")
+
+
+def is_hiragana(c):
+    hiragana_starting_unicode = "\u3041"
+    hiragana_ending_unicode = "\u3096"
+    return c not in [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char] and \
+        hiragana_starting_unicode <= c <= hiragana_ending_unicode
+
+
+def is_katakana(c):
+    katakana_starting_unicode = "\u30A1"
+    katakana_ending_unicode = "\u30F6"
+    return c not in [katakana_iter_mark, katakana_voiced_iter_mark,
+                     katakana_soukon_unicode_char, katakana_long_vowel_mark] and \
+        katakana_starting_unicode <= c <= katakana_ending_unicode
+
+
+def is_kanji(c):
+    cjk_start_range = "\u4E00"
+    cjk_end_range = "\u9FD5"
+    if isinstance(c, KanjiBlock):
+        return True
+    else:
+        return c != kanji_iteration_mark and cjk_start_range <= c <= cjk_end_range
+
+
+def get_char_type(c):
+    """
+    determine type of passed character by checking if it belongs in a certan unicode range
+    :param c: kana or kanji character
+    :return: type of character
+    """
+    char_type = None
+    if is_hiragana(c):
+        char_type = "hiragana"
+    elif is_katakana(c):
+        char_type = "katakana"
+    elif is_kanji(c):
+        char_type = "kanji"
+
+    return char_type
+
+
+def translate_particles(kana_list):
+    """
+    try to find particles which are in hirgana and turn them in to Particle objects
+    Particle will provide spacing and will be translated in to appropriate romaji (e.g wa instead of ha for は)
+
+    rules (varies depending on the hiragana char):
+        char between two KanjiBlocks(that can be nouns) then assume to be a particle
+            e.g: 私は嬉 -> KanjiBlock(私), は, KanjiBlock(嬉) -> は is particle use wa instead of ha
+        type(Kanji, Hiragana, Katakana) changes adjacent to the char
+            e.g: アパートへくる -> ト, へ, く -> katakana, へ, hiragana -> へ is a particle, use e instead of he
+        char is last char and previous char is a noun
+            e.g: 会いました友達に -> KanjiBlock(友達) which is a noun, に
+
+    :param kana_list: list of kana characters and KanjiBlock objects
+    :return: None; update the kana_list that is passed
+    """
+    def is_noun(k_block):
+        return hasattr(k_block, "w_type") and ("noun" in k_block.w_type or "pronoun" in k_block.w_type)
+
+    def type_changes(p, n):
+        if get_char_type(p) is not None and get_char_type(n) is not None:
+            return get_char_type(p) != get_char_type(n)
+        else:
+            return False
+
+    def particle_imm_follows(prev_c_, valid_prev_particles):
+        """
+        check if prev_c is a Particle object
+        check that prev_c is one of the valid_prev_particles
+        e.g: wa particle can't be followed by wa particle again but ni particle can be followed by wa.
+        :param prev_c_: previous character compared to current character in the iteration
+        :param valid_prev_particles: list of previous particles that can be followed by current character.
+        :return:
+        """
+        return isinstance(prev_c_, Particle) and prev_c_ in valid_prev_particles
+
+    no_hira_char = "\u306E"
+    ha_hira_char = "\u306F"
+    he_hira_char = "\u3078"
+    to_hira_char = "\u3068"
+    ni_hira_char = "\u306B"
+    de_hira_char = "\u3067"
+    mo_hira_char = "\u3082"
+    ga_hira_char = "\u304C"
+
+    no_prtcle = Particle("no")
+    wa_prtcle = Particle("wa")
+    e_prtcle = Particle("e")
+    to_prtcle = Particle("to")
+    ni_prtcle = Particle("ni")
+    de_prtcle = Particle("de")
+    mo_prtcle = Particle("mo")
+    ga_prtcle = Particle("ga")
+
+    for i in range(1, len(kana_list)):
+        is_last_char = False
+        prev_c = kana_list[i - 1]
+        if i == len(kana_list) - 1:
+            is_last_char = True
+            next_c = ""
+        else:
+            next_c = kana_list[i + 1]
+
+        if kana_list[i] == no_hira_char:
+            if (is_noun(prev_c) and is_noun(next_c)) or \
+                    type_changes(prev_c, next_c) or \
+                    (is_noun(prev_c) and is_last_char):
+                kana_list[i] = no_prtcle
+
+        elif kana_list[i] == ha_hira_char:
+            if (is_noun(prev_c) and isinstance(next_c, KanjiBlock)) or \
+                    type_changes(prev_c, next_c) or \
+                    particle_imm_follows(prev_c, [e_prtcle, to_prtcle, ni_prtcle, de_prtcle]) or \
+                    (is_noun(prev_c) and is_last_char):
+                kana_list[i] = wa_prtcle
+
+        elif kana_list[i] == mo_hira_char:
+            if (is_noun(prev_c) and isinstance(next_c, KanjiBlock)) or \
+                    type_changes(prev_c, next_c) or \
+                    particle_imm_follows(prev_c, [ni_prtcle, de_prtcle]) or \
+                    (is_noun(prev_c) and is_last_char):
+                kana_list[i] = mo_prtcle
+
+        elif kana_list[i] in [he_hira_char, to_hira_char, ni_hira_char, de_hira_char, ga_hira_char] and \
+                (is_noun(prev_c) and isinstance(next_c, KanjiBlock)) or \
+                type_changes(prev_c, next_c) or \
+                (is_noun(prev_c) and is_last_char):
+
+            if kana_list[i] == he_hira_char:
+                kana_list[i] = e_prtcle
+
+            elif kana_list[i] == to_hira_char:
+                kana_list[i] = to_prtcle
+
+            elif kana_list[i] == ni_hira_char:
+                kana_list[i] = ni_prtcle
+
+            elif kana_list[i] == de_hira_char:
+                kana_list[i] = de_prtcle
+
+            elif kana_list[i] == ga_hira_char:
+                kana_list[i] = ga_prtcle
+
+
+def translate_kanji_iteration_mark(kana_list):
+    """
+    translate kanji_iteration_mark: 々
+    e.g:
+        在々: zaizai
+    :param kana_list: unicode consisting of kana and kanji chars
+    :return: unicode with kanji iteration marks translated
+    """
+    prev_c = ""
+    for i in range(0, len(kana_list)):
+        if kana_list[i] == kanji_iteration_mark:
+            kana_list[i] = prev_c.romaji.strip()
+        prev_c = kana_list[i]
+
+
+def get_type_if_verb_stem(curr_chars):
+    """
+    get verb type for given verb stem. verb types can be ichidan, godan or None.
+    No stem for irregulars
+    :param curr_chars: kanji chars that is a verb stem
+    :return: type of verb stem
+    """
+    v_type = None
+
+    if "verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["w_type"]:
+        v_type = UnicodeRomajiMapping.kanji_mapping[curr_chars]["w_type"]
+
+    elif "other_readings" in UnicodeRomajiMapping.kanji_mapping[curr_chars]:
+        if "godan verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]:
+            v_type = "godan verb"
+        elif "ichidan verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]:
+            v_type = "ichidan verb"
+
+    return v_type
+
+
+def check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len):
+    """
+    if the given curr_chars has a verb stem reading then try to match it with an one of the listed verb endings
+    otherwise return/use its .romaji property
+
+    e.g:
+    kana_list = [KanjiBlock(灯り), ま, し, た]
+    curr_chars = 灯り can be verb stem reading
+    try and match 灯り with an ending within kana_list
+    灯り + ました matches
+    romaji is tomori + mashita (this modifies kana_list to remove matched ending)
+    kana_list = [tomorimashita]
+
+    kana_list = [KanjiBlock(灯り), を, 見ます]
+    curr_chars = 灯り can be verb stem reading
+    try and match 灯り with an ending within kana_list
+    no matching ending
+    romaji is akari
+    kana_list = [akari, を, 見ます]
+
+    :param kana_list:
+    :param curr_chars: KanjiBlock current characters to parse out of entire kana_list
+    :param start_pos:
+    :param char_len:
+    :return: ending kanji, ending romaji; both will be None if ending not found
+    """
+    endings = OrderedDict({})
+    endings["ませんでした"] = "masen deshita"
+    endings["ませんで"] = "masende"
+    endings["なさるな"] = "nasaruna"
+    endings["なかった"] = "nakatta"
+    endings["れて"] = "rete"
+    endings["ましょう"] = "mashou"
+    endings["ました"] = "mashita"
+    endings["まして"] = "mashite"
+    endings["ません"] = "masen"
+    endings["ないで"] = "naide"
+    endings["なさい"] = "nasai"
+    endings["ます"] = "masu"
+    endings["よう"] = "you"  # ichidan
+    endings["ない"] = "nai"
+    endings["た"] = "ta"  # ichidan
+    endings["て"] = "te"  # ichidan
+    endings["ろ"] = "ro"  # ichidan
+    endings["う"] = "u"
+
+    dict_entry = None
+
+    if "verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["w_type"]:
+        dict_entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
+
+    elif "other_readings" in UnicodeRomajiMapping.kanji_mapping[curr_chars]:
+
+        if "godan verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]:
+            dict_entry = {
+                "romaji": UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]["godan verb stem"]
+            }
+        elif "ichidan verb stem" in UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]:
+            dict_entry = {
+                "romaji": UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"]["ichidan verb stem"]
+            }
+    e_k = None
+    e_r = None
+    if dict_entry is not None:
+        for e in list(endings.keys()):
+            possible_conj = curr_chars + e
+            actual_conj = "".join(kana_list[start_pos: (start_pos + char_len + len(e))])
+            if possible_conj == actual_conj:
+                e_k = e
+                e_r = endings[e] + " "
+                break
+
+    return e_k, e_r
+
+
+def has_non_verb_stem_reading(curr_chars):
+    """
+    check if curr_chars has an alternative reading aside from the verb stem
+    :param curr_chars: unicode kanji chars to check
+    :return: true/false depending on if curr_chars has a verb stem reading
+    """
+    res = False
+
+    if "verb stem" not in UnicodeRomajiMapping.kanji_mapping[curr_chars]["w_type"]:
+        res = True
+
+    elif "other_readings" in UnicodeRomajiMapping.kanji_mapping[curr_chars]:
+        if any(["verb stem" not in ork
+                for ork in list(UnicodeRomajiMapping.kanji_mapping[curr_chars]["other_readings"].keys())]):
+            res = True
+
+    return res
+
+
+def get_verb_stem_romaji(verb_stem_kanji):
+    """
+    find romaji for verb stem within kanji_mapping
+    :param verb_stem_kanji: unicode verb stem kanji
+    :return: romaji for verb stem kanji
+    """
+    romaji = None
+    if "verb stem" in UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]["w_type"]:
+        romaji = UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]["romaji"]
+    elif "other_readings" in UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]:
+        for k in list(UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]["other_readings"].keys()):
+            if "verb stem" in k:
+                romaji = UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]["other_readings"][k]
+                break
+
+    return romaji
+
+
+def prepare_kanjiblocks(kchar_list):
+    """
+    create and replace matched Kanji characters that are within kanji_mapping with KanjiBlock
+    KanjiBlock will be used for spacing and particle translation later
+    if the kanji found is a verb stem then try to find an ending to match it with what's in kchar_list
+
+    for every start position the longest candidate (everything up to the end of the list)
+    is tried first and then shortened one element at a time
+    the kanji mapping is loaded on first use and cached on UnicodeRomajiMapping
+
+    :param kchar_list: list containing kana and kanji characters
+    :return: new list (copy of kchar_list) with all found Kanji characters turned in to KanjiBlock objects
+    """
+    # lazy load; the mapping is cached as a class attribute for later calls
+    if len(UnicodeRomajiMapping.kanji_mapping) == 0:
+        UnicodeRomajiMapping.kanji_mapping = load_kanji_mappings_dict()
+
+    # bound is the ORIGINAL length; kana_list shrinks as blocks replace characters,
+    # so for late start positions char_len becomes <= 0 and the inner loop is skipped
+    max_char_len = len(kchar_list)
+    kana_list = list(kchar_list)
+
+    start_pos = 0
+    while start_pos < max_char_len:
+        char_len = len(kana_list) - start_pos
+        while char_len > 0:
+            curr_chars = "".join(kana_list[start_pos: (start_pos + char_len)])
+            if curr_chars in UnicodeRomajiMapping.kanji_mapping:
+                verb_stem_type = get_type_if_verb_stem(curr_chars)
+                ending_match_found = False
+                if verb_stem_type is not None:
+                    ending_kana, ending_romaji = check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len)
+                    if ending_kana is not None and ending_romaji is not None:
+                        ending_match_found = True
+                        conjugated_val = {
+                            "romaji": get_verb_stem_romaji(curr_chars) + ending_romaji,
+                            "w_type": "conjugated " + verb_stem_type
+                        }
+
+                        # remove stem + ending (back to front so indexes stay valid)
+                        # and put a single KanjiBlock in their place
+                        for i in range(start_pos + char_len - 1 + len(ending_kana), start_pos - 1, -1):
+                            del kana_list[i]
+
+                        kana_list.insert(start_pos,
+                                         KanjiBlock(curr_chars + ending_kana, conjugated_val))
+
+                # no conjugation matched: use the plain reading, but only if one exists
+                # besides the verb stem reading
+                if ending_match_found is False and has_non_verb_stem_reading(curr_chars):
+                    for i in range(start_pos + char_len - 1, start_pos - 1, -1):
+                        del kana_list[i]
+                    kana_list.insert(start_pos,
+                                     KanjiBlock(curr_chars, UnicodeRomajiMapping.kanji_mapping[curr_chars]))
+            # NOTE(review): the scan keeps shrinking char_len after a replacement; the joined
+            # text then starts with the new KanjiBlock - presumably never a mapping key, confirm
+            char_len -= 1
+        start_pos += 1
+    return kana_list
+
+
+def translate_kanji(kana_list):
+    i = 0
+    while i < len(kana_list):
+        if type(kana_list[i]) == KanjiBlock:
+            kana_list[i] = kana_list[i].romaji
+        i += 1
+
+    kana = "".join(kana_list)
+    return kana
+
+
+def prep_kanji(kana):
+    kana_list = list(kana)
+    if any([is_kanji(k) for k in kana]):
+        kana_list = prepare_kanjiblocks(kana)
+        translate_kanji_iteration_mark(kana_list)
+
+    return kana_list
+
+
+def translate_to_romaji(kana):
+    """
+    translate hiragana, katakana, typographic, and fhw latin
+    every key of the kana mapping found in kana is replaced with its mapped value;
+    two character keys are handled before single character keys
+    the kana mapping is loaded on first use and cached on UnicodeRomajiMapping
+    afterwards runs of two or more spaces are collapsed and each line is stripped
+    :param kana: unicode kana(+kanji) characters
+    :return: translated base kana characters to romaji as well as typographic, and fhw latin
+    """
+    # lazy load; the mapping is cached as a class attribute for later calls
+    if len(UnicodeRomajiMapping.kana_mapping) == 0:
+        UnicodeRomajiMapping.kana_mapping = load_kana_mappings_dict()
+
+    # longest key handled by the scan below
+    max_char_len = 2
+
+    # first pass looks at two character windows, second pass at single characters
+    for char_len in range(max_char_len, 0, -1):
+        start_pos = 0
+        while start_pos < len(kana) - char_len + 1:
+            curr_chars = kana[start_pos: (start_pos + char_len)]
+            if curr_chars in UnicodeRomajiMapping.kana_mapping:
+                # replace() rewrites the first occurrence found in the whole string
+                kana = kana.replace(curr_chars, UnicodeRomajiMapping.kana_mapping[curr_chars], 1)
+                # an empty replacement shifts the rest of the string left,
+                # so stay on the same position for the next round
+                if len(UnicodeRomajiMapping.kana_mapping[curr_chars]) == 0:
+                    start_pos -= 1
+            start_pos += 1
+
+    # collapse any run of spaces in to a single space
+    while "  " in kana:
+        kana = kana.replace("  ", " ")
+    kana = kana.strip()
+
+    # strip leading/trailing whitespace of every line separately
+    lines = kana.split("\n")
+    for i in range(0, len(lines)):
+        lines[i] = lines[i].strip()
+    kana = "\n".join(lines)
+    return kana
+
+
+def translate_soukon(partial_kana):
+    """
+    translate both hiragana and katakana soukon: っ, ッ; repeats next consonant
+    e.g:
+        ちょっと willl be choっto by the time iit is passed to this method and then becomes chotto
+    :param partial_kana: partially translated kana with base kana chars already translated to romaji
+    :return: partial kana with soukon translated
+    """
+    prev_char = ""
+
+    for c in reversed(partial_kana):
+        if c == hirgana_soukon_unicode_char or c == katakana_soukon_unicode_char:  # assuming that soukon can't be last
+            partial_kana = prev_char[0].join(partial_kana.rsplit(c, 1))
+        prev_char = c
+    return partial_kana
+
+
+def translate_long_vowel(partial_kana):
+    """
+    translate katakana long vowel ー; repeats previous vowel
+    e.g:
+        メール will be meーru by the time it is passed to this method and then becomes meeru
+    :param partial_kana: partially translated kana with base kana chars already translated to romaji
+    :return: partial kana with long vowel translated
+    """
+    prev_c = ""
+    for c in partial_kana:
+        if c == katakana_long_vowel_mark:
+            if prev_c[-1] in list("aeiou"):
+                partial_kana = partial_kana.replace(c, prev_c[-1], 1)
+            else:
+                partial_kana = partial_kana.replace(c, "", 1)
+        prev_c = c
+    return partial_kana
+
+
+def translate_soukon_ch(kana):
+    """
+    if soukon(mini-tsu) is followed by chi then soukon romaji becomes 't' sound
+    e.g: ko-soukon-chi -> kotchi instead of kocchi
+    :param kana:
+    :return:
+    """
+
+    prev_char = ""
+    hiragana_chi_unicode_char = "\u3061"
+    katakana_chi_unicode_char = "\u30C1"
+    partial_kana = kana
+    for c in reversed(kana):
+        if c == hirgana_soukon_unicode_char or c == katakana_soukon_unicode_char:  # assuming that soukon can't be last
+            if prev_char == hiragana_chi_unicode_char or prev_char == katakana_chi_unicode_char:
+                partial_kana = "t".join(partial_kana.rsplit(c, 1))
+        prev_char = c
+    return partial_kana
+
+
+def _translate_dakuten_equivalent_char(kana_char):
+    dakuten_mapping = {
+        "か": "が", "き": "ぎ", "く": "ぐ", "け": "げ", "こ": "ご",
+        "さ": "ざ", "し": "じ", "す": "ず", "せ": "ぜ", "そ": "ぞ",
+        "た": "だ", "ち": "ぢ", "つ": "づ", "て": "で", "と": "ど",
+        "は": "ば", "ひ": "び", "ふ": "ぶ", "へ": "べ", "ほ": "ぼ",
+        "タ": "ダ", "チ": "ヂ", "ツ": "ヅ", "テ": "デ", "ト": "ド",
+        "カ": "ガ", "キ": "ギ", "ク": "グ", "ケ": "ゲ", "コ": "ゴ",
+        "サ": "ザ", "シ": "ジ", "ス": "ズ", "セ": "ゼ", "ソ": "ゾ",
+        "ハ": "バ", "ヒ": "ビ", "フ": "ブ", "ヘ": "ベ", "ホ": "ボ"
+    }
+
+    dakuten_equiv = ""
+    if kana_char in dakuten_mapping:
+        dakuten_equiv = dakuten_mapping[kana_char]
+
+    return dakuten_equiv
+
+
+def translate_dakuten_equivalent(kana_char):
+    """
+    translate hiragana and katakana character to their dakuten equivalent
+    e.g:
+        ヒ: ビ
+        く: ぐ
+        み: ""
+    :param kana_char: unicode kana char
+    :return: dakuten equivalent if it exists otherwise empty string
+    """
+    return _translate_dakuten_equivalent_char(kana_char)
+
+
+def translate_kana_iteration_mark(kana):
+    """
+    translate hiragana and katakana iteration marks: ゝ, ゞ, ヽ, ヾ
+    e.g:
+        こゝ: koko
+        タヾ: tada
+        かゞみち: kagaみち
+    :param kana: unicode consisting of kana chars
+    :return: unicode with kana iteration marks translated
+    """
+    prev_char = ""
+    partial_kana = kana
+    for c in kana:
+        if c == hiragana_iter_mark or c == katakana_iter_mark:
+            partial_kana = prev_char.join(partial_kana.split(c, 1))
+        elif c == hiragana_voiced_iter_mark or c == katakana_voiced_iter_mark:
+            partial_kana = translate_dakuten_equivalent(prev_char).join(partial_kana.split(c, 1))
+        else:
+            prev_char = c
+    return partial_kana
+
+
+def kanji_to_romaji(kana):
+    if type(kana) == str:
+        kana = kana
+    pk = translate_kana_iteration_mark(kana)
+    pk = translate_soukon_ch(pk)
+    pk_list = prep_kanji(pk)
+    translate_particles(pk_list)
+    pk = translate_kanji(pk_list)
+    pk = translate_to_romaji(pk)
+    pk = translate_soukon(pk)
+    r = translate_long_vowel(pk)
+    return r.replace("\\\\", "\\").encode("unicode_escape")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        print(kanji_to_romaji(("".join(sys.argv[1:])).decode('unicode-escape')))
+    else:
+        print("Missing Kanji/Kana character argument\n" \
+              "e.g: kanji_to_romaji.py \\u30D2")
@@ -0,0 +1,29 @@
+class KanjiBlock(str):
+    """
+    Placeholder string that carries a kanji word and its romaji reading.
+
+    The string value is always "@". The data lives in attributes:
+        kanji:  the original kanji text
+        romaji: the reading, padded with spaces (see __new__)
+        w_type: the word type, or a list of types when other readings exist
+    """
+    def __new__(cls, *args, **kwargs):
+        """
+        :param args: ``args[0]`` is the kanji text and ``args[1]`` is a dict
+            with the keys "romaji" and "w_type" and, optionally,
+            "other_readings" (a dict keyed by word type)
+        """
+        obj = str.__new__(cls, "@")
+        kanji = args[0]
+        kanji_dict = args[1]
+
+        obj.kanji = kanji
+        # Single-character kanji and verb stems get only a leading space;
+        # everything else is padded on both sides.
+        if len(kanji) == 1 or "verb stem" in kanji_dict["w_type"]:
+            obj.romaji = " " + kanji_dict["romaji"]
+        else:
+            obj.romaji = " " + kanji_dict["romaji"] + " "
+
+        if "other_readings" in kanji_dict:
+            obj.w_type = [kanji_dict["w_type"]]
+            obj.w_type.extend(kanji_dict["other_readings"].keys())
+        else:
+            obj.w_type = kanji_dict["w_type"]
+        return obj
+
+    def __repr__(self):
+        # __repr__ must return str; returning the encoded bytes made repr()
+        # raise TypeError on Python 3.
+        return self.kanji.encode("unicode_escape").decode("ascii")
+
+    def __str__(self):
+        # __str__ must return str as well; romaji is already text.
+        return self.romaji
@@ -0,0 +1,6 @@
+class Particle(str):
+    """
+    String subclass whose value is the particle surrounded by single spaces.
+
+    The unpadded particle is kept in the ``pname`` attribute.
+    """
+    def __new__(cls, *args, **kwargs):
+        name = args[0]
+        padded = " " + name + " "
+        particle = super(Particle, cls).__new__(cls, padded)
+        particle.pname = name
+        return particle
@@ -0,0 +1,4 @@
+# noinspection PyClassHasNoInit
+class UnicodeRomajiMapping:  # caching
+    """Holder for the cached mapping tables, stored as class attributes."""
+    # Both dicts start empty and live on the class itself, so every user of
+    # the class sees the same two objects.
+    # NOTE(review): presumably filled by the mapping loader on first use;
+    # the code that populates them is not in this file - confirm.
+    kana_mapping = {}
+    kanji_mapping = {}
@@ -0,0 +1,5 @@
+from .UnicodeRomajiMapping import UnicodeRomajiMapping
+from .KanjiBlock import KanjiBlock
+from .Particle import Particle
+
+__all__ = ["UnicodeRomajiMapping", "KanjiBlock", "Particle"]
@@ -0,0 +1,34 @@
+from .base import (
+    CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet)
+from .dfxp import DFXPWriter, DFXPReader
+from .sami import SAMIReader, SAMIWriter
+from .srt import SRTReader, SRTWriter
+from .scc import SCCReader, SCCWriter
+from .webvtt import WebVTTReader, WebVTTWriter
+from .exceptions import (
+    CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError)
+
+
+__all__ = [
+    'CaptionConverter', 'DFXPReader', 'DFXPWriter',
+    'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter',
+    'SCCReader', 'SCCWriter', 'WebVTTReader', 'WebVTTWriter',
+    'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError',
+    'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet'
+]
+
+# Reader classes tried by detect_format(), in this order; the first one whose
+# detect() accepts the content is the one returned.
+SUPPORTED_READERS = (
+    DFXPReader, WebVTTReader, SAMIReader, SRTReader, SCCReader)
+
+
+def detect_format(caps):
+    """
+    Find the reader able to handle the provided caption string.
+
+    Each class in SUPPORTED_READERS is instantiated in turn and asked to
+    ``detect`` the content; the search stops at the first match.
+
+    :param caps: the caption content to inspect
+    :returns: the reader class for the detected format, or None when no
+        reader recognises the content.
+    """
+    matching = (
+        reader for reader in SUPPORTED_READERS if reader().detect(caps))
+    return next(matching, None)
@@ -0,0 +1,409 @@
+from datetime import timedelta
+from numbers import Number
+from six import text_type
+
+from .exceptions import CaptionReadError, CaptionReadTimingError
+
+DEFAULT_LANGUAGE_CODE = 'en-US'
+
+
+def force_byte_string(content):
+    """
+    Return ``content`` as a UTF-8 encoded byte string.
+
+    :param content: text to encode, or a byte string to pass through
+    :return: ``content`` unchanged when it already is a byte string,
+        otherwise its UTF-8 encoding
+    :raises RuntimeError: if the text cannot be encoded as UTF-8
+    """
+    # Byte strings have no .encode() on Python 3, and on Python 2 encoding
+    # one either returns it unchanged or raises UnicodeDecodeError. Handing
+    # it back directly covers both interpreters.
+    if isinstance(content, bytes):
+        return content
+    try:
+        return content.encode('UTF-8')
+    except UnicodeEncodeError:
+        raise RuntimeError('Invalid content encoding')
+
+
+class CaptionConverter(object):
+    """
+    Holds a set of captions and moves it between a reader and a writer.
+
+    ``read`` stores whatever the reader returns in ``self.captions``;
+    ``write`` hands the stored captions to a writer. An AttributeError raised
+    by the reader or writer is re-raised as a plain Exception.
+    """
+    def __init__(self, captions=None):
+        self.captions = captions or []
+
+    def read(self, content, caption_reader):
+        """Read ``content`` with ``caption_reader``; returns self."""
+        try:
+            parsed = caption_reader.read(content)
+        except AttributeError as error:
+            raise Exception(error)
+        self.captions = parsed
+        return self
+
+    def write(self, caption_writer):
+        """Return the result of ``caption_writer.write`` on the captions."""
+        try:
+            output = caption_writer.write(self.captions)
+        except AttributeError as error:
+            raise Exception(error)
+        return output
+
+
+class BaseReader(object):
+    """
+    Minimal reader: accepts any non-empty content and reads it as an empty
+    caption set.
+    """
+    def __init__(self, *args, **kwargs):
+        # Arguments are accepted and ignored.
+        pass
+
+    def detect(self, content):
+        """Return True when ``content`` is truthy, False otherwise."""
+        return bool(content)
+
+    def read(self, content):
+        """
+        Return a CaptionSet holding an empty caption list for
+        DEFAULT_LANGUAGE_CODE. ``content`` itself is not inspected.
+        """
+        # CaptionSet requires its captions mapping; calling it without
+        # arguments raised TypeError.
+        return CaptionSet({DEFAULT_LANGUAGE_CODE: []})
+
+
+class BaseWriter(object):
+    """Writer base holding the layout options shared by the writers."""
+    def __init__(self, relativize=True, video_width=None, video_height=None,
+                 fit_to_screen=True):
+        """
+        Store the layout options used when writing captions.
+
+        :param relativize: When True (the default), absolute positioning
+            values (e.g. px) are converted to percentages. ATTENTION: WebVTT
+            does not support absolute positioning. With relativize set to
+            False, a cue that carries an absolute positioning parameter has
+            all of its positioning ignored and is shown in the default
+            position.
+        :param video_width: Width of the video the captions were made for.
+            Needed for relativization.
+        :param video_height: Height of the video the captions were made for.
+            Needed for relativization.
+        :param fit_to_screen: When extent is not set, or when
+            origin + extent > 100%, (re)calculate it based on origin. This is
+            a pycaption fix for caption files that are technically valid but
+            contain inconsistent settings that may push long captions off
+            the screen.
+        """
+        self.relativize = relativize
+        self.video_width = video_width
+        self.video_height = video_height
+        self.fit_to_screen = fit_to_screen
+
+    def _relativize_and_fit_to_screen(self, layout_info):
+        """Apply the enabled layout transformations to ``layout_info``.
+
+        A falsy ``layout_info`` is returned untouched.
+        """
+        if not layout_info:
+            return layout_info
+
+        adjusted = layout_info
+        if self.relativize:
+            # Absolute values (e.g. px) become percentages
+            adjusted = adjusted.as_percentage_of(
+                self.video_width, self.video_height)
+        if self.fit_to_screen:
+            # Keep origin + extent within 100%
+            adjusted = adjusted.fit_to_screen()
+        return adjusted
+
+    def write(self, content):
+        """Return ``content`` unchanged."""
+        return content
+
+
+class Style(object):
+    """Empty placeholder class: it defines no attributes or behaviour."""
+    def __init__(self):
+        # Nothing to initialize.
+        pass
+
+
+class CaptionNode(object):
+    """
+    One element of a caption: a piece of text, a style marker, or a
+    line break.
+
+    Rules:
+        1. Every node should have its layout_info property set.
+        None means specifically that no positioning information should be
+        specified. Each reader supplies its own default values (if
+        necessary) when reading its format.
+    """
+
+    TEXT = 1
+    # When and if this is extended, it might be better to turn it into a
+    # property of the node, not a type of node itself.
+    STYLE = 2
+    BREAK = 3
+
+    def __init__(self, type_, layout_info=None):
+        """
+        :type type_: int
+        :type layout_info: Layout
+        """
+        self.type_ = type_
+        self.content = None
+
+        # Boolean. Marks the beginning/ end of a Style node.
+        self.start = None
+        self.layout_info = layout_info
+
+    def __repr__(self):
+        kind = self.type_
+
+        if kind == CaptionNode.TEXT:
+            return repr(self.content)
+        if kind == CaptionNode.BREAK:
+            return repr('BREAK')
+        if kind == CaptionNode.STYLE:
+            return repr('STYLE: %s %s' % (self.start, self.content))
+        raise RuntimeError('Unknown node type: ' + str(kind))
+
+    @staticmethod
+    def create_text(text, layout_info=None):
+        """Build a TEXT node whose content is ``text``."""
+        node = CaptionNode(CaptionNode.TEXT, layout_info=layout_info)
+        node.content = text
+        return node
+
+    @staticmethod
+    def create_style(start, content, layout_info=None):
+        """Build a STYLE node; ``start`` marks whether the style opens."""
+        node = CaptionNode(CaptionNode.STYLE, layout_info=layout_info)
+        node.content = content
+        node.start = start
+        return node
+
+    @staticmethod
+    def create_break(layout_info=None):
+        """Build a BREAK node."""
+        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
+
+
+class Caption(object):
+    """
+    A single caption, including the time and styling information
+    for its display.
+    """
+    def __init__(self, start, end, nodes, style=None, layout_info=None):
+        """
+        Initialize the Caption object
+        :param start: The start time in microseconds
+        :type start: Number
+        :param end: The end time in microseconds
+        :type end: Number
+        :param nodes: A list of CaptionNodes
+        :type nodes: list
+        :param style: A dictionary with CSS-like styling rules. A new empty
+            dict is created for each caption when omitted.
+        :type style: dict
+        :param layout_info: A Layout object with the necessary positioning
+            information
+        :type layout_info: Layout
+        :raises CaptionReadTimingError: if start or end is not a Number
+        :raises CaptionReadError: if nodes is empty
+        """
+        if not isinstance(start, Number):
+            raise CaptionReadTimingError("Captions must be initialized with a"
+                                         " valid start time")
+        if not isinstance(end, Number):
+            raise CaptionReadTimingError("Captions must be initialized with a"
+                                         " valid end time")
+        if not nodes:
+            raise CaptionReadError("Node list cannot be empty")
+        self.start = start
+        self.end = end
+        self.nodes = nodes
+        # A `{}` default would be one dict shared by every caption created
+        # without a style, so build a fresh one per instance.
+        self.style = {} if style is None else style
+        self.layout_info = layout_info
+
+    def is_empty(self):
+        """Return True when the caption has no nodes."""
+        return len(self.nodes) == 0
+
+    def format_start(self, msec_separator=None):
+        """
+        Format the start time (stored in microseconds) into an
+        HH:MM:SS.mmm string suitable for some of the supported output
+        formats (ex. SRT, DFXP).
+        """
+        return self._format_timestamp(self.start, msec_separator)
+
+    def format_end(self, msec_separator=None):
+        """
+        Format the end time (stored in microseconds) into an HH:MM:SS.mmm
+        string suitable for some of the supported output formats (ex. SRT,
+        DFXP).
+        """
+        return self._format_timestamp(self.end, msec_separator)
+
+    def __repr__(self):
+        return repr(
+            '{start} --> {end}\n{text}'.format(
+                start=self.format_start(),
+                end=self.format_end(),
+                text=self.get_text()
+            )
+        )
+
+    def get_text(self):
+        """
+        Get the text of the caption: text nodes joined together, with each
+        break node rendered as a newline and surrounding whitespace stripped.
+        """
+        def get_text_for_node(node):
+            if node.type_ == CaptionNode.TEXT:
+                return node.content
+            if node.type_ == CaptionNode.BREAK:
+                return '\n'
+            return ''
+        text_nodes = [get_text_for_node(node) for node in self.nodes]
+        return ''.join(text_nodes).strip()
+
+    def _format_timestamp(self, value, msec_separator=None):
+        """
+        Format a time in microseconds as HH:MM:SS.mmm.
+
+        :param value: time in microseconds; sub-millisecond precision is
+            dropped. Negative values are not supported.
+        :param msec_separator: replacement for the "." in front of the
+            milliseconds (e.g. "," for SRT); None keeps the "."
+        """
+        # Computed arithmetically: slicing str(timedelta) to 11 characters
+        # cut off the milliseconds once the hours needed two digits.
+        total_msec = int(value / 1000)
+        total_seconds, msec = divmod(total_msec, 1000)
+        total_minutes, seconds = divmod(total_seconds, 60)
+        hours, minutes = divmod(total_minutes, 60)
+
+        separator = '.' if msec_separator is None else msec_separator
+        return '%02d:%02d:%02d%s%03d' % (
+            hours, minutes, seconds, separator, msec)
+
+
+class CaptionList(list):
+    """ A list of captions that also carries a layout object.
+
+    Slicing, concatenation and repetition return a new CaptionList with the
+    same layout_info as this one.
+    """
+    def __init__(self, iterable=None, layout_info=None):
+        """
+        :param iterable: An iterator used to populate the caption list
+        :param Layout layout_info: A Layout object with the positioning info
+        """
+        self.layout_info = layout_info
+        if iterable:
+            super(CaptionList, self).__init__(iterable)
+        else:
+            super(CaptionList, self).__init__()
+
+    def __getslice__(self, i, j):
+        sliced = list.__getslice__(self, i, j)
+        return CaptionList(sliced, layout_info=self.layout_info)
+
+    def __getitem__(self, y):
+        result = super(CaptionList, self).__getitem__(y)
+        if not isinstance(result, Caption):
+            # Anything that is not a single Caption is re-wrapped so the
+            # layout information is kept.
+            result = CaptionList(result, layout_info=self.layout_info)
+        return result
+
+    def __add__(self, other):
+        # Concatenation is refused only when the other operand carries a
+        # truthy layout_info that differs from ours.
+        if (hasattr(other, 'layout_info') and other.layout_info
+                and not self.layout_info == other.layout_info):
+            raise ValueError(
+                "Cannot add CaptionList objects with different layout_info")
+        combined = super(CaptionList, self).__add__(other)
+        return CaptionList(combined, layout_info=self.layout_info)
+
+    def __mul__(self, other):
+        repeated = super(CaptionList, self).__mul__(other)
+        return CaptionList(repeated, layout_info=self.layout_info)
+
+    __rmul__ = __mul__
+
+
+class CaptionSet(object):
+    """
+    A set of captions in potentially multiple languages,
+    all representing the same underlying content.
+
+    The .layout_info attribute, keeps information that should be inherited
+    by all the children.
+    """
+    def __init__(self, captions, styles=None, layout_info=None):
+        """
+        :param captions: A dictionary of the format {'language': CaptionList}
+        :param styles: A dictionary with CSS-like styling rules. A new empty
+            dict is created for each set when omitted.
+        :param Layout layout_info: A Layout object with the positioning info
+        """
+        self._captions = captions
+        # add_style() writes into this dict, so a shared `{}` default would
+        # leak styles between every CaptionSet created without one.
+        self._styles = {} if styles is None else styles
+        self.layout_info = layout_info
+
+    def set_captions(self, lang, captions):
+        """Store ``captions`` as the caption list for ``lang``."""
+        self._captions[lang] = captions
+
+    def get_languages(self):
+        """Return the languages that have a caption list, as a list."""
+        return list(self._captions.keys())
+
+    def get_captions(self, lang):
+        """Return the captions for ``lang``, or an empty list if unknown."""
+        return self._captions.get(lang, [])
+
+    def add_style(self, selector, rules):
+        """
+        :param selector: The selector indicating the elements to which the
+            rules should be applied.
+        :param rules: A dictionary with CSS-like styling rules.
+        """
+        self._styles[selector] = rules
+
+    def get_style(self, selector):
+        """
+        Returns a dictionary with CSS-like styling rules for a given selector.
+        :param selector: The selector whose rules should be returned (e.g. an
+            element or class name).
+        """
+        return self._styles.get(selector, {})
+
+    def get_styles(self):
+        """Return the (selector, rules) pairs sorted by selector."""
+        return sorted(self._styles.items())
+
+    def set_styles(self, styles):
+        """Replace the whole styles dictionary."""
+        self._styles = styles
+
+    def is_empty(self):
+        """Return True when every language has an empty caption list."""
+        return all(
+            len(captions) == 0 for captions in self._captions.values()
+        )
+
+    def set_layout_info(self, lang, layout_info):
+        """Set layout_info on the caption list stored for ``lang``."""
+        self._captions[lang].layout_info = layout_info
+
+    def get_layout_info(self, lang):
+        """
+        Return the layout_info of the caption list for ``lang``, or None
+        when the language is unknown or its list is empty.
+        """
+        caption_list = self._captions.get(lang)
+        if caption_list:
+            return caption_list.layout_info
+        return None
+
+    def adjust_caption_timing(self, offset=0, rate_skew=1.0):
+        """
+        Adjust the timing according to offset and rate_skew.
+        Skew is applied first, then offset.
+
+        e.g. if skew == 1.1, and offset is 5, a caption originally
+        displayed from 10-11 seconds would instead be at 16-17.1
+
+        Captions whose adjusted start time is negative are dropped.
+        """
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                caption.start = caption.start * rate_skew + offset
+                caption.end = caption.end * rate_skew + offset
+                if caption.start >= 0:
+                    out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
+
+# Functions
+def merge_concurrent_captions(caption_set):
+    """Merge consecutive captions that share the same start and end times.
+
+    For every language, each run of neighbouring captions with an identical
+    (start, end) pair is collapsed into one caption via merge(). The caption
+    set is modified in place and also returned.
+    """
+    for lang in caption_set.get_languages():
+        merged = CaptionList()
+        group = []
+        group_span = None
+        for caption in caption_set.get_captions(lang):
+            span = (caption.start, caption.end)
+            if group and span == group_span:
+                group.append(caption)
+                continue
+            # The timespan changed: flush the finished group, start anew.
+            if group:
+                merged.append(merge(group))
+            group = [caption]
+            group_span = span
+
+        if group:
+            merged.append(merge(group))
+        if merged:
+            caption_set.set_captions(lang, merged)
+    return caption_set
+
+
+def merge(captions):
+    """
+    Combine a list of captions into a single caption.
+
+    The nodes of all captions are concatenated, with a break node between
+    the contributions of consecutive captions. Start time, end time and
+    style are taken from the first caption.
+    """
+    combined_nodes = []
+    for item in captions:
+        if combined_nodes:
+            combined_nodes.append(CaptionNode.create_break())
+        combined_nodes.extend(item.nodes)
+    first = captions[0]
+    return Caption(first.start, first.end, combined_nodes, first.style)
@@ -0,0 +1,2 @@
+from .base import *
+from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter
@@ -0,0 +1,248 @@
+# We thought about making pycaption.base objects immutable. This would be nice
+# in a lot of cases, but since the transformations on them could be quite
+# complex, the deepcopy method is good enough sometimes.
+from copy import deepcopy
+
+from .base import DFXPWriter, DFXP_DEFAULT_REGION
+from ..base import BaseWriter, CaptionNode, merge_concurrent_captions
+
+from xml.sax.saxutils import escape
+from bs4 import BeautifulSoup
+
+# Skeleton document parsed by LegacyDFXPWriter.write(); the writer fills in
+# the <styling>, <layout> and <body> elements.
+LEGACY_DFXP_BASE_MARKUP = '''
+<tt xmlns="http://www.w3.org/ns/ttml"
+    xmlns:tts="http://www.w3.org/ns/ttml#styling">
+    <head>
+        <styling/>
+        <layout/>
+    </head>
+    <body/>
+</tt>
+'''
+
+# Style written by LegacyDFXPWriter when the caption set defines no styles.
+LEGACY_DFXP_DEFAULT_STYLE = {
+    'color': 'white',
+    'font-family': 'monospace',
+    'font-size': '1c',
+}
+
+# xml:id values used for the default style and the default region.
+LEGACY_DFXP_DEFAULT_STYLE_ID = 'default'
+LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom'
+
+# Region that LegacyDFXPWriter always writes and assigns to every caption.
+LEGACY_DFXP_DEFAULT_REGION = {
+    'text-align': 'center',
+    'display-align': 'after'
+}
+
+
+class SinglePositioningDFXPWriter(DFXPWriter):
+    """DFXP writer that discards the positioning found in the captions and
+    applies a single, caller-provided value to everything instead.
+    """
+    def __init__(self, default_positioning=DFXP_DEFAULT_REGION,
+                 *args, **kwargs):
+        super(SinglePositioningDFXPWriter, self).__init__(*args, **kwargs)
+        self.default_positioning = default_positioning
+
+    def write(self, captions_set, force=''):
+        """Write a DFXP file positioned with the value given at construction
+
+        :type captions_set: pycaption.base.CaptionSet
+        :param force: only write this language, if available in the CaptionSet
+        :rtype: unicode
+        """
+        repositioned = self._create_single_positioning_caption_set(
+            captions_set, self.default_positioning)
+
+        parent = super(SinglePositioningDFXPWriter, self)
+        return parent.write(repositioned, force)
+
+    @staticmethod
+    def _create_single_positioning_caption_set(caption_set, positioning):
+        """Return a copy of the caption set in which every piece of
+        positioning information has been replaced by ``positioning``
+
+        :type caption_set: pycaption.base.CaptionSet
+        :rtype: pycaption.base.CaptionSet
+        """
+        # Work on a deep copy: mutating the caller's caption set would affect
+        # any other writer that uses the same object afterwards.
+        working_set = merge_concurrent_captions(deepcopy(caption_set))
+        working_set.layout_info = positioning
+
+        for lang in working_set.get_languages():
+            working_set.set_layout_info(lang, positioning)
+
+            for caption in working_set.get_captions(lang):
+                caption.layout_info = positioning
+
+                for node in caption.nodes:
+                    if hasattr(node, 'layout_info'):
+                        node.layout_info = positioning
+
+        # Alignment set through styles is dropped along with the positioning.
+        for _, rules in working_set.get_styles():
+            if 'text-align' in rules:
+                del rules['text-align']
+
+        return working_set
+
+
+class LegacyDFXPWriter(BaseWriter):
+    """Ported the legacy DFXPWriter from 0.4.5
+
+    Builds the DFXP document with BeautifulSoup from LEGACY_DFXP_BASE_MARKUP.
+    Every caption is assigned the single default region.
+    """
+    def __init__(self, *args, **kw):
+        # Arguments are accepted but ignored, and BaseWriter.__init__ is not
+        # called, so the attributes it would set (relativize, video_width,
+        # video_height, fit_to_screen) do not exist on this instance.
+        # p_style is not read anywhere else in this class.
+        self.p_style = False
+        # True while _recreate_span has emitted a <span> that is not closed.
+        self.open_span = False
+
+    def write(self, caption_set, force=''):
+        """Return the caption set rendered as a DFXP document string.
+
+        :param caption_set: the captions to write; a deep copy is used, so
+            the caller's object is left untouched
+        :param force: when truthy, only one language is written (see
+            _force_language)
+        """
+        caption_set = deepcopy(caption_set)
+        caption_set = merge_concurrent_captions(caption_set)
+
+        dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml')
+        dfxp.find('tt')['xml:lang'] = "en"
+
+        # Copy every non-empty style; fall back to the default style only
+        # when the caption set has no styles at all.
+        for style_id, style in caption_set.get_styles():
+            if style != {}:
+                dfxp = self._recreate_styling_tag(style_id, style, dfxp)
+        if not caption_set.get_styles():
+            dfxp = self._recreate_styling_tag(
+                LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp)
+
+        # XXX For now we will always use this default region. In the future if
+        # regions are provided, they will be kept
+        dfxp = self._recreate_region_tag(
+            LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp)
+
+        body = dfxp.find('body')
+
+        if force:
+            langs = [self._force_language(force, caption_set.get_languages())]
+        else:
+            langs = caption_set.get_languages()
+
+        # One <div> per language, one <p> per caption.
+        for lang in langs:
+            div = dfxp.new_tag('div')
+            div['xml:lang'] = '%s' % lang
+
+            for caption in caption_set.get_captions(lang):
+                if caption.style:
+                    # Mutates caption.style, which belongs to the deep copy
+                    # made at the top of this method.
+                    caption_style = caption.style
+                    caption_style.update({'region': LEGACY_DFXP_DEFAULT_REGION_ID})
+                else:
+                    caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID,
+                                     'region': LEGACY_DFXP_DEFAULT_REGION_ID}
+                p = self._recreate_p_tag(caption, caption_style, dfxp)
+                div.append(p)
+
+            body.append(div)
+
+        caption_content = dfxp.prettify(formatter=None)
+        return caption_content
+
+    # force the DFXP to only have one language, trying to match on "force"
+    def _force_language(self, force, langs):
+        """Return ``force`` when it is one of ``langs``, else the last
+        language in ``langs``."""
+        for lang in langs:
+            if force == lang:
+                return lang
+
+        return langs[-1]
+
+    def _recreate_region_tag(self, region_id, styling, dfxp):
+        """Add a <region> with id ``region_id`` to <layout>; returns dfxp.
+
+        The region is appended only if ``styling`` produced at least one
+        attribute, i.e. the tag differs from a bare one carrying just the id.
+        """
+        dfxp_region = dfxp.new_tag('region')
+        dfxp_region.attrs.update({'xml:id': region_id})
+
+        attributes = self._recreate_style(styling, dfxp)
+        dfxp_region.attrs.update(attributes)
+
+        # Bare tag with only the id, used for the comparison below.
+        new_tag = dfxp.new_tag('region')
+        new_tag.attrs.update({'xml:id': region_id})
+        if dfxp_region != new_tag:
+            dfxp.find('layout').append(dfxp_region)
+        return dfxp
+
+    def _recreate_styling_tag(self, style, content, dfxp):
+        """Add a <style> with id ``style`` to <styling>; returns dfxp.
+
+        As with regions, the tag is appended only if ``content`` produced at
+        least one attribute.
+        """
+        dfxp_style = dfxp.new_tag('style')
+        dfxp_style.attrs.update({'xml:id': style})
+
+        attributes = self._recreate_style(content, dfxp)
+        dfxp_style.attrs.update(attributes)
+
+        # Bare tag with only the id, used for the comparison below.
+        new_tag = dfxp.new_tag('style')
+        new_tag.attrs.update({'xml:id': style})
+        if dfxp_style != new_tag:
+            dfxp.find('styling').append(dfxp_style)
+
+        return dfxp
+
+    def _recreate_p_tag(self, caption, caption_style, dfxp):
+        """Build the <p> element for one caption: begin/end timestamps, the
+        rendered text and the attributes derived from ``caption_style``."""
+        start = caption.format_start()
+        end = caption.format_end()
+        p = dfxp.new_tag("p", begin=start, end=end)
+        p.string = self._recreate_text(caption, dfxp)
+
+        # Reference the style with id "p" when the document defines one.
+        if dfxp.find("style", {"xml:id": "p"}):
+            p['style'] = 'p'
+
+        p.attrs.update(self._recreate_style(caption_style, dfxp))
+
+        return p
+
+    def _recreate_text(self, caption, dfxp):
+        """Render the caption's nodes as one markup string.
+
+        Text nodes are XML-escaped and followed by a space, break nodes
+        become ``<br/>`` plus a newline and indentation, and style nodes are
+        delegated to _recreate_span. Trailing whitespace is stripped.
+        """
+        line = ''
+
+        for node in caption.nodes:
+            if node.type_ == CaptionNode.TEXT:
+                line += escape(node.content) + ' '
+
+            elif node.type_ == CaptionNode.BREAK:
+                line = line.rstrip() + '<br/>\n    '
+
+            elif node.type_ == CaptionNode.STYLE:
+                line = self._recreate_span(line, node, dfxp)
+
+        return line.rstrip()
+
+    def _recreate_span(self, line, node, dfxp):
+        """Append the span markup for a style node to ``line`` and return it.
+
+        A starting style node opens a <span> (closing a still-open one
+        first) when it yields at least one attribute; an ending style node
+        closes the open span. self.open_span is instance state: it is only
+        changed here, and is not reset between captions.
+        """
+        if node.start:
+            styles = ''
+
+            content_with_style = self._recreate_style(node.content, dfxp)
+            for style, value in list(content_with_style.items()):
+                styles += ' %s="%s"' % (style, value)
+
+            if styles:
+                if self.open_span:
+                    line = line.rstrip() + '</span> '
+                line += '<span%s>' % styles
+                self.open_span = True
+
+        elif self.open_span:
+            line = line.rstrip() + '</span> '
+            self.open_span = False
+
+        return line
+
+    def _recreate_style(self, content, dfxp):
+        """Translate CSS-like style rules into DFXP attributes.
+
+        'region' and 'class' are kept only when the document already has a
+        region / style with that xml:id. The remaining recognised keys map
+        to their tts:* attributes, and any other key is ignored.
+
+        :param content: dict of style rules
+        :return: dict of attribute name -> value
+        """
+        dfxp_style = {}
+
+        if 'region' in content:
+            if dfxp.find('region', {'xml:id': content['region']}):
+                dfxp_style['region'] = content['region']
+        if 'class' in content:
+            if dfxp.find("style", {"xml:id": content['class']}):
+                dfxp_style['style'] = content['class']
+        if 'text-align' in content:
+            dfxp_style['tts:textAlign'] = content['text-align']
+        # Only the presence of the key matters here, not its value.
+        if 'italics' in content:
+            dfxp_style['tts:fontStyle'] = 'italic'
+        if 'font-family' in content:
+            dfxp_style['tts:fontFamily'] = content['font-family']
+        if 'font-size' in content:
+            dfxp_style['tts:fontSize'] = content['font-size']
+        if 'color' in content:
+            dfxp_style['tts:color'] = content['color']
+        if 'display-align' in content:
+            dfxp_style['tts:displayAlign'] = content['display-align']
+
+        return dfxp_style
@@ -0,0 +1,40 @@
+
+
+class CaptionReadError(Exception):
+    """
+    Generic error raised when reading a caption file fails.
+
+    Its string form is the class name followed by the args tuple in
+    parentheses.
+    """
+    def __str__(self):
+        return "{0}({1})".format(self.__class__.__name__, self.args)
+
+
+class CaptionReadNoCaptions(CaptionReadError):
+    """
+    Error raised when the provided caption file does not contain any
+    actual captions.
+
+    Subclass of CaptionReadError, so handlers for the generic read error
+    catch it too.
+    """
+
+
+class CaptionReadSyntaxError(CaptionReadError):
+    """
+    Error raised when the provided caption file has syntax errors and could
+    not be parsed.
+
+    Subclass of CaptionReadError, so handlers for the generic read error
+    catch it too.
+    """
+
+
+class CaptionReadTimingError(CaptionReadError):
+    """
+    Error raised when a Caption is initialized with invalid timings.
+
+    Subclass of CaptionReadError, so handlers for the generic read error
+    catch it too.
+    """
+
+
+class RelativizationError(Exception):
+    """
+    Error raised when absolute positioning cannot be converted to a
+    percentage.
+
+    Derives directly from Exception, not from CaptionReadError.
+    """
+
+
+class InvalidInputError(RuntimeError):
+    """ Error raised when the input is invalid (i.e. a unicode string)
+
+    Derives from RuntimeError, not from CaptionReadError.
+    """
+    # NOTE(review): the code raising this error is not in this file; confirm
+    # which input type is actually rejected before relying on the "i.e."
+    # wording above.
@@ -0,0 +1,916 @@
+"""
+This module implements the classes used to represent positioning information.
+
+CONVENTIONS:
+* None of the methods should modify the state of the objects on which they're
+  called. If the values of an object need to be recalculated, the method
+  responsible for the recalculation should return a new object with the
+  necessary modifications.
+"""
+import six
+
+from enum import Enum
+from .exceptions import RelativizationError
+
+
+class UnitEnum(Enum):
+    """Enumeration-like object, specifying the units of measure for length
+
+    The value of each member is the suffix that follows the number when a
+    size is written as text (e.g. "46px" or "5%").
+
+    Usage:
+        unit = UnitEnum.PIXEL
+        unit = UnitEnum.EM
+        if unit == UnitEnum.CELL :
+            ...
+    """
+    PIXEL = 'px'
+    EM = 'em'
+    PERCENT = '%'
+    # Cell unit, see w3.org/TR/ttaf1-dfxp/#parameter-attribute-cellResolution
+    CELL = 'c'
+    PT = 'pt'
+
+
+class VerticalAlignmentEnum(Enum):
+    """Enumeration object, specifying the allowed vertical alignment options
+
+    Alignment.from_horizontal_and_vertical_align selects these members from
+    the display-align values 'before' (TOP), 'center' (CENTER) and
+    'after' (BOTTOM).
+
+    Usage:
+        alignment = VerticalAlignmentEnum.TOP
+        if alignment == VerticalAlignmentEnum.BOTTOM:
+            ...
+    """
+    TOP = 'top'
+    CENTER = 'center'
+    BOTTOM = 'bottom'
+
+
+class HorizontalAlignmentEnum(Enum):
+    """Enumeration object specifying the horizontal alignment preferences
+
+    The member values are the same strings that
+    Alignment.from_horizontal_and_vertical_align accepts as ``text_align``.
+    """
+    LEFT = 'left'
+    CENTER = 'center'
+    RIGHT = 'right'
+    START = 'start'
+    END = 'end'
+
+
+class Alignment(object):
+    def __init__(self, horizontal, vertical):
+        """
+        :type horizontal: HorizontalAlignmentEnum
+        :param horizontal: HorizontalAlignmentEnum member
+        :type vertical: VerticalAlignmentEnum
+        :param vertical: VerticalAlignmentEnum member
+        """
+        self.horizontal = horizontal
+        self.vertical = vertical
+
+    def __hash__(self):
+        return hash(
+            hash(self.horizontal) * 83 +
+            hash(self.vertical) * 89 +
+            97
+        )
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.horizontal == other.horizontal and
+            self.vertical == other.vertical
+        )
+
+    def __repr__(self):
+        return "<Alignment ({horizontal} {vertical})>".format(
+            horizontal=self.horizontal, vertical=self.vertical
+        )
+
+    def serialized(self):
+        """Returns a tuple of the useful information regarding this object
+        """
+        return self.horizontal, self.vertical
+
+    @classmethod
+    def from_horizontal_and_vertical_align(cls, text_align=None,
+                                           display_align=None):
+        horizontal_obj = None
+        vertical_obj = None
+
+        if text_align == 'left':
+            horizontal_obj = HorizontalAlignmentEnum.LEFT
+        if text_align == 'start':
+            horizontal_obj = HorizontalAlignmentEnum.START
+        if text_align == 'center':
+            horizontal_obj = HorizontalAlignmentEnum.CENTER
+        if text_align == 'right':
+            horizontal_obj = HorizontalAlignmentEnum.RIGHT
+        if text_align == 'end':
+            horizontal_obj = HorizontalAlignmentEnum.END
+
+        if display_align == 'before':
+            vertical_obj = VerticalAlignmentEnum.TOP
+        if display_align == 'center':
+            vertical_obj = VerticalAlignmentEnum.CENTER
+        if display_align == 'after':
+            vertical_obj = VerticalAlignmentEnum.BOTTOM
+
+        if not any([horizontal_obj, vertical_obj]):
+            return None
+        return cls(horizontal_obj, vertical_obj)
+
+
+class TwoDimensionalObject(object):
+    """Adds a couple useful methods to its subclasses, nothing fancy.
+    """
+    @classmethod
+    # TODO - highly cachable. Should use WeakValueDictionary here to return
+    # flyweights, not new objects.
+    def from_xml_attribute(cls, attribute):
+        """Instantiate the class from a value of the type "4px" or "5%"
+        or any number concatenated with a measuring unit (member of UnitEnum)
+
+        :type attribute: unicode
+        """
+        horizontal, vertical = six.text_type(attribute).split(' ')
+        horizontal = Size.from_string(horizontal)
+        vertical = Size.from_string(vertical)
+
+        return cls(horizontal, vertical)
+
+
+class Stretch(TwoDimensionalObject):
+    """Used for specifying the extent of a rectangle (how much it stretches),
+    or the padding in a rectangle (how much space should be left empty until
+    text can be displayed)
+    """
+    def __init__(self, horizontal, vertical):
+        """Use the .from_xxx methods. They know what's best for you.
+
+        :type horizontal: Size
+        :type vertical: Size
+        """
+        for parameter in [horizontal, vertical]:
+            if not isinstance(parameter, Size):
+                raise ValueError("Stretch must be initialized with two valid "
+                                 "Size objects.")
+        self.horizontal = horizontal
+        self.vertical = vertical
+
+    def is_measured_in(self, measure_unit):
+        """Whether the stretch is only measured in the provided units
+
+        :param measure_unit: a UnitEnum member
+        :return: True/False
+        """
+        return (
+            self.horizontal.unit == measure_unit and
+            self.vertical.unit == measure_unit
+        )
+
+    def __repr__(self):
+        return '<Stretch ({horizontal}, {vertical})>'.format(
+            horizontal=self.horizontal, vertical=self.vertical
+        )
+
+    def serialized(self):
+        """Returns a tuple of the useful attributes of this object"""
+        return (
+            None if not self.horizontal else self.horizontal.serialized(),
+            None if not self.vertical else self.vertical.serialized()
+        )
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.horizontal == other.horizontal and
+            self.vertical == other.vertical
+        )
+
+    def __hash__(self):
+        return hash(
+            hash(self.horizontal) * 59 +
+            hash(self.vertical) * 61 +
+            67
+        )
+
+    def __bool__(self):
+        return True if self.horizontal or self.vertical else False
+
+    def to_xml_attribute(self, **kwargs):
+        """Returns a unicode representation of this object as an xml attribute
+        """
+        return '{horizontal} {vertical}'.format(
+            horizontal=self.horizontal.to_xml_attribute(),
+            vertical=self.vertical.to_xml_attribute()
+        )
+
+    def is_relative(self):
+        """
+        Returns True if all dimensions are expressed as percentages,
+        False otherwise.
+        """
+        is_relative = True
+        if self.horizontal:
+            is_relative &= self.horizontal.is_relative()
+        if self.vertical:
+            is_relative &= self.vertical.is_relative()
+        return is_relative
+
+    def as_percentage_of(self, video_width, video_height):
+        """
+        Converts absolute units (e.g. px, pt etc) to percentage
+        """
+        return Stretch(
+            self.horizontal.as_percentage_of(video_width=video_width),
+            self.vertical.as_percentage_of(video_height=video_height)
+        )
+
+
+class Region(object):
+    """Represents the spatial coordinates of a rectangle
+
+    Don't instantiate by hand. use Region.from_points or Region.from_extent
+    """
+    @classmethod
+    def from_points(cls, p1, p2):
+        """Create a rectangle, knowing 2 points on the plane.
+        We assume that p1 is in the upper left (closer to the origin)
+
+        :param p1: Point instance
+        :param p2: Point instance
+        :return: a Point instance
+        """
+        inst = cls()
+        inst._p1 = p1
+        inst._p2 = p2
+        return inst
+
+    @classmethod
+    def from_extent(cls, extent, origin):
+        """Create a rectangle, knowing its upper left origin, and
+        spatial extension
+
+        :type extent: Stretch
+        :type origin: Point
+        :return: a Point instance
+        """
+        inst = cls()
+        inst._extent = extent
+        inst._origin = origin
+        return inst
+
+    @property
+    def extent(self):
+        """How wide this rectangle stretches (horizontally and vertically)
+        """
+        if hasattr(self, '_extent'):
+            return self._extent
+        else:
+            return self._p1 - self._p2
+
+    @property
+    def origin(self):
+        """Out of its 4 points, returns the one closest to the origin
+        """
+        if hasattr(self, '_origin'):
+            return self._origin
+        else:
+            return Point.align_from_origin(self._p1, self._p2)[0]
+
+    upper_left_point = origin
+
+    @property
+    def lower_right_point(self):
+        """The point furthest from the origin from the rectangle's 4 points
+        """
+        if hasattr(self, '_p2'):
+            return Point.align_from_origin(self._p1, self._p2)[1]
+        else:
+            return self.origin.add_extent(self.extent)
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.extent == other.extent and
+            self.origin == other.origin
+        )
+
+    def __hash__(self):
+        return hash(
+            hash(self.origin) * 71 +
+            hash(self.extent) * 73 +
+            79
+        )
+
+
+class Point(TwoDimensionalObject):
+    """Represent a point in 2d space.
+    """
+    def __init__(self, x, y):
+        """
+        :type x: Size
+        :type y: Size
+        """
+        for parameter in [x, y]:
+            if not isinstance(parameter, Size):
+                raise ValueError("Point must be initialized with two valid "
+                                 "Size objects.")
+        self.x = x
+        self.y = y
+
+    def __sub__(self, other):
+        """Returns an Stretch object, if the other point's units are compatible
+        """
+        return Stretch(abs(self.x - other.x), abs(self.y - other.y))
+
+    def add_stretch(self, stretch):
+        """Returns another Point instance, whose coordinates are the sum of the
+         current Point's, and the Stretch instance's.
+        """
+        return Point(self.x + stretch.horizontal, self.y + stretch.vertical)
+
+    def is_relative(self):
+        """
+        Returns True if all dimensions are expressed as percentages,
+        False otherwise.
+        """
+        is_relative = True
+        if self.x:
+            is_relative &= self.x.is_relative()
+        if self.y:
+            is_relative &= self.y.is_relative()
+        return is_relative
+
+    def as_percentage_of(self, video_width, video_height):
+        """
+        Converts absolute units (e.g. px, pt etc) to percentage
+        """
+        return Point(
+            self.x.as_percentage_of(video_width=video_width),
+            self.y.as_percentage_of(video_height=video_height)
+        )
+
+    @classmethod
+    def align_from_origin(cls, p1, p2):
+        """Returns a tuple of 2 points. The first is closest to the origin
+        on both axes than the second.
+
+        If the 2 points fulfill this condition, returns them (ordered), if not,
+        creates 2 new points.
+        """
+        if p1.x <= p2.x and p1.y <= p2.y:
+            return p1
+        if p1.x >= p2.x and p1.y >= p2.y:
+            return p2
+        else:
+            return (Point(min(p1.x, p2.x), min(p1.y, p2.y)),
+                    Point(max(p1.x, p2.x), max(p1.y, p2.y)))
+
+    def __repr__(self):
+        return '<Point ({x}, {y})>'.format(
+            x=self.x, y=self.y
+        )
+
+    def serialized(self):
+        """Returns the "useful" values of this object.
+        """
+        return (
+            None if not self.x else self.x.serialized(),
+            None if not self.y else self.y.serialized()
+        )
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.x == other.x and
+            self.y == other.y
+        )
+
+    def __hash__(self):
+        return hash(
+            hash(self.x) * 51 +
+            hash(self.y) * 53 +
+            57
+        )
+
+    def __bool__(self):
+        return True if self.x or self.y else False
+
+    def to_xml_attribute(self, **kwargs):
+        """Returns a unicode representation of this object as an xml attribute
+        """
+        return '{x} {y}'.format(
+            x=self.x.to_xml_attribute(), y=self.y.to_xml_attribute())
+
+
+@six.python_2_unicode_compatible
+class Size(object):
+    """Ties together a number with a unit, to represent a size.
+
+    Use as value objects! (don't change after creation)
+    """
+    def __init__(self, value, unit):
+        """
+        :param value: A number (float or int will do)
+        :param unit: A UnitEnum member
+        """
+        if value is None:
+            raise ValueError("Size must be initialized with a value.")
+        if not isinstance(unit,UnitEnum):
+            raise ValueError("Size must be initialized with a valid unit.")
+
+        self.value = float(value)
+        self.unit = unit
+
+    def __sub__(self, other):
+        if self.unit == other.unit:
+            return Size(self.value - other.value, self.unit)
+        else:
+            raise ValueError("The sizes should have the same measure units.")
+
+    def __abs__(self):
+        return Size(abs(self.value), self.unit)
+
+    def __cmp__(self, other):
+        if self.unit == other.unit:
+            # python3 does not have cmp
+            return (self.value > other.value) - (self.value < other.value)
+        else:
+            raise ValueError("The sizes should have the same measure units.")
+
+    def __lt__(self, other):
+        return self.value < other.value
+
+
+    def __add__(self, other):
+        if self.unit == other.unit:
+            return Size(self.value + other.value, self.unit)
+        else:
+            raise ValueError("The sizes should have the same measure units.")
+
+    def is_relative(self):
+        """
+        Returns True if value is expressed as percentage, False otherwise.
+        """
+        return self.unit == UnitEnum.PERCENT
+
+    def as_percentage_of(self, video_width=None, video_height=None):
+        """
+        :param video_width: An integer representing a width in pixels
+        :param video_height: An integer representing a height in pixels
+        """
+        value = self.value
+        unit = self.unit
+
+        if unit == UnitEnum.PERCENT:
+            return self  # Nothing to do here
+
+        # The input must be valid so that any conversion can be done
+        if not (video_width or video_height):
+            raise RelativizationError(
+                "Either video width or height must be given as a reference")
+        elif video_width and video_height:
+            raise RelativizationError(
+                "Only video width or height can be given as reference")
+
+        if unit == UnitEnum.EM:
+            # TODO: Implement proper conversion of em in function of font-size
+            # The em unit is relative to the font-size, to which we currently
+            # have no access. As a workaround, we presume the font-size is 16px,
+            # which is a common default value but not guaranteed.
+            value *= 16
+            unit = UnitEnum.PIXEL
+
+        if unit == UnitEnum.PT:
+            # XXX: we will convert first to "px" and from "px" this will be
+            # converted to percent. we don't take into consideration the
+            # font-size
+            value = value / 72.0 * 96.0
+            unit = UnitEnum.PIXEL
+
+        if unit == UnitEnum.PIXEL:
+            value = value * 100.0 / (video_width or video_height)
+            unit = UnitEnum.PERCENT
+
+        if unit == UnitEnum.CELL:
+            # TODO: Implement proper cell resolution
+            # (w3.org/TR/ttaf1-dfxp/#parameter-attribute-cellResolution)
+            # For now we will use the default values (32 columns and 15 rows)
+            cell_reference = 32 if video_width else 15
+            value = value * 100.0 / cell_reference
+            unit = UnitEnum.PERCENT
+
+        return Size(value, unit)
+
+    @classmethod
+    # TODO - this also looks highly cachable. Should use a WeakValueDict here
+    # to return flyweights
+    def from_string(cls, string):
+        """Given a string of the form "46px" or "5%" etc., returns the proper
+        size object
+
+        :param string: a number concatenated to any of the UnitEnum members.
+        :type string: unicode
+        :rtype: Size
+        """
+
+        raw_number = string
+        for unit in list(UnitEnum):
+            if raw_number.endswith(unit.value):
+                raw_number = raw_number.rstrip(unit.value)
+                break
+        else:
+            unit = None
+
+        if unit is not None:
+            value = None
+            try:
+                value = float(raw_number)
+                value = int(raw_number)
+            except ValueError:
+                pass
+
+            if value is None:
+                raise ValueError(
+                    """Couldn't recognize the value "{value}" as a number"""
+                    .format(value=raw_number)
+                )
+            instance = cls(value, unit)
+            return instance
+        else:
+            raise ValueError(
+                "The specified value is not valid because its unit "
+                "is not recognized: {value}. "
+                "The only supported units are: {supported}"
+                .format(value=raw_number, supported=', '.join(UnitEnum._member_map_))
+            )
+
+    def __repr__(self):
+        return '<Size ({value} {unit})>'.format(
+            value=self.value, unit=self.unit.value
+        )
+
+    def __str__(self):
+        value = round(self.value, 2)
+        if value.is_integer():
+            s = "{}".format(int(value))
+        else:
+            s = "{:.2f}".format(value).rstrip('0').rstrip('.')
+        return "{}{}".format(s, self.unit.value)
+
+    def to_xml_attribute(self, **kwargs):
+        """Returns a unicode representation of this object, as an xml attribute
+        """
+        return six.text_type(self)
+
+    def serialized(self):
+        """Returns the "useful" values of this object"""
+        return self.value, self.unit
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.value == other.value and
+            self.unit == other.unit
+        )
+
+    def __hash__(self):
+        return hash(
+            hash(self.value) * 41 +
+            hash(self.unit) * 43 +
+            47
+        )
+
+    def __bool__(self):
+        return self.unit in UnitEnum and self.value is not None
+
+
+class Padding(object):
+    """Represents padding information. Consists of 4 Size objects, representing
+    padding from (in this order): before (up), after (down), start (left) and
+    end (right).
+
+    A valid Padding object must always have all paddings set and different from
+    None. If this is not true Writers may fail for they rely on this assumption.
+    """
+    def __init__(self, before=None, after=None, start=None, end=None):
+        """
+        :type before: Size
+        :type after: Size
+        :type start: Size
+        :type end: Size
+        """
+        self.before = before  # top
+        self.after = after  # bottom
+        self.start = start  # left
+        self.end = end  # right
+
+        for attr in ['before', 'after', 'start', 'end']:
+            # Ensure that a Padding object always explicitly defines all
+            # four possible paddings
+            if not isinstance(getattr(self, attr), Size):
+                # Sets default padding (0%)
+                setattr(self, attr, Size(0, UnitEnum.PERCENT))
+
+    @classmethod
+    def from_xml_attribute(cls, attribute):
+        """As per the docs, the style attribute can contain 1,2,3 or 4 values.
+
+        If 1 value: apply to all edges
+        If 2: first applies to before and after, second to start and end
+        If 3: first applies to before, second to start and end, third to after
+        If 4: before, end, after, start;
+
+        http://www.w3.org/TR/ttaf1-dfxp/#style-attribute-padding
+
+        :param attribute: a string like object, representing a dfxp attr. value
+        :return: a Padding object
+        """
+        values_list = six.text_type(attribute).split(' ')
+        sizes = []
+
+        for value in values_list:
+            sizes.append(Size.from_string(value))
+
+        if len(sizes) == 1:
+            return cls(sizes[0], sizes[0], sizes[0], sizes[0])
+        elif len(sizes) == 2:
+            return cls(sizes[0], sizes[0], sizes[1], sizes[1])
+        elif len(sizes) == 3:
+            return cls(sizes[0], sizes[2], sizes[1], sizes[1])
+        elif len(sizes) == 4:
+            return cls(sizes[0], sizes[2], sizes[3], sizes[1])
+        else:
+            raise ValueError('The provided value "{value}" could not be '
+                             "parsed into the a padding. Check out "
+                             "http://www.w3.org/TR/ttaf1-dfxp/"
+                             "#style-attribute-padding for the definition "
+                             "and examples".format(value=attribute))
+
+    def __repr__(self):
+        return (
+            "<Padding (before: {before}, after: {after}, start: {start}, "
+            "end: {end})>".format(
+                before=self.before, after=self.after, start=self.start,
+                end=self.end
+            )
+        )
+
+    def serialized(self):
+        """Returns a tuple containing the useful values of this object
+        """
+        return (
+            None if not self.before else self.before.serialized(),
+            None if not self.after else self.after.serialized(),
+            None if not self.start else self.start.serialized(),
+            None if not self.end else self.end.serialized()
+        )
+
+    def __eq__(self, other):
+        return (
+            other and
+            type(self) == type(other) and
+            self.before == other.before and
+            self.after == other.after and
+            self.start == other.start and
+            self.end == other.end
+        )
+
+    def __hash__(self):
+        return hash(
+            hash(self.before) * 19 +
+            hash(self.after) * 23 +
+            hash(self.start) * 29 +
+            hash(self.end) * 31 +
+            37
+        )
+
+    def to_xml_attribute(
+            self, attribute_order=('before', 'end', 'after', 'start'),
+            **kwargs):
+        """Returns a unicode representation of this object as an xml attribute
+
+        TODO - should extend the attribute_order tuple to contain 4 tuples,
+        so we can reduce the output length to 3, 2 or 1 element.
+
+        :type attribute_order: tuple
+        :param attribute_order: the order that the attributes should be
+            serialized
+        """
+        try:
+            string_list = []
+            for attrib in attribute_order:
+                if hasattr(self, attrib):
+                    string_list.append(
+                        getattr(self, attrib).to_xml_attribute())
+        except AttributeError:
+            # A Padding object with attributes set to None is considered
+            # invalid. All four possible paddings must be set. If one of them
+            # is not, this error is raised.
+            raise ValueError("The attribute order specified is invalid.")
+
+        return ' '.join(string_list)
+
+    def as_percentage_of(self, video_width, video_height):
+        return Padding(
+            self.before.as_percentage_of(video_height=video_height),
+            self.after.as_percentage_of(video_height=video_height),
+            self.start.as_percentage_of(video_width=video_width),
+            self.end.as_percentage_of(video_width=video_width)
+        )
+
+    def is_relative(self):
+        is_relative = True
+        if self.before:
+            is_relative &= self.before.is_relative()
+        if self.after:
+            is_relative &= self.after.is_relative()
+        if self.start:
+            is_relative &= self.start.is_relative()
+        if self.end:
+            is_relative &= self.end.is_relative()
+        return is_relative
+
+
+class Layout(object):
+    """Should encapsulate all the information needed to determine (as correctly
+    as possible) the layout (positioning) of elements on the screen.
+
+     Inheritance of this property, from the CaptionSet to its children is
+     specific for each caption type.
+    """
+    def __init__(self, origin=None, extent=None, padding=None, alignment=None,
+                 webvtt_positioning=None, inherit_from=None):
+        """
+        :type origin: Point
+        :param origin: The point on the screen which is the top left vertex
+            of a rectangular region where the captions should be placed
+
+        :type extent: Stretch
+        :param extent: The width and height of the rectangle where the caption
+            should be placed on the screen.
+
+        :type padding: Padding
+        :param padding: The padding of the text inside the region described
+            by the origin and the extent
+
+        :type alignment: Alignment
+
+        :type webvtt_positioning: unicode
+        :param webvtt_positioning: A string with the raw WebVTT cue settings.
+            This is used so that WebVTT positioning isn't lost on conversion
+            from WebVTT to WebVTT. It is needed only because pycaption
+            currently doesn't support reading positioning from WebVTT.
+
+        :type inherit_from: Layout
+        :param inherit_from: A Layout with the positioning parameters to be
+            used if not specified by the positioning arguments,
+        """
+
+        self.origin = origin
+        self.extent = extent
+        self.padding = padding
+        self.alignment = alignment
+        self.webvtt_positioning = webvtt_positioning
+
+        if inherit_from:
+            for attr_name in ['origin', 'extent', 'padding', 'alignment']:
+                attr = getattr(self, attr_name)
+                if not attr:
+                    setattr(self, attr_name, getattr(inherit_from, attr_name))
+
+    def __bool__(self):
+        return any([
+            self.origin, self.extent, self.padding, self.alignment,
+            self.webvtt_positioning
+        ])
+
+    def __repr__(self):
+        return (
+            "<Layout (origin: {origin}, extent: {extent}, "
+            "padding: {padding}, alignment: {alignment})>".format(
+                origin=self.origin, extent=self.extent, padding=self.padding,
+                alignment=self.alignment
+            )
+        )
+
+    def serialized(self):
+        """Returns nested tuple containing the "useful" values of this object
+        """
+        return (
+            None if not self.origin else self.origin.serialized(),
+            None if not self.extent else self.extent.serialized(),
+            None if not self.padding else self.padding.serialized(),
+            None if not self.alignment else self.alignment.serialized()
+        )
+
+    def __eq__(self, other):
+        return (
+            type(self) == type(other) and
+            self.origin == other.origin and
+            self.extent == other.extent and
+            self.padding == other.padding and
+            self.alignment == other.alignment
+        )
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(
+            hash(self.origin) * 7
+            + hash(self.extent) * 11
+            + hash(self.padding) * 13
+            + hash(self.alignment) * 5
+            + 17
+        )
+
+    def is_relative(self):
+        """
+        Returns True if all positioning values are expressed as percentages,
+        False otherwise.
+        """
+        is_relative = True
+        if self.origin:
+            is_relative &= self.origin.is_relative()
+        if self.extent:
+            is_relative &= self.extent.is_relative()
+        if self.padding:
+            is_relative &= self.padding.is_relative()
+        return is_relative
+
+    def as_percentage_of(self, video_width, video_height):
+        params = {'alignment': self.alignment}
+        # We don't need to preserve webvtt_positioning on Layout
+        # transformations because, if it is set, the WebVTT writer
+        # returns as soon as it's found and the transformations are
+        # never triggered.
+        for attr_name in ['origin', 'extent', 'padding']:
+            attr = getattr(self, attr_name)
+            if attr:
+                params[attr_name] = attr.as_percentage_of(video_width,
+                                                          video_height)
+        return Layout(**params)
+
+    def fit_to_screen(self):
+        """
+        If extent is not set or if origin + extent > 100%, (re)calculate it
+        based on origin. It is a pycaption fix for caption files that are
+        technically valid but contain inconsistent settings that may cause
+        long captions to be cut out of the screen.
+
+        ATTENTION: This must be called on relativized objects (such as the one
+        returned by as_percentage_of). All units are presumed to be percentages.
+        """
+
+        if self.origin:
+            # Calculated values to be used if replacement is needed
+            diff_horizontal = Size(100 - self.origin.x.value, UnitEnum.PERCENT)
+            diff_vertical = Size(100 - self.origin.y.value, UnitEnum.PERCENT)
+            if not self.extent:
+                # Extent is not set, use the calculated values
+                new_extent = Stretch(diff_horizontal, diff_vertical)
+            else:
+                # Extent is set but may have inconsistent values,
+                # e.g. origin="35% 25%" extent="80% 80%", which would cause
+                # captions to end horizontally at 115% and vertically at 105%,
+                # which would result in them being cut out of the screen.
+                # In this case, the horizontal and vertical values are
+                # corrected so that origin + extent = 100%.
+                bottom_right = self.origin.add_stretch(self.extent)
+
+                found_absolute_unit = False
+                if bottom_right.x.unit != UnitEnum.PERCENT:
+                    found_absolute_unit = True
+                elif bottom_right.x.unit != UnitEnum.PERCENT:
+                    found_absolute_unit = True
+
+                if found_absolute_unit:
+                    raise ValueError("Units must be relativized before extent "
+                                     "can be calculated based on origin.")
+
+                new_horizontal = self.extent.horizontal
+                new_vertical = self.extent.vertical
+                # If extent is set but it's inconsistent, replace with
+                # calculated values
+                if bottom_right.x.value > 100:
+                    new_horizontal = diff_horizontal
+                if bottom_right.y.value > 100:
+                    new_vertical = diff_vertical
+
+                new_extent = Stretch(new_horizontal, new_vertical)
+
+            return Layout(
+                origin=self.origin,
+                extent=new_extent,
+                padding=self.padding,
+                alignment=self.alignment
+                # We don't need to preserve webvtt_positioning on Layout
+                # transformations because, if it is set, the WebVTT writer
+                # returns as soon as it's found and the transformations are
+                # never triggered.
+            )
+
+        return self
@@ -0,0 +1,805 @@
+"""
+The classes in this module handle SAMI reading and writing. It supports several
+CSS attributes, some of which are handled as positioning settings (and applied
+to Layout objects) and others as simple styling (applied to legacy style nodes).
+
+The following attributes are handled as positioning:
+
+    'text-align' # Converted to Alignment
+    'margin-top'
+    'margin-right'
+    'margin-bottom'
+    'margin-left'
+
+OBS:
+    * Margins are converted to Padding
+    * Margins defined inline are not supported
+      TODO: Add support for inline margins
+
+Any other CSS the BeautifulSoup library manages to parse is handled as simple
+styling and applied to style nodes. However, apparently only these are actually
+used by writers on conversion:
+
+    'font-family'
+    'font-size'
+    'font-style'
+    'color'
+OBS:
+    * Other parameters are preserved, but not if they're specified inline.
+    TODO:
+      Make this less confusing. Confirm whether these really are the only
+      supported styling attributes and make it more clear, perhaps by listing
+      them in constants in the beginning of the file and using them to filter
+      out unneeded attributes either everywhere in the code or not at all, but
+      most importantly regardless of whether they're defined inline or not,
+      because this is irrelevant.
+
+"""
+import re
+import six
+from logging import FATAL
+from collections import deque
+from copy import deepcopy
+from future.backports.html.parser import HTMLParseError
+
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+from xml.sax.saxutils import escape
+
+
+from bs4 import BeautifulSoup, NavigableString
+from cssutils import parseString, log, css as cssutils_css
+
+from .base import (
+    BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
+    DEFAULT_LANGUAGE_CODE)
+from .exceptions import (
+    CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError)
+from .geometry import Layout, Alignment, Padding, Size
+
+
+# change cssutils default logging: raise the threshold of its logger to
+# FATAL
+log.setLevel(FATAL)
+
+
+# Empty SAMI skeleton that SAMIWriter.write parses and fills in: sync tags
+# are attached to <body> and the generated stylesheet to <style>
+SAMI_BASE_MARKUP = '''
+<sami>
+    <head>
+        <style type="text/css"/>
+    </head>
+    <body/>
+</sami>'''
+
+
+class SAMIReader(BaseReader):
+
+    def __init__(self, *args, **kw):
+        super(SAMIReader, self).__init__(*args, **kw)
+        self.line = []
+        self.first_alignment = None
+
+    def detect(self, content):
+        if '<sami' in content.lower():
+            return True
+        else:
+            return False
+
+    def read(self, content):
+        if type(content) != six.text_type:
+            raise InvalidInputError('The content is not a unicode string.')
+
+        content, doc_styles, doc_langs = (
+            self._get_sami_parser_class()().feed(content))
+        sami_soup = self._get_xml_parser_class()(content)
+
+        # Get the global layout that applies to all <p> tags
+        global_layout = self._build_layout(doc_styles.get('p', {}))
+
+        caption_dict = {}
+        for language in doc_langs:
+            lang_layout = None
+            for target, styling in list(doc_styles.items()):
+                if target not in ['p', 'sync', 'span']:
+                    if styling.get('lang', None) == language:
+                        lang_layout = self._build_layout(
+                            doc_styles.get(target, {}),
+                            inherit_from=global_layout
+                        )
+                        break
+            lang_layout = lang_layout or global_layout
+            lang_captions = self._translate_lang(
+                language, sami_soup, lang_layout)
+
+            caption_dict[language] = lang_captions
+
+        caption_set = CaptionSet(
+            caption_dict,
+            layout_info=global_layout
+        )
+
+        # Convert styles from CSS to internal representation
+        for style in list(doc_styles.items()):
+            style = (style[0], self._translate_parsed_style(style[1]))
+
+        caption_set.set_styles(doc_styles)
+
+        if caption_set.is_empty():
+            raise CaptionReadNoCaptions("empty caption file")
+
+        return caption_set
+
+    @staticmethod
+    def _get_sami_parser_class():
+        """Hook method for providing custom SAMIParser classes"""
+        return SAMIParser
+
+    @staticmethod
+    def _get_xml_parser_class():
+        """Hook method for providing a custom XML parser class"""
+        return BeautifulSoup
+
+    def _build_layout(self, styles, inherit_from=None):
+        """
+        :type styles: dict
+        :param styles: a dictionary with CSS-like styling rules
+
+        :type inherit_from: Layout
+        :param inherit_from: The Layout with values to be used in case the
+            positioning settings in the styles parameter don't specify
+            something.
+        """
+        alignment = Alignment.from_horizontal_and_vertical_align(
+            text_align=styles.get('text-align')
+        )
+        return self._get_layout_class()(
+            origin=None,
+            extent=None,
+            padding=self._get_padding(styles),
+            alignment=alignment,
+            inherit_from=inherit_from
+        )
+
+    @staticmethod
+    def _get_layout_class():
+        """Hook method for providing a custom Layout class"""
+        return Layout
+
+    def _get_padding(self, styles):
+        margin_before = self._get_size(styles, 'margin-top')
+        margin_after = self._get_size(styles, 'margin-bottom')
+        margin_start = self._get_size(styles, 'margin-left')
+        margin_end = self._get_size(styles, 'margin-right')
+        if not any([margin_before, margin_after, margin_start, margin_end]):
+            return None
+        return Padding(
+            before=margin_before,  # top
+            after=margin_after,  # bottom
+            start=margin_start,  # left
+            end=margin_end  # right
+        )
+
+    def _get_size(self, styles, style_label):
+        value_from_style = styles.get(style_label, None)
+        if not value_from_style:
+            return None
+        return Size.from_string(value_from_style)
+
+    def _translate_lang(self, language, sami_soup, parent_layout):
+        """
+        For a given language, translate the SAMI XML to internal list of
+        captions.
+
+        :rtype: list
+        """
+        captions = CaptionList(layout_info=parent_layout)
+        milliseconds = 0
+
+        for p in sami_soup.select('p[lang|=%s]' % language):
+            milliseconds = int(float(p.parent['start']))
+            start = milliseconds * 1000
+            end = 0
+
+            if captions != [] and captions[-1].end == 0:
+                captions[-1].end = milliseconds * 1000
+
+            if p.get_text().strip():
+                self.first_alignment = None
+                styles = self._translate_attrs(p)
+                layout_info = self._build_layout(styles,
+                                                 inherit_from=parent_layout)
+                self.line = []
+
+                self._translate_tag(p, layout_info)
+                caption_layout = self._get_layout_class()(
+                    alignment=self.first_alignment,
+                    inherit_from=layout_info
+                )
+                for node in self.line:
+                    node.layout_info = Layout(
+                        alignment=self.first_alignment,
+                        inherit_from=node.layout_info
+                    )
+                self.first_alignment = None
+
+                caption = Caption(start, end, self.line, styles, caption_layout)
+                captions.append(caption)
+
+        if captions and captions[-1].end == 0:
+            # Arbitrarily make this last 4 seconds. Not ideal...
+            captions[-1].end = (milliseconds + 4000) * 1000
+
+        return captions
+
+    def _get_style_name_from_tag(self, tag):
+        if tag == 'i':
+            return 'italics'
+        elif tag == 'b':
+            return 'bold'
+        elif tag == 'u':
+            return 'underline'
+        else:
+            raise RuntimeError("Unknown style tag")
+
+    def _translate_tag(self, tag, inherit_from=None):
+        """
+        :param inherit_from: A Layout object extracted from an ancestor tag
+                to be attached to leaf nodes
+        """
+        # convert text
+        if isinstance(tag, NavigableString):
+            # BeautifulSoup apparently handles unescaping character codes
+            # (e.g. &amp;) automatically. The following variable, therefore,
+            # should contain a plain unicode string.
+            # strips indentation whitespace only
+            pattern = re.compile("^(?:[\n\r]+\s*)?(.+)")
+            result = pattern.search(tag)
+            if not result:
+                return
+            tag_text = result.groups()[0]
+            self.line.append(CaptionNode.create_text(tag_text, inherit_from))
+        # convert line breaks
+        elif tag.name == 'br':
+            self.line.append(CaptionNode.create_break(inherit_from))
+        # convert italics, bold, and underline
+        elif tag.name == 'i' or tag.name == 'b' or tag.name == 'u':
+            style_name = self._get_style_name_from_tag(tag.name)
+            self.line.append(
+                CaptionNode.create_style(True, {style_name: True})
+            )
+            # recursively call function for any children elements
+            for a in tag.contents:
+                self._translate_tag(a, inherit_from)
+            self.line.append(
+                CaptionNode.create_style(False, {style_name: True}))
+        elif tag.name == 'span':
+            self._translate_span(tag, inherit_from)
+        else:
+            # recursively call function for any children elements
+            for a in tag.contents:
+                self._translate_tag(a, inherit_from)
+
+    def _translate_span(self, tag, inherit_from=None):
+        # convert tag attributes
+        args = self._translate_attrs(tag)
+        # only include span tag if attributes returned
+        if args:
+            layout_info = self._build_layout(args, inherit_from)
+            # OLD: Create legacy style node
+            # NEW: But pass new layout object
+            node = CaptionNode.create_style(True, args, layout_info)
+            self.line.append(node)
+            # recursively call function for any children elements
+            for a in tag.contents:
+                # NEW: Pass the layout along so that it's eventually attached
+                # to leaf nodes (e.g. text or break)
+                self._translate_tag(a, layout_info)
+            node = CaptionNode.create_style(False, args, layout_info)
+            self.line.append(node)
+        else:
+            for a in tag.contents:
+                self._translate_tag(a, inherit_from)
+
+    def _translate_attrs(self, tag):
+        attrs = {}
+        css_attrs = tag.attrs
+
+        if 'class' in css_attrs:
+            attrs['class'] = css_attrs['class'][0].lower()
+        if 'id' in css_attrs:
+            attrs['class'] = css_attrs['id'].lower()
+        if 'style' in css_attrs:
+            styles = css_attrs['style'].split(';')
+            attrs.update(self._translate_style(attrs, styles))
+
+        return attrs
+
+    # convert attributes from inline CSS
+    def _translate_style(self, attrs, styles):
+        for style in styles:
+            style = style.split(':')
+            if len(style) == 2:
+                css_property, value = style
+            else:
+                continue
+            if css_property == 'text-align':
+                self._save_first_alignment(value.strip())
+            else:
+                self._translate_css_property(attrs, css_property, value)
+
+        return attrs
+
+    def _translate_parsed_style(self, styles):
+        # Keep unknown styles by default
+        attrs = styles
+        for css_property in list(styles.keys()):
+            value = styles[css_property]
+            self._translate_css_property(attrs, css_property, value)
+
+        return attrs
+
+    def _translate_css_property(self, attrs, css_property, value):
+        if css_property == 'font-family':
+            attrs['font-family'] = value.strip()
+        elif css_property == 'font-size':
+            attrs['font-size'] = value.strip()
+        elif css_property == 'font-style' and value.strip() == 'italic':
+            attrs['italics'] = True
+        elif css_property == 'text-decoration' and value.strip() == 'underline':
+            attrs['underline'] = True
+        elif css_property == 'font-weight' and value.strip() == 'bold':
+            attrs['bold'] = True
+        elif css_property == 'lang':
+            attrs['lang'] = value.strip()
+        elif css_property == 'color':
+            attrs['color'] = value.strip()
+
+    def _save_first_alignment(self, align):
+        """
+        Unlike the other inline CSS attributes parsed in _translate_styles, the
+        'text-align' setting must be applied to a Layout and not to a style
+        because it affects positioning. This Layout must be assigned to the
+        Caption object, and not a Node, because it doesn't make sense to have
+        spans in the same caption with different alignments. Even though the
+        SAMI format seems to in principle accept it, pycaption normalizes to
+        something it can make sense of internally and convert to other formats.
+
+        If there are multiple elements (span, div, etc) in the same line with
+        different alignments, only the first alignment is taken into account.
+
+        If the root element of the caption (sync's first child) has an inline
+        text-align, it is preserved and any children alignment is ignored.
+
+        :param align: A unicode string representing a CSS text-align value
+        """
+        if not self.first_alignment:
+            self.first_alignment = Alignment.from_horizontal_and_vertical_align(  # noqa
+                text_align=align
+            )
+
+
+class SAMIWriter(BaseWriter):
+    def __init__(self, *args, **kwargs):
+        super(SAMIWriter, self).__init__(*args, **kwargs)
+        self.open_span = False
+        self.last_time = None
+
+    def write(self, caption_set):
+        caption_set = deepcopy(caption_set)
+        sami = BeautifulSoup(SAMI_BASE_MARKUP, "lxml-xml")
+
+        caption_set.layout_info = self._relativize_and_fit_to_screen(
+            caption_set.layout_info)
+
+        primary = None
+
+        for lang in caption_set.get_languages():
+            self.last_time = None
+            if primary is None:
+                primary = lang
+
+            caption_set.set_layout_info(
+                lang,
+                self._relativize_and_fit_to_screen(
+                    caption_set.get_layout_info(lang))
+            )
+
+            for caption in caption_set.get_captions(lang):
+                # Loop through all captions/nodes and apply transformations to
+                # layout in function of the provided or default settings
+                caption.layout_info = self._relativize_and_fit_to_screen(
+                    caption.layout_info)
+                for node in caption.nodes:
+                    node.layout_info = self._relativize_and_fit_to_screen(
+                        node.layout_info)
+                sami = self._recreate_p_tag(
+                    caption, sami, lang, primary, caption_set)
+
+        stylesheet = self._recreate_stylesheet(caption_set)
+        sami.find('style').append(stylesheet)
+
+        a = sami.prettify(formatter=None).split('\n')
+        caption_content = '\n'.join(a[1:])
+        return caption_content
+
+    def _recreate_p_tag(self, caption, sami, lang, primary, captions):
+        """
+        Creates a p tag for the given caption, attach it to the sami object
+        and return it.
+
+        :type caption: Caption
+        :type sami: BeautifulSoup
+        :type lang: unicode
+        :type primary: unicode
+        :type captions: CaptionSet
+
+        :rtype: BeautifulSoup
+        """
+        time = caption.start / 1000
+
+        if self.last_time and time != self.last_time:
+            sami = self._recreate_blank_tag(
+                sami, caption, lang, primary, captions)
+
+        self.last_time = caption.end / 1000
+
+        sami, sync = self._recreate_sync(sami, lang, primary, time)
+
+        p = sami.new_tag("p")
+
+        p_style = ''
+        for attr, value in list(self._recreate_style(caption.style).items()):
+            p_style += '%s:%s;' % (attr, value)
+        if p_style:
+            p['p_style'] = p_style
+
+        p['class'] = self._recreate_p_lang(caption, lang, captions)
+        p.string = self._recreate_text(caption.nodes)
+
+        sync.append(p)
+
+        return sami
+
+    def _recreate_sync(self, sami, lang, primary, time):
+        """
+        Creates a sync tag for a given language and timing (if it doesn't
+        already exist), attach it to the sami body and return the sami
+        BeautifulSoupobject.
+
+        :type sami: BeautifulSoup
+        :type lang: unicode
+        :type primary: unicode
+        :type time: int
+
+        :rtype: BeautifulSoup
+        """
+        if lang == primary:
+            sync = sami.new_tag("sync", start="%d" % time)
+            sami.body.append(sync)
+        else:
+            sync = sami.find("sync", start="%d" % time)
+            if sync is None:
+                sami, sync = self._find_closest_sync(sami, time)
+
+        return sami, sync
+
+    def _find_closest_sync(self, sami, time):
+        sync = sami.new_tag("sync", start="%d" % time)
+
+        earlier = sami.find_all("sync", start=lambda x: int(x) < time)
+        if earlier:
+            last_sync = earlier[-1]
+            last_sync.insert_after(sync)
+        else:
+            def later_syncs(start):
+                return int(start) > time
+            later = sami.find_all("sync", start=later_syncs)
+            if later:
+                last_sync = later[0]
+                last_sync.insert_before(sync)
+        return sami, sync
+
+    def _recreate_blank_tag(self, sami, caption, lang, primary, captions):
+        sami, sync = self._recreate_sync(sami, lang, primary, self.last_time)
+
+        p = sami.new_tag("p")
+        p['class'] = self._recreate_p_lang(caption, lang, captions)
+        p.string = '&nbsp;'
+
+        sync.append(p)
+
+        return sami
+
+    def _recreate_p_lang(self, caption, lang, captions):
+        try:
+            if 'lang' in captions.get_style(caption.style['class']):
+                return caption.style['class']
+        except KeyError:
+            pass
+        return lang
+
+    def _recreate_stylesheet(self, caption_set):
+        stylesheet = '<!--'
+
+        for attr, value in caption_set.get_styles():
+            if value != {}:
+                stylesheet += self._recreate_style_block(
+                    attr, value, caption_set.layout_info)
+
+        for lang in caption_set.get_languages():
+            lang_string = 'lang: {}'.format(lang)
+            if lang_string not in stylesheet:
+                stylesheet += self._recreate_style_block(
+                    lang, {'lang': lang}, caption_set.get_layout_info(lang))
+
+        return stylesheet + '   -->'
+
+    def _recreate_style_block(self, target, rules, layout_info):
+        """
+        :param target: A unicode string representing the target of the styling
+            rules.
+        :param rules: A dictionary with CSS-like styling rules.
+
+        :param layout_info: A Layout object providing positioning information
+            to be converted to CSS
+        """
+        if target not in ['p', 'sync', 'span']:
+            # If it's not a valid SAMI element, then it's a custom class name
+            selector = '.{}'.format(target)
+        else:
+            selector = target
+
+        sami_style = '\n    {} {{\n    '.format(selector)
+
+        if layout_info and layout_info.padding:
+            rules.update({
+                'margin-top': six.text_type(layout_info.padding.before),
+                'margin-right': six.text_type(layout_info.padding.end),
+                'margin-bottom': six.text_type(layout_info.padding.after),
+                'margin-left': six.text_type(layout_info.padding.start),
+            })
+
+        for attr, value in sorted(self._recreate_style(rules).items()):
+            sami_style += ' {}: {};\n    '.format(attr, value)
+
+        return sami_style + '}\n'
+
+    def _recreate_text(self, caption):
+        line = ''
+
+        for node in caption:
+            if node.type_ == CaptionNode.TEXT:
+                line += self._encode(node.content) + ' '
+            elif node.type_ == CaptionNode.BREAK:
+                line = line.rstrip() + '<br/>\n    '
+            elif node.type_ == CaptionNode.STYLE:
+                line = self._recreate_line_style(line, node)
+
+        return line.rstrip()
+
+    def _recreate_line_style(self, line, node):
+        if node.start:
+            if self.open_span:
+                line = line.rstrip() + '</span> '
+            line = self._recreate_span(line, node.content)
+        else:
+            if self.open_span:
+                line = line.rstrip() + '</span> '
+                self.open_span = False
+
+        return line
+
+    def _recreate_span(self, line, content):
+        style = ''
+        klass = ''
+        if 'class' in content:
+            klass += ' class="%s"' % content['class']
+
+        for attr, value in list(self._recreate_style(content).items()):
+            style += '%s:%s;' % (attr, value)
+
+        if style or klass:
+            if style:
+                style = ' style="%s"' % style
+            line += '<span%s%s>' % (klass, style)
+            self.open_span = True
+
+        return line
+
+    def _recreate_style(self, rules):
+        """
+        :param rules: A dictionary with CSS-like styling rules
+        """
+        sami_style = {}
+
+        for key, value in list(rules.items()):
+            # Recreate original CSS rules from internal style
+            if key == 'italics' and value == True:
+                sami_style['font-style'] = 'italic'
+            elif key == 'bold' and value == True:
+                sami_style['font-weight'] = 'bold'
+            elif key == 'underline' and value == True:
+                sami_style['text-decoration'] = 'underline'
+            else:
+                sami_style[key] = value
+
+        return sami_style
+
+    def _encode(self, s):
+        """
+        Encodes plain unicode string to proper SAMI file escaping special
+        characters in case they appear in the string.
+        :type s: unicode
+        """
+        return escape(s)
+
+
+class SAMIParser(HTMLParser):
+    def __init__(self, *args, **kw):
+        HTMLParser.__init__(self, *args, **kw)
+        self.sami = ''
+        self.line = ''
+        self.styles = {}
+        self.queue = deque()
+        self.langs = set()
+        self.last_element = ''
+        self.name2codepoint = name2codepoint.copy()
+        self.name2codepoint['apos'] = 0x0027
+        self.convert_charrefs = False
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Override the parser's handling of starttags
+        :param tag: unicode string indicating the tag type (e.g. "head" or "p")
+        :param tag: list of attribute tuples of type (u'name', u'value')
+        """
+        self.last_element = tag
+
+        # treat divs as spans
+        if tag == 'div':
+            tag = 'span'
+
+        # figure out the caption language of P tags
+        if tag == 'p':
+            lang = self._find_lang(attrs)
+
+            # if no language detected, set it as the default
+            lang = lang or DEFAULT_LANGUAGE_CODE
+            attrs.append(('lang', lang))
+            self.langs.add(lang)
+
+        # clean-up line breaks
+        if tag == 'br':
+            self.sami += "<br/>"
+        # add tag to queue
+        else:
+            # if already in queue, first close tags off in LIFO order
+            while tag in self.queue:
+                closer = self.queue.pop()
+                self.sami += "</%s>" % closer
+            # open new tag in queue
+            self.queue.append(tag)
+            # add tag with attributes
+            for attr, value in attrs:
+                tag += ' %s="%s"' % (attr.lower(), value)
+            self.sami += "<%s>" % tag
+
+    # override the parser's handling of endtags
+    def handle_endtag(self, tag):
+        # treat divs as spans
+        if tag == 'div':
+            tag = 'span'
+
+        # handle incorrectly formatted sync/p tags
+        if tag in ['p', 'sync'] and tag == self.last_element:
+            return
+
+        # close off tags in LIFO order, if matching starting tag in queue
+        while tag in self.queue:
+            closing_tag = self.queue.pop()
+            self.sami += "</%s>" % closing_tag
+
+    def handle_entityref(self, name):
+        if name in ['gt', 'lt']:
+            self.sami += '&%s;' % name
+        else:
+            try:
+                self.sami += chr(self.name2codepoint[name])
+            except (KeyError, ValueError):
+                self.sami += '&%s' % name
+
+        self.last_element = ''
+
+    def handle_charref(self, name):
+        if name[0] == 'x':
+            self.sami += chr(int(name[1:], 16))
+        else:
+            self.sami += chr(int(name))
+
+    # override the parser's handling of data
+    def handle_data(self, data):
+        self.sami += data
+        self.last_element = ''
+
+    # override the parser's feed function
+    def feed(self, data):
+        """
+        :param data: Raw SAMI unicode string
+        :returns: tuple (unicode, dict, set)
+        """
+        no_cc = 'no closed captioning available'
+
+        if '<html' in data.lower():
+            raise CaptionReadSyntaxError(
+                'SAMI File seems to be an HTML file.')
+        elif no_cc in data.lower():
+            raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)
+
+        # try to find style tag in SAMI
+        try:
+            # prevent BS4 error with huge SAMI files with unclosed tags
+            index = data.lower().find("</head>")
+
+            self.styles = self._css_parse(
+                BeautifulSoup(data[:index], "lxml").find('style').get_text())
+        except AttributeError:
+            self.styles = {}
+
+        # fix erroneous italics tags
+        data = data.replace('<i/>', '<i>')
+
+        # fix awkward tags found in some SAMIs
+        data = data.replace(';>', '>')
+        try:
+            HTMLParser.feed(self, data)
+        except HTMLParseError as e:
+            raise CaptionReadSyntaxError(e)
+
+        # close any tags that remain in the queue
+        while self.queue != deque([]):
+            closing_tag = self.queue.pop()
+            self.sami += "</%s>" % closing_tag
+
+        return self.sami, self.styles, self.langs
+
+    # parse the SAMI's stylesheet
+    def _css_parse(self, css):
+        """
+        Parse styling via cssutils modules
+        :rtype: dict
+        """
+        sheet = parseString(css)
+        style_sheet = {}
+
+        for rule in sheet:
+            new_style = {}
+            selector = rule.selectorText.lower()
+            if selector[0] in ['#', '.']:
+                selector = selector[1:]
+            # keep any style attributes that are needed
+            for prop in rule.style:
+                if prop.name == 'color':
+                    cv = cssutils_css.ColorValue(prop.value)
+                    # Code for RGB to hex conversion comes from
+                    # http://bit.ly/1kwfBnQ
+                    new_style['color'] = "#%02x%02x%02x" % (
+                        cv.red, cv.green, cv.blue)
+                else:
+                    new_style[prop.name] = prop.value
+            if new_style:
+                style_sheet[selector] = new_style
+
+        return style_sheet
+
+    def _find_lang(self, attrs):
+        for attr, value in attrs:
+            # if lang is an attribute of the tag
+            if attr.lower() == 'lang':
+                return value[:2]
+            # if the P tag has a class, try and find the language
+            if attr.lower() == 'class':
+                try:
+                    return self.styles[value.lower()]['lang']
+                except KeyError:
+                    pass
+
+        return None
@@ -0,0 +1,696 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+3 types of SCC captions:
+    Roll-Up
+    Paint-On
+    Pop-On
+
+Commands:
+    94ae - [ENM] - Erase Non-displayed(buffer) Memory
+    942c - [EDM] - Erase Displayed Memory
+    9420 - [RCL] - Resume Caption Loading
+    9429 - [RDC] - Resume Direct Captioning
+
+    9425, 9426, 94a7 - [RU2], [RU3], [RU4] (roll up captions 2,3 or 4 rows)
+        - these commands set the number of expected lines
+
+    94ad - (in CEA-608-E: 142d) - [CR] carriage return.
+        - This actually rolls the captions up as many rows as specified by
+        [RU2], [RU3], or [RU4]
+
+    80 - no-op char. Doesn't do anything, but must be used with other
+        characters, to make a 2 byte word
+
+    97a1, 97a2, 9723 - [TO] move 1, 2 or 3 columns - Tab Over command
+        - this moves the positioning 1, 2, or 3 columns to the right
+        - Nothing regarding this is implemented.
+
+    942f - [EOC] - display the buffer on the screen - End Of Caption
+    ... - [PAC] - Preamble address code (can set positioning and style)
+        - All the PACs are specified by the first and second byte combined
+        from pycaption.scc.constants.PAC_BYTES_TO_POSITIONING_MAP
+
+    9429 - [RDC] - Resume Direct Captioning
+    94a4 - (in CEA-608-E: 1424) - [DER] Delete to End of Row
+
+
+Pop-On:
+    The commands should usually appear in this order. Not strict though, and
+    the commands don't have to necessarily be on the same row.
+
+    1. 94ae [ENM] (erase non displayed memory)
+    2. 9420 [RCL] (resume caption loading => this command here means we're using Pop-On captions)
+    2.1? [ENM] - if step 1 was skipped?
+    3. [PAC] Positioning/ styling command (can position on columns divisible by 4)
+        The control chars is called Preamble Address Code [PAC].
+    4. If positioning needs to be on columns not divisible by 4, use a [TO] command
+    5. text
+    6. 942c [EDM] - optionally, erase the currently displayed caption
+    7. 942f [EOC] display the caption
+
+
+Roll-Up:
+    1. [RU2], [RU3] or [RU4]    - sets Roll-Up style and depth
+        - these set the Roll-Up style: (characteristic command)
+    2. [CR] to roll the display up 1 row...lol?
+    3. [PAC] - sets the indent of the base row
+
+
+Paint-On:
+    1. [RDC] - sets the Paint-On style (characteristic command)
+    2. [PAC]
+    3. text
+    4. [PAC]
+    5. text or [DER]
+
+There are some rules regarding the parity of the commands.
+
+This resource:
+http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML
+ specifies that there are interpreters which only work if the commands have an
+ odd parity. This however is not consistent, and we might not handle well
+ these cases. Odd parity of a command means that, converting the word into
+ binary, should result in an odd number of '1's. The PAC commands obey this
+ rule, but some do not. Some commands that do not are found in the COMMANDS
+ dictionary. This is legacy logic, that I didn't know how to handle, and
+ just carried over when implementing positioning.
+"""
+
+import re
+import math
+import textwrap
+from copy import deepcopy
+
+import six
+
+from pycaption.base import (
+    BaseReader, BaseWriter, CaptionSet, CaptionNode,
+)
+from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError
+from .constants import (
+    HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS,
+    MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
+    SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
+    PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
+)
+from .specialized_collections import (
+    TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
+    InstructionNodeCreator)
+from .state_machines import DefaultProvidingPositionTracker
+
+
+class NodeCreatorFactory(object):
+    """Will return instances of the given node_creator.
+
+    This is used as a means of creating new InstructionNodeCreator instances,
+    because these need to share state beyond their garbage collection, but
+    storing the information at the class level is not good either, because
+    this information must be erased after the reader's .read() operation
+    completes.
+    """
+    def __init__(self, position_tracker,
+                 node_creator=InstructionNodeCreator):
+        self.position_tracker = position_tracker
+        self.node_creator = node_creator
+
+    def new_creator(self):
+        """Returns a new instance of self.node_creator, initialized with
+        the same italics_tracker, and position_tracker
+        """
+        return self.node_creator(position_tracker=self.position_tracker)
+
+    def from_list(self, roll_rows):
+        """Wraps the node_creator's method with the same name
+
+        :param roll_rows: list of node_creator instances
+
+        :return: a node_creator instance
+        """
+        return self.node_creator.from_list(
+            roll_rows,
+            position_tracker=self.position_tracker
+        )
+
+
+def get_corrected_end_time(caption):
+    """If the last caption was never explicitly ended, set its end time to
+    start + 4 seconds
+
+    :param Caption caption: the last caption
+    :rtype: int
+    """
+    if caption.end:
+        return caption.end
+
+    return caption.start + 4 * 1000 * 1000
+
+
+class SCCReader(BaseReader):
+    """Converts a given unicode string to a CaptionSet.
+
+    This can be then later used for converting into any other supported formats
+    """
+    def __init__(self, *args, **kw):
+        """Set up the parsing state used by .read()
+
+        The positional and keyword arguments are accepted, but this method
+        does not use them.
+        """
+        # receives the captions created while reading (see .read())
+        self.caption_stash = CaptionCreator()
+        # converts SCC timestamps (plus elapsed frames) to microseconds
+        self.time_translator = _SccTimeTranslator()
+
+        # every buffer is created through this factory, so that all of them
+        # are given the same position tracker
+        self.node_creator_factory = NodeCreatorFactory(
+            DefaultProvidingPositionTracker()
+        )
+
+        # last word handled by _handle_double_command
+        self.last_command = ''
+
+        # one buffer per caption style; `self.buffer` is the active one
+        self.buffer_dict = NotifyingDict()
+
+        self.buffer_dict['pop'] = self.node_creator_factory.new_creator()
+        self.buffer_dict['paint'] = self.node_creator_factory.new_creator()
+        self.buffer_dict['roll'] = self.node_creator_factory.new_creator()
+
+        # Call this method when the active key changes
+        self.buffer_dict.add_change_observer(self._flush_implicit_buffers)
+        self.buffer_dict.set_active('pop')
+
+        # roll-up bookkeeping: rows kept for simulation, and their max count
+        self.roll_rows = []
+        self.roll_rows_expected = 0
+        self.simulate_roll_up = False
+
+        # time handed to create_and_store when a buffer becomes a caption
+        self.time = 0
+
+    def detect(self, content):
+        """Checks whether the given content is a proper SCC file
+
+        :type content: unicode
+
+        :rtype: bool
+        """
+        lines = content.splitlines()
+        if lines[0] == HEADER:
+            return True
+        else:
+            return False
+
+    def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
+        """Converts the unicode string into a CaptionSet
+
+        :type content: six.text_type
+        :param content: The SCC content to be converted to a CaptionSet
+
+        :type lang: six.text_type
+        :param lang: The language of the caption
+
+        :type simulate_roll_up: bool
+        :param simulate_roll_up: If True, when converting to other formats,
+            the resulting captions will contain all the rows that were visible
+            on the screen when the captions were rolling up.
+
+        :type offset: int
+        :param offset:
+
+        :rtype: CaptionSet
+        """
+        if type(content) != six.text_type:
+            raise InvalidInputError('The content is not a unicode string.')
+
+        self.simulate_roll_up = simulate_roll_up
+        self.time_translator.offset = offset * 1000000
+        # split lines
+        lines = content.splitlines()
+
+        # loop through each line except the first
+        for line in lines[1:]:
+            self._translate_line(line)
+
+        self._flush_implicit_buffers()
+
+        captions = CaptionSet({lang: self.caption_stash.get_all()})
+
+        # check captions for incorrect lengths
+        for cap in captions.get_captions(lang):
+            # if there's an end time on a caption and the difference is
+            # less than .05s kill it (this is likely caused by a standalone
+            # EOC marker in the SCC file)
+            if 0 < cap.end - cap.start < 50000:
+                raise ValueError('unsupported length found in SCC input file: ' + str(cap))
+
+        if captions.is_empty():
+            raise CaptionReadNoCaptions("empty caption file")
+        else:
+            last_caption = captions.get_captions(lang)[-1]
+            last_caption.end = get_corrected_end_time(last_caption)
+
+        return captions
+
+    def _fix_last_timing(self, timing):
+        """HACK HACK: Certain Paint-On captions don't specify the 942f [EOC]
+        (End Of Caption) command on the same line.
+        If this is a 942f line, also simulate a 942c (Erase Displayed Memory)
+        to properly set the timing on the last caption.
+
+        This method needs some serious attention, because it proves the timing
+        calculation is not done well for Pop-On captions
+        """
+        # Calculate the end time from the current line
+        time_translator = _SccTimeTranslator()
+        time_translator.start_at(timing)
+        time_translator.offset = self.time_translator.offset
+
+        # But use the current time translator for the start time
+        self.caption_stash.create_and_store(
+            self.buffer, self.time_translator.get_time())
+
+        self.caption_stash.correct_last_timing(time_translator.get_time())
+        self.buffer = self.node_creator_factory.node_creator()
+
+    def _flush_implicit_buffers(self, old_key=None, *args):
+        """Convert to Captions those buffers whose behavior is implicit.
+
+        The Pop-On buffer is explicit. New captions are created from it
+        with the command 'End Of Caption' [EOC], '942f', so nothing is done
+        here when the buffer being left is the Pop-On one.
+
+        The other 2 buffers, Roll-Up and Paint-On we treat as "more" implicit,
+        meaning that they can be displayed by a command on the next row.
+        If they're on the last row however, or if the caption type is changing,
+        we make sure to convert the buffers to text, so we don't lose any info.
+
+        :param old_key: 'pop', 'roll' or 'paint' when registered as the
+            buffer_dict change observer (presumably the key being
+            deactivated - confirm against NotifyingDict); None when called
+            directly at the end of .read()
+        :param args: ignored
+        """
+        if old_key == 'pop':
+            return
+
+        elif old_key is None or old_key == 'roll':
+            # `self.buffer` is whichever buffer is active at this point
+            if not self.buffer.is_empty():
+                self._roll_up()
+
+        # NOTE(review): `old_key is None` can never be true here, because the
+        # branch above already handles None. Only 'paint' reaches this code.
+        elif old_key is None or old_key == 'paint':
+            # xxx - perhaps the self.buffer property is sufficient
+            # NOTE(review): the paint buffer is stored, but not replaced by a
+            # fresh one here
+            if not self.buffer_dict['paint'].is_empty():
+                self.caption_stash.create_and_store(
+                    self.buffer_dict['paint'], self.time)
+
+    def _translate_line(self, line):
+        # ignore blank lines
+        if line.strip() == '':
+            return
+
+        # split line in timestamp and words
+        r = re.compile(r"([0-9:;]*)([\s\t]*)((.)*)")
+        parts = r.findall(line.lower())
+
+        # XXX!!!!!! THESE 2 LINES ARE A HACK
+        if parts[0][2].strip() == '942f':
+            self._fix_last_timing(timing=parts[0][0])
+
+        self.time_translator.start_at(parts[0][0])
+
+        # loop through each word
+        for word in parts[0][2].split(' '):
+            # ignore empty results
+            if word.strip() != '':
+                self._translate_word(word)
+
+    def _translate_word(self, word):
+        # count frames for timing
+        self.time_translator.increment_frames()
+
+        # first check if word is a command
+        # TODO - check that all the positioning commands are here, or use
+        # some other strategy to determine if the word is a command.
+        if word in COMMANDS or _is_pac_command(word):
+            self._translate_command(word)
+
+        # second, check if word is a special character
+        elif word in SPECIAL_CHARS:
+            self._translate_special_char(word)
+
+        elif word in EXTENDED_CHARS:
+            self._translate_extended_char(word)
+
+        # third, try to convert word into 2 characters
+        else:
+            self._translate_characters(word)
+
+    def _handle_double_command(self, word):
+        # ensure we don't accidentally use the same command twice
+        if word == self.last_command:
+            self.last_command = ''
+            return True
+        else:
+            self.last_command = word
+            return False
+
+    def _translate_special_char(self, word):
+        # XXX - this looks highly buggy. Why should special chars be ignored
+        # when printed 2 times one after another?
+        if self._handle_double_command(word):
+            return
+
+        self.buffer.add_chars(SPECIAL_CHARS[word])
+
+    def _translate_extended_char(self, word):
+        # XXX - this looks highly buggy. Why would a special char be ignored
+        # if it's printed 2 times one after another?
+        if self._handle_double_command(word):
+            return
+
+        # add to buffer
+        self.buffer.add_chars(EXTENDED_CHARS[word])
+
+    def _translate_command(self, word):
+        """Interpret a command word, updating the buffers and the timing.
+
+        A word that repeats the previous one is ignored (see
+        _handle_double_command). Words that are not handled explicitly here
+        are passed on to the active buffer.
+
+        :param word: the 4 character command
+        """
+        if self._handle_double_command(word):
+            return
+
+        # if command is pop_up [Resume Caption Loading]
+        if word == '9420':
+            self.buffer_dict.set_active('pop')
+
+        # command is paint_on [Resume Direct Captioning]
+        elif word == '9429':
+            self.buffer_dict.set_active('paint')
+
+            self.roll_rows_expected = 1
+            # content already in the (now active) paint buffer becomes a
+            # caption, using the previously recorded time
+            if not self.buffer.is_empty():
+                self.caption_stash.create_and_store(
+                    self.buffer, self.time
+                )
+                self.buffer = self.node_creator_factory.new_creator()
+
+            self.time = self.time_translator.get_time()
+
+        # if command is roll_up 2, 3 or 4 rows
+        elif word in ('9425', '9426', '94a7'):
+            self.buffer_dict.set_active('roll')
+
+            # count how many lines are expected
+            if word == '9425':
+                self.roll_rows_expected = 2
+            elif word == '9426':
+                self.roll_rows_expected = 3
+            elif word == '94a7':
+                self.roll_rows_expected = 4
+
+            # if content is in the queue, turn it into a caption
+            if not self.buffer.is_empty():
+                self.caption_stash.create_and_store(
+                    self.buffer, self.time)
+                self.buffer = self.node_creator_factory.new_creator()
+
+            # set rows to empty, configure start time for caption
+            self.roll_rows = []
+            self.time = self.time_translator.get_time()
+
+        # clear pop_on buffer [Erase Non-displayed Memory]
+        # NOTE(review): this replaces whichever buffer is active, which is
+        # the pop_on one only if '9420' was the last style command
+        elif word == '94ae':
+            self.buffer = self.node_creator_factory.new_creator()
+
+        # display pop_on buffer [End Of Caption]
+        elif word == '942f':
+            # the caption starts at the time of this command
+            self.time = self.time_translator.get_time()
+            self.caption_stash.create_and_store(self.buffer, self.time)
+            self.buffer = self.node_creator_factory.new_creator()
+
+        # roll up captions [Carriage Return]
+        elif word == '94ad':
+            # display roll-up buffer
+            if not self.buffer.is_empty():
+                self._roll_up()
+
+        # clear screen [Erase Displayed Memory]
+        elif word == '942c':
+            self.roll_rows = []
+
+            # XXX - The 942c command has nothing to do with paint-ons
+            # This however is legacy code, and will break lots of tests if
+            # the proper buffer (self.buffer) is used.
+            # Most likely using `self.buffer` instead of the paint buffer
+            # is the right thing to do, but this needs some further attention.
+            # NOTE(review): the paint buffer is the one stored, but the
+            # buffer replaced afterwards is the active one
+            if not self.buffer_dict['paint'].is_empty():
+                self.caption_stash.create_and_store(
+                    self.buffer_dict['paint'], self.time)
+                self.buffer = self.node_creator_factory.new_creator()
+
+            # attempt to add proper end time to last caption(s)
+            self.caption_stash.correct_last_timing(
+                self.time_translator.get_time())
+
+        # if command not one of the aforementioned, add to buffer
+        else:
+            self.buffer.interpret_command(word)
+
+    def _translate_characters(self, word):
+        # split word into the 2 bytes
+        byte1 = word[:2]
+        byte2 = word[2:]
+
+        # check to see if the the bytes are recognized characters
+        if byte1 not in CHARACTERS or byte2 not in CHARACTERS:
+            return
+
+        self.buffer.add_chars(CHARACTERS[byte1], CHARACTERS[byte2])
+
+    @property
+    def buffer(self):
+        """Returns the currently active buffer
+        """
+        return self.buffer_dict.get_active()
+
+    @buffer.setter
+    def buffer(self, value):
+        """Sets a new value to the active key
+
+        :param value: any object
+        """
+        try:
+            key = self.buffer_dict.active_key
+            self.buffer_dict[key] = value
+        except TypeError:
+            pass
+
+    def _roll_up(self):
+        # We expect the active buffer to be the rol buffer
+        if self.simulate_roll_up:
+            if self.roll_rows_expected > 1:
+                if len(self.roll_rows) >= self.roll_rows_expected:
+                    self.roll_rows.pop(0)
+
+                self.roll_rows.append(self.buffer)
+                self.buffer = self.node_creator_factory.from_list(
+                    self.roll_rows)
+
+        # convert buffer and empty
+        self.caption_stash.create_and_store(self.buffer, self.time)
+        self.buffer = self.node_creator_factory.new_creator()
+
+        # configure time
+        self.time = self.time_translator.get_time()
+
+        # try to insert the proper ending time for the previous caption
+        self.caption_stash.correct_last_timing(self.time, force=True)
+
+
+class SCCWriter(BaseWriter):
+
+    def __init__(self, *args, **kw):
+        """Create the writer; every argument is passed on, unchanged, to the
+        parent class initializer.
+        """
+        super(SCCWriter, self).__init__(*args, **kw)
+
+    def write(self, caption_set):
+        output = HEADER + '\n\n'
+
+        if caption_set.is_empty():
+            return output
+
+        caption_set = deepcopy(caption_set)
+
+        # Only support one language.
+        lang = list(caption_set.get_languages())[0]
+        captions = caption_set.get_captions(lang)
+
+        # PASS 1: compute codes for each caption
+        codes = [(self._text_to_code(caption), caption.start, caption.end)
+                 for caption in captions]
+
+        # PASS 2:
+        # Advance start times so as to have time to write to the pop-on
+        # buffer; possibly remove the previous clear-screen command
+        for index, (code, start, end) in enumerate(codes):
+            code_words = len(code) / 5 + 8
+            code_time_microseconds = code_words * MICROSECONDS_PER_CODEWORD
+            code_start = start - code_time_microseconds
+            if index == 0:
+                continue
+            previous_code, previous_start, previous_end = codes[index-1]
+            if previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start:
+                codes[index-1] = (previous_code, previous_start, None)
+            codes[index] = (code, code_start, end)
+
+        # PASS 3:
+        # Write captions.
+        for (code, start, end) in codes:
+            output += ('%s\t' % self._format_timestamp(start))
+            output += '94ae 94ae 9420 9420 '
+            output += code
+            output += '942c 942c 942f 942f\n\n'
+            if end is not None:
+                output += '%s\t942c 942c\n\n' % self._format_timestamp(end)
+
+        return output
+
+    # Wrap lines at 32 chars
+    @staticmethod
+    def _layout_line(caption):
+        def caption_node_to_text(caption_node):
+            if caption_node.type_ == CaptionNode.TEXT:
+                return six.text_type(caption_node.content)
+            elif caption_node.type_ == CaptionNode.BREAK:
+                return '\n'
+        caption_text = ''.join(
+            [caption_node_to_text(node) for node in caption.nodes])
+        inner_lines = caption_text.split('\n')
+        inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines]
+        return '\n'.join(inner_lines_laid_out)
+
+    @staticmethod
+    def _maybe_align(code):
+        # Finish a half-word with a no-op so we can move to a full word
+        if len(code) % 5 == 2:
+            code += '80 '
+        return code
+
+    @staticmethod
+    def _maybe_space(code):
+        if len(code) % 5 == 4:
+            code += ' '
+        return code
+
+    def _print_character(self, code, char):
+        try:
+            char_code = CHARACTER_TO_CODE[char]
+        except KeyError:
+            try:
+                char_code = SPECIAL_OR_EXTENDED_CHAR_TO_CODE[char]
+            except KeyError:
+                char_code = '91b6'  # Use £ as "unknown character" symbol
+
+        if len(char_code) == 2:
+            return code + char_code
+        elif len(char_code) == 4:
+            return self._maybe_align(code) + char_code
+        else:
+            # This should not happen!
+            return code
+
+    def _text_to_code(self, s):
+        code = ''
+        lines = self._layout_line(s).split('\n')
+        for row, line in enumerate(lines):
+            row += 16 - len(lines)
+            # Move cursor to column 0 of the destination row
+            for _ in range(2):
+                code += ('%s%s ' % (PAC_HIGH_BYTE_BY_ROW[row],
+                                    PAC_LOW_BYTE_BY_ROW_RESTRICTED[row]))
+            # Print the line using the SCC encoding
+            for char in line:
+                code = self._print_character(code, char)
+                code = self._maybe_space(code)
+            code = self._maybe_align(code)
+        return code
+
+    @staticmethod
+    def _format_timestamp(microseconds):
+        seconds_float = microseconds / 1000.0 / 1000.0
+        # Convert to non-drop-frame timecode
+        seconds_float *= 1000.0 / 1001.0
+        hours = math.floor(seconds_float / 3600)
+        seconds_float -= hours * 3600
+        minutes = math.floor(seconds_float / 60)
+        seconds_float -= minutes * 60
+        seconds = math.floor(seconds_float)
+        seconds_float -= seconds
+        frames = math.floor(seconds_float * 30)
+        return '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frames)
+
+
+class _SccTimeTranslator(object):
+    """Converts SCC time to microseconds, keeping track of frames passed
+    """
+    def __init__(self):
+        self._time = '00:00:00;00'
+
+        # microseconds. The offset from which we begin the time calculation
+        self.offset = 0
+        self._frames = 0
+
+    def get_time(self):
+        """Returns the time, in microseconds. Takes into account the number of
+        frames passed, and the offset
+
+        :rtype: int
+        """
+        return self._translate_time(
+            self._time[:-2] + six.text_type(int(self._time[-2:]) + self._frames),
+            self.offset
+        )
+
+    @staticmethod
+    def _translate_time(stamp, offset):
+        """
+        :param stamp:
+        :type offset: int
+        :param offset: Subtract this many microseconds from the calculated time
+            Helpful for when the captions are off by some time interval.
+        :rtype: int
+        """
+        if ';' in stamp:
+            # Drop-frame timebase runs at the same rate as wall clock
+            seconds_per_timestamp_second = 1.0
+        else:
+            # Non-drop-frame timebase runs "slow"
+            # 1 second of timecode is longer than an actual second (1.001s)
+            seconds_per_timestamp_second = 1001.0 / 1000.0
+
+        time_split = stamp.replace(';', ':').split(':')
+
+        timestamp_seconds = (int(time_split[0]) * 3600 +
+                             int(time_split[1]) * 60 +
+                             int(time_split[2]) +
+                             int(time_split[3]) / 30.0)
+
+        seconds = timestamp_seconds * seconds_per_timestamp_second
+        microseconds = seconds * 1000 * 1000 - offset
+
+        if microseconds < 0:
+            microseconds = 0
+
+        return microseconds
+
+    def start_at(self, timespec):
+        """Reset the counter to the given time
+
+        :type timespec: unicode
+        """
+        self._time = timespec
+        self._frames = 0
+
+    def increment_frames(self):
+        """After a command was processed, we'd increment the number of frames
+        """
+        self._frames += 1
+
+
+def _is_pac_command(word):
+    """Checks whether the given word is a Preamble Address Code [PAC] command
+
+    :type word: unicode
+    :param word: 4 letter unicode command
+
+    :rtype: bool
+    """
+    if not word or len(word) != 4:
+        return False
+
+    byte1, byte2 = word[:2], word[2:]
+
+    try:
+        PAC_BYTES_TO_POSITIONING_MAP[byte1][byte2]
+    except KeyError:
+        return False
+    else:
+        return True
--- a/Show More
+++ b/Show More