1
0
This commit is contained in:
Khan
2021-09-01 02:57:54 +05:00
parent 9df940f1fd
commit bf3c3712dd
222 changed files with 1007430 additions and 0 deletions
@@ -0,0 +1,5 @@
from .kanji_to_romaji_module import (
    convert_hiragana_to_katakana,
    convert_katakana_to_hiragana,
    kanji_to_romaji,
    load_kana_mappings_dict,
    load_kanji_mappings_dict,
    translate_long_vowel,
    translate_soukon,
    translate_soukon_ch,
    translate_to_romaji,
)

# Every name listed in __all__ must be bound in this namespace, otherwise
# `from <package> import *` raises AttributeError. The previous list named
# "load_mappings_dict" and "convert_katakana_to_hiragana" without importing
# them; the module defines load_kana_mappings_dict / load_kanji_mappings_dict,
# so those are exported instead of the non-existent loader.
__all__ = [
    "load_kana_mappings_dict",
    "load_kanji_mappings_dict",
    "convert_hiragana_to_katakana",
    "convert_katakana_to_hiragana",
    "translate_to_romaji",
    "translate_soukon",
    "translate_long_vowel",
    "translate_soukon_ch",
    "kanji_to_romaji",
]
Binary file not shown.
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,154 @@
{
"": "!",
"": "\"",
"": "#",
"": "$",
"": "%",
"": "&",
"": "'",
"": "*",
"": "+",
"": ",",
"": "-",
"": ".",
"": "\/",
"": "0",
"": "1",
"": "2",
"": "3",
"": "4",
"": "5",
"": "6",
"": "7",
"": "8",
"": "9",
"": ":",
"": ";",
"": "<",
"": "=",
"": ">",
"": "?",
"": "@",
"": "A",
"": "B",
"": "C",
"": "D",
"": "E",
"": "F",
"": "G",
"": "H",
"": "I",
"": "J",
"": "K",
"": "L",
"": "M",
"": "N",
"И": "N",
"": "O",
"": "P",
"": "Q",
"": "R",
"": "S",
"": "T",
"": "U",
"": "V",
"": "W",
"": "X",
"": "Y",
"": "Z",
"": "[",
"": "\\",
"": "]",
"": "^",
"_": "_",
"": "'",
"": "a",
"": "b",
"": "c",
"": "d",
"": "e",
"": "f",
"": "g",
"": "h",
"": "i",
"": "j",
"": "k",
"": "l",
"": "m",
"": "n",
"": "o",
"": "p",
"": "q",
"": "r",
"": "s",
"": "t",
"": "u",
"": "v",
"": "w",
"": "x",
"": "y",
"": "z",
"": "{",
"": "|",
"": "}",
"": "~",
"Ā": "A",
"Ă": "A",
"Ą": "A",
"â": "a",
"ā": "a",
"ă": "a",
"ą": "a",
"Ē": "E",
"Ĕ": "E",
"Ė": "E",
"Ę": "E",
"Ě": "E",
"ē": "e",
"ĕ": "e",
"ė": "e",
"ę": "e",
"ě": "e",
"Ī": "I",
"Ĭ": "I",
"Į": "I",
"ī": "i",
"ĭ": "i",
"į": "i",
"Ō": "O",
"Ŏ": "O",
"Ő": "O",
"ō": "o",
"ŏ": "o",
"ő": "o",
"Ũ": "U",
"Ū": "U",
"Ŭ": "U",
"Ů": "U",
"Ű": "U",
"Ų": "U",
"ũ": "u",
"ū": "u",
"ŭ": "u",
"ů": "u",
"ű": "u",
"ų": "u",
"Ӓ": "A",
"Ӑ": "A",
"Ѐ": "E",
"Ё": "E",
"Ӧ": "O",
"ӓ": "a",
"ӑ": "a",
"ѐ": "e",
"ё": "e",
"ӧ": "o",
"ω": "w",
"€": "E",
"∃": "E",
"ϛ": "c"
}
@@ -0,0 +1,120 @@
{
"ぁ": "a",
"あ": "a",
"ぃ": "i",
"い": "i",
"ぅ": "u",
"う": "u",
"ぇ": "e",
"え": "e",
"ぉ": "o",
"お": "o",
"か": "ka",
"が": "ga",
"き": "ki",
"きゃ": "kya",
"きゅ": "kyu",
"きょ": "kyo",
"ぎ": "gi",
"ぎゃ": "gya",
"ぎゅ": "gyu",
"ぎょ": "gyo",
"く": "ku",
"ぐ": "gu",
"け": "ke",
"げ": "ge",
"こ": "ko",
"ご": "go",
"さ": "sa",
"ざ": "za",
"し": "shi",
"しゃ": "sha",
"しゅ": "shu",
"しょ": "sho",
"じ": "ji",
"じゃ": "ja",
"じゅ": "ju",
"じょ": "jo",
"す": "su",
"ず": "zu",
"せ": "se",
"ぜ": "ze",
"そ": "so",
"ぞ": "zo",
"た": "ta",
"だ": "da",
"ち": "chi",
"ちゃ": "cha",
"ちゅ": "chu",
"ちょ": "cho",
"ぢ": "ji",
"つ": "tsu",
"づ": "zu",
"て": "te",
"で": "de",
"と": "to",
"ど": "do",
"な": "na",
"に": "ni",
"にゃ": "nya",
"にゅ": "nyu",
"にょ": "nyo",
"ぬ": "nu",
"ね": "ne",
"の": "no",
"は": "ha",
"ば": "ba",
"ぱ": "pa",
"ひ": "hi",
"ひゃ": "hya",
"ひゅ": "hyu",
"ひょ": "hyo",
"び": "bi",
"びゃ": "bya",
"びゅ": "byu",
"びょ": "byo",
"ぴ": "pi",
"ぴゃ": "pya",
"ぴゅ": "pyu",
"ぴょ": "pyo",
"ふ": "fu",
"ぶ": "bu",
"ぷ": "pu",
"へ": "he",
"べ": "be",
"ぺ": "pe",
"ほ": "ho",
"ぼ": "bo",
"ぽ": "po",
"ま": "ma",
"み": "mi",
"みゃ": "mya",
"みゅ": "myu",
"みょ": "myo",
"む": "mu",
"め": "me",
"も": "mo",
"や": "ya",
"ゆ": "yu",
"よ": "yo",
"ら": "ra",
"り": "ri",
"りゃ": "rya",
"りゅ": "ryu",
"りょ": "ryo",
"る": "ru",
"れ": "re",
"ろ": "ro",
"ゎ": "wa",
"わ": "wa",
"ゐ": "wi",
"ゑ": "we",
"を": " wo ",
"ん": "n",
"ゔ": "vu",
"ゕ": "ka",
"ゖ": "ke",
"ゝ": "iteration_mark",
"ゞ": "voiced_iteration_mark",
"ゟ": "yori"
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,18 @@
{
"今日": {
"w_type": "noun",
"romaji": "kyou"
},
"明日": {
"w_type": "noun",
"romaji": "ashita"
},
"本": {
"w_type": "noun",
"romaji": "hon"
},
"中": {
"w_type": "noun",
"romaji": "naka"
}
}
@@ -0,0 +1,78 @@
{
"朝日奈丸佳": {
"w_type": "noun",
"romaji": "Asahina Madoka"
},
"高海千歌": {
"w_type": "noun",
"romaji": "Takami Chika"
},
"鏡音レン": {
"w_type": "noun",
"romaji": "Kagamine Len"
},
"鏡音リン": {
"w_type": "noun",
"romaji": "Kagamine Rin"
},
"逢坂大河": {
"w_type": "noun",
"romaji": "Aisaka Taiga"
},
"水樹奈々": {
"w_type": "noun",
"romaji": "Mizuki Nana"
},
"桜内梨子": {
"w_type": "noun",
"romaji": "Sakurauchi Riko"
},
"山吹沙綾": {
"w_type": "noun",
"romaji": "Yamabuki Saaya"
},
"初音ミク": {
"w_type": "noun",
"romaji": "Hatsune Miku"
},
"渡辺曜": {
"w_type": "noun",
"romaji": "Watanabe You"
},
"原由実": {
"w_type": "noun",
"romaji": "Hara Yumi"
},
"北宇治": {
"w_type": "noun",
"romaji": "Kita Uji"
},
"六本木": {
"w_type": "noun",
"romaji": "Roppongi"
},
"久美子": {
"w_type": "noun",
"romaji": "Kumiko"
},
"政宗": {
"w_type": "noun",
"romaji": "Masamune"
},
"小林": {
"w_type": "noun",
"romaji": "Kobayashi"
},
"奥寺": {
"w_type": "noun",
"romaji": "Okudera"
},
"佐藤": {
"w_type": "noun",
"romaji": "Satou"
},
"玲子": {
"w_type": "noun",
"romaji": "Reiko"
}
}
@@ -0,0 +1,159 @@
{
"ァ": "a",
"ア": "a",
"ィ": "i",
"イ": "i",
"イィ": "yi",
"イェ": "ye",
"ゥ": "u",
"ウ": "u",
"ウィ": "wi",
"ウェ": "we",
"ウォ": "wo",
"ェ": "e",
"エ": "e",
"ォ": "o",
"オ": "o",
"カ": "ka",
"ガ": "ga",
"キ": "ki",
"キェ": "kye",
"キャ": "kya",
"キュ": "kyu",
"キョ": "kyo",
"ギ": "gi",
"ギェ": "gye",
"ギャ": "gya",
"ギュ": "gyu",
"ギョ": "gyo",
"ク": "ku",
"クァ": "kwa",
"クィ": "kwi",
"クェ": "kwe",
"クォ": "kwo",
"グ": "gu",
"グァ": "gwa",
"グィ": "gwi",
"グェ": "gwe",
"グォ": "gwo",
"ケ": "ke",
"ゲ": "ge",
"コ": "ko",
"ゴ": "go",
"サ": "sa",
"ザ": "za",
"シ": "shi",
"シェ": "she",
"シャ": "sha",
"シュ": "shu",
"ショ": "sho",
"ジ": "ji",
"ジェ": "je",
"ジャ": "ja",
"ジュ": "ju",
"ジョ": "jo",
"ス": "su",
"スィ": "si",
"ズ": "zu",
"ズィ": "zi",
"セ": "se",
"ゼ": "ze",
"ソ": "so",
"ゾ": "zo",
"タ": "ta",
"ダ": "da",
"チ": "chi",
"チェ": "che",
"チャ": "cha",
"チュ": "chu",
"チョ": "cho",
"ヂ": "ji",
"ツ": "tsu",
"ツァ": "tsa",
"ツィ": "tsi",
"ツェ": "tse",
"ツォ": "tso",
"ヅ": "zu",
"テ": "te",
"ティ": "ti",
"デ": "de",
"ディ": "di",
"ト": "to",
"トゥ": "tu",
"ド": "do",
"ドゥ": "du",
"ナ": "na",
"ニ": "ni",
"ニャ": "nya",
"ニュ": "nyu",
"ニョ": "nyo",
"ヌ": "nu",
"ネ": "ne",
"": "no",
"ハ": "ha",
"バ": "ba",
"パ": "pa",
"ヒ": "hi",
"ヒャ": "hya",
"ヒュ": "hyu",
"ヒョ": "hyo",
"ビ": "bi",
"ビャ": "bya",
"ビュ": "byu",
"ビョ": "byo",
"ピ": "pi",
"ピャ": "pya",
"ピュ": "pyu",
"ピョ": "pyo",
"フ": "fu",
"ファ": "fa",
"フィ": "fi",
"フェ": "fe",
"フォ": "fo",
"ブ": "bu",
"プ": "pu",
"ヘ": "he",
"ベ": "be",
"ペ": "pe",
"ホ": "ho",
"ホゥ": "hu",
"ボ": "bo",
"ポ": "po",
"マ": "ma",
"ミ": "mi",
"ミャ": "mya",
"ミュ": "myu",
"ミョ": "myo",
"ム": "mu",
"メ": "me",
"モ": "mo",
"ヤ": "ya",
"ユ": "yu",
"ヨ": "yo",
"ラ": "ra",
"リ": "ri",
"リャ": "rya",
"リュ": "ryu",
"リョ": "ryo",
"ル": "ru",
"レ": "re",
"ロ": "ro",
"ヮ": "wa",
"ワ": "wa",
"ヰ": "wi",
"ヱ": "we",
"ヲ": "wo",
"ン": "n",
"ヴ": "vu",
"ヴァ": "va",
"ヴィ": "vi",
"ヴェ": "ve",
"ヴォ": "vo",
"ヵ": "ka",
"ヶ": "ke",
"ヺ": "vo",
"・": " ",
"ヽ": "iteration_mark",
"ヾ": "voiced_iteration_mark",
"ヿ": "koto"
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,103 @@
{
"\u200b": "",
"「": "[",
"」": "]",
"『": "[",
"』": "]",
"": "(",
"": ")",
"": "[",
"": "]",
"": "{",
"": "}",
"〈": "(",
"〉": ")",
"【": "[",
"】": "]",
"": "[",
"": "]",
"〖": "[",
"〗": "]",
"〘": "[",
"〙": "]",
"〚": "[",
"〛": "]",
"": "--",
"〓": "-",
"": "=",
"〜": "~",
"…": "_",
"※": "",
"♪": "",
"♫": "",
"♬": "",
"♩": "",
"": "!",
"": "?",
"、": ",",
"♥": " ",
"«": "(",
"»": ")",
"≪": "(",
"≫": ")",
"": "-",
"”": "",
"“": "",
"゙": "",
"": "'",
"": "",
"→": "",
"⇒": "",
"∞": " ",
"☆": " ",
"♠": " ",
"ᷨ": " ",
"ꯑ": " ",
"ᤙ": " ",
"": " ",
"△": "" ,
"★": " ",
"♡": " ",
"。": "",
"゚": "",
"(": "(",
")": ")",
"∀": "a",
"ά": "a",
"ɪ": "I",
"˥": "l",
"゚": "",
"—": "-",
"Я": "",
"Ψ": "",
"┐": "",
"ə": "",
"ˈ": "",
"×": " x ",
"†": "",
"≡": " ",
"": "",
"": "-",
"⇔": " ",
"≒": " ",
"〆": "shime",
"\u3000": " "
}
@@ -0,0 +1,646 @@
# coding=utf-8
import os
import sys
from collections import OrderedDict
try:
# noinspection PyPackageRequirements
import simplejson as json
except ImportError:
import json
from .models import UnicodeRomajiMapping
from .models import KanjiBlock
from .models import Particle
PATH_TO_MODULE = os.path.dirname(__file__)
JP_MAPPINGS_PATH = os.path.join(PATH_TO_MODULE, "jp_mappings")

# Single-character markers that get special treatment during translation.
# They are written as \u escapes so the literals cannot be silently emptied
# by a re-encoding step; an empty string here disables every comparison made
# against these names in this module.
hiragana_iter_mark = "\u309D"  # ゝ
hiragana_voiced_iter_mark = "\u309E"  # ゞ
katakana_iter_mark = "\u30FD"  # ヽ
katakana_voiced_iter_mark = "\u30FE"  # ヾ
kanji_iteration_mark = "\u3005"  # 々
hirgana_soukon_unicode_char = "\u3063"  # っ
katakana_soukon_unicode_char = "\u30C3"  # ッ
katakana_long_vowel_mark = "\u30FC"  # ー
def load_kana_mappings_dict():
    """
    build the kana -> romaji lookup table
    every ".json" file in JP_MAPPINGS_PATH whose name does not contain "kanji" is merged in to one dict
    when two files share a key the file listed later by os.listdir wins
    :return: dict - merged kana to romaji mapping
    """
    merged = {}
    for file_name in os.listdir(JP_MAPPINGS_PATH):
        extension = os.path.splitext(file_name)[1]
        if extension != ".json" or "kanji" in file_name:
            continue
        mapping_path = os.path.join(JP_MAPPINGS_PATH, file_name)
        with open(mapping_path, encoding='utf-8-sig') as mapping_file:
            merged.update(json.load(mapping_file))
    return merged
def load_kanji_mappings_dict():
    """
    build the kanji -> reading lookup table from every ".json" file in JP_MAPPINGS_PATH
    whose name contains "kanji"
    files with "conjugated" in their name are processed last so their entries have lower priority
    when a key is seen again with a different w_type, the first entry is kept and the new reading
    is stored under its "other_readings", keyed by the new w_type
    when a key is seen again with the same w_type, the later entry replaces the earlier one
    e.g:
    {"係り": {
        "w_type": "noun",
        "romaji": "kakari",
        "other_readings": {"godan verb stem": "kakawari"}
    }}
    :return: dict - kanji to romaji mapping
    """
    mapping = {}
    all_names = os.listdir(JP_MAPPINGS_PATH)
    # stable partition: everything else first, "conjugated" files at the end
    ordered_names = [n for n in all_names if "conjugated" not in n]
    ordered_names += [n for n in all_names if "conjugated" in n]

    for file_name in ordered_names:
        if os.path.splitext(file_name)[1] != ".json" or "kanji" not in file_name:
            continue
        with open(os.path.join(JP_MAPPINGS_PATH, file_name), encoding='utf-8-sig') as mapping_file:
            entries = json.load(mapping_file)
        for kanji, entry in entries.items():
            if kanji in mapping and entry["w_type"] != mapping[kanji]["w_type"]:
                other_readings = mapping[kanji].setdefault("other_readings", {})
                other_readings[entry["w_type"]] = entry["romaji"]
            else:
                mapping[kanji] = entry
    return mapping
def _convert_hira_kata_char(hira_or_kata_char, h_to_k=True):
    """
    convert one kana character to the matching character of the other script
    matching hiragana and katakana code points are 0x60 apart
    e.g hiragana u3041 -> 0x3041 + 0x60 = 0x30A1 -> katakana u30A1
    works on code points directly; the previous bytes/unicode_escape juggling only ran on Python 2
    :param hira_or_kata_char: single hiragana (h_to_k=True) or katakana (h_to_k=False) character
    :param h_to_k: True for hiragana -> katakana, False for katakana -> hiragana
    :return: converted character (str)
    """
    offset = 0x60 if h_to_k else -0x60
    return chr(ord(hira_or_kata_char) + offset)


def convert_hiragana_to_katakana(hiragana):
    """
    convert every hiragana character (and the hiragana iteration marks / soukon) to katakana
    all other characters are copied unchanged
    :param hiragana: str that may contain hiragana
    :return: str with hiragana replaced by katakana
    """
    convertible_marks = (hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char)
    return "".join(
        _convert_hira_kata_char(c) if is_hiragana(c) or c in convertible_marks else c
        for c in hiragana
    )


def convert_katakana_to_hiragana(katakana):
    """
    convert every katakana character (and the katakana iteration marks / soukon) to hiragana
    all other characters, including the long vowel mark, are copied unchanged
    :param katakana: str that may contain katakana
    :return: str with katakana replaced by hiragana
    """
    convertible_marks = (katakana_iter_mark, katakana_voiced_iter_mark, katakana_soukon_unicode_char)
    return "".join(
        _convert_hira_kata_char(c, h_to_k=False) if is_katakana(c) or c in convertible_marks else c
        for c in katakana
    )
def is_hiragana(c):
    """
    check if c lies in the hiragana range u3041 - u3096
    the hiragana iteration marks and the soukon are never reported as hiragana
    :param c: character to check
    :return: True if c is a regular hiragana character
    """
    if c in (hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char):
        return False
    return "\u3041" <= c <= "\u3096"
def is_katakana(c):
    """
    check if c lies in the katakana range u30A1 - u30F6
    the katakana iteration marks, soukon and long vowel mark are never reported as katakana
    :param c: character to check
    :return: True if c is a regular katakana character
    """
    excluded = (katakana_iter_mark, katakana_voiced_iter_mark,
                katakana_soukon_unicode_char, katakana_long_vowel_mark)
    if c in excluded:
        return False
    return "\u30A1" <= c <= "\u30F6"
def is_kanji(c):
    """
    check if c is kanji
    a KanjiBlock always counts as kanji; otherwise c must lie in u4E00 - u9FD5
    and must not be the kanji iteration mark
    :param c: character or KanjiBlock to check
    :return: True if c is kanji
    """
    if isinstance(c, KanjiBlock):
        return True
    return c != kanji_iteration_mark and "\u4E00" <= c <= "\u9FD5"
def get_char_type(c):
    """
    classify c by asking is_hiragana, is_katakana and is_kanji in that order
    :param c: kana or kanji character
    :return: "hiragana", "katakana", "kanji" or None when no check matches
    """
    checks = (
        ("hiragana", is_hiragana),
        ("katakana", is_katakana),
        ("kanji", is_kanji),
    )
    for type_name, belongs_to in checks:
        if belongs_to(c):
            return type_name
    return None
def translate_particles(kana_list):
    """
    find hiragana that act as particles and replace them with Particle objects
    a Particle carries its own spacing and particle reading (e.g wa instead of ha for は)
    a candidate character becomes a particle when any of these holds:
        it sits between a noun KanjiBlock and a following KanjiBlock
        (for の the following element must be a noun as well)
        e.g: 私は嬉 -> KanjiBlock(私), は, KanjiBlock(嬉) -> は is particle use wa instead of ha
        the character type (kanji, hiragana, katakana) differs between its two neighbours
        e.g: アパートへくる -> ト, へ, く -> katakana, へ, hiragana -> へ is a particle, use e instead of he
        it directly follows one of the particles allowed before it (は and も only)
        it is the last element and the previous element is a noun
        e.g: 会いました友達に -> KanjiBlock(友達) which is a noun, に
    the first element of kana_list is never examined
    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    def is_noun(k_block):
        return hasattr(k_block, "w_type") and ("noun" in k_block.w_type or "pronoun" in k_block.w_type)

    def is_kanji_block(element):
        return isinstance(element, KanjiBlock)

    def type_changes(p, n):
        p_type = get_char_type(p)
        n_type = get_char_type(n)
        if p_type is None or n_type is None:
            return False
        return p_type != n_type

    def particle_imm_follows(prev_c_, valid_prev_particles):
        """
        check that prev_c_ is a Particle and one of valid_prev_particles
        e.g: wa particle can't be followed by wa particle again but ni particle can be followed by wa.
        :param prev_c_: element just before the current one
        :param valid_prev_particles: particles that may directly precede the current character
        :return: True when prev_c_ is an allowed preceding particle
        """
        return isinstance(prev_c_, Particle) and prev_c_ in valid_prev_particles

    no_prtcle = Particle("no")
    wa_prtcle = Particle("wa")
    e_prtcle = Particle("e")
    to_prtcle = Particle("to")
    ni_prtcle = Particle("ni")
    de_prtcle = Particle("de")
    mo_prtcle = Particle("mo")
    ga_prtcle = Particle("ga")

    # (hiragana char, particle, test for the following element, particles allowed right before)
    rules = (
        ("\u306E", no_prtcle, is_noun, ()),
        ("\u306F", wa_prtcle, is_kanji_block, (e_prtcle, to_prtcle, ni_prtcle, de_prtcle)),
        ("\u3082", mo_prtcle, is_kanji_block, (ni_prtcle, de_prtcle)),
        ("\u3078", e_prtcle, is_kanji_block, ()),
        ("\u3068", to_prtcle, is_kanji_block, ()),
        ("\u306B", ni_prtcle, is_kanji_block, ()),
        ("\u3067", de_prtcle, is_kanji_block, ()),
        ("\u304C", ga_prtcle, is_kanji_block, ()),
    )

    last_index = len(kana_list) - 1
    for i in range(1, len(kana_list)):
        rule = None
        for candidate in rules:
            if kana_list[i] == candidate[0]:
                rule = candidate
                break
        if rule is None:
            continue
        _, particle, next_is_acceptable, allowed_before = rule

        prev_c = kana_list[i - 1]
        is_last_char = i == last_index
        next_c = "" if is_last_char else kana_list[i + 1]

        if (is_noun(prev_c) and next_is_acceptable(next_c)) or \
                type_changes(prev_c, next_c) or \
                particle_imm_follows(prev_c, allowed_before) or \
                (is_noun(prev_c) and is_last_char):
            kana_list[i] = particle
def translate_kanji_iteration_mark(kana_list):
    """
    translate kanji_iteration_mark: 々
    the mark is replaced in place by the stripped romaji of the element before it
    e.g:
    在々: zaizai
    a mark that does not follow an element with a romaji reading (first element, plain kana,
    kanji missing from the mapping) is left untouched instead of raising AttributeError
    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    prev_c = ""
    for i, element in enumerate(kana_list):
        if element == kanji_iteration_mark:
            prev_romaji = getattr(prev_c, "romaji", None)
            if prev_romaji is not None:
                kana_list[i] = prev_romaji.strip()
        prev_c = kana_list[i]
def get_type_if_verb_stem(curr_chars):
    """
    get the verb type for curr_chars when it has a verb stem reading
    when the primary w_type contains "verb stem" that w_type is returned as is
    otherwise "godan verb" / "ichidan verb" is returned when such a stem is listed in other_readings
    :param curr_chars: kanji chars present in the kanji mapping
    :return: type of verb stem or None if curr_chars has no verb stem reading
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    if "verb stem" in entry["w_type"]:
        return entry["w_type"]
    if "other_readings" in entry:
        other_readings = entry["other_readings"]
        if "godan verb stem" in other_readings:
            return "godan verb"
        if "ichidan verb stem" in other_readings:
            return "ichidan verb"
    return None
def check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len):
    """
    if curr_chars has a verb stem reading, try to match it with one of the listed verb endings
    at its position inside kana_list
    this function only reports the match; kana_list itself is not modified here
    e.g:
    kana_list = [灯, り, ま, し, た]
    curr_chars = 灯り can be verb stem reading
    灯り + ました matches -> returns ("ました", "mashita ")
    kana_list = [灯, り, を, 見, ま, す]
    curr_chars = 灯り can be verb stem reading
    no ending follows 灯り -> returns (None, None)
    :param kana_list: list of characters / KanjiBlock objects being processed
    :param curr_chars: kanji chars (present in the kanji mapping) found at start_pos
    :param start_pos: index in kana_list where curr_chars starts
    :param char_len: number of kana_list elements that make up curr_chars
    :return: ending kana, ending romaji (with trailing space); both None if no ending found
    """
    # Order matters: the first ending that matches wins, so longer endings
    # come before the shorter endings they start with.
    endings = OrderedDict()
    endings["ませんでした"] = "masen deshita"
    endings["ませんで"] = "masende"
    endings["なさるな"] = "nasaruna"
    endings["なかった"] = "nakatta"
    endings["れて"] = "rete"
    endings["ましょう"] = "mashou"
    endings["ました"] = "mashita"
    endings["まして"] = "mashite"
    endings["ません"] = "masen"
    endings["ないで"] = "naide"
    endings["なさい"] = "nasai"
    endings["ます"] = "masu"
    endings["よう"] = "you"  # ichidan
    endings["ない"] = "nai"
    # NOTE(review): the next four keys were empty strings (all colliding on
    # one key, which then matched every stem); they are restored from their
    # romaji values - confirm against the upstream table.
    endings["た"] = "ta"  # ichidan
    endings["て"] = "te"  # ichidan
    endings["ろ"] = "ro"  # ichidan
    endings["う"] = "u"

    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    has_verb_stem_reading = "verb stem" in entry["w_type"] or (
        "other_readings" in entry and (
            "godan verb stem" in entry["other_readings"] or
            "ichidan verb stem" in entry["other_readings"]
        )
    )
    if not has_verb_stem_reading:
        return None, None

    for ending_kana, ending_romaji in endings.items():
        window_end = start_pos + char_len + len(ending_kana)
        if curr_chars + ending_kana == "".join(kana_list[start_pos:window_end]):
            return ending_kana, ending_romaji + " "
    return None, None
def has_non_verb_stem_reading(curr_chars):
    """
    check if curr_chars has a reading that is not a verb stem
    :param curr_chars: kanji chars present in the kanji mapping
    :return: True if the primary w_type, or any w_type in other_readings, is not a verb stem
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    if "verb stem" not in entry["w_type"]:
        return True
    if "other_readings" in entry:
        return any("verb stem" not in w_type for w_type in entry["other_readings"])
    return False
def get_verb_stem_romaji(verb_stem_kanji):
    """
    find the verb stem romaji for verb_stem_kanji within kanji_mapping
    the primary reading is used when its w_type is a verb stem, otherwise the first
    verb stem entry found in other_readings
    :param verb_stem_kanji: kanji chars present in the kanji mapping
    :return: romaji for the verb stem or None if there is no verb stem reading
    """
    entry = UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]
    if "verb stem" in entry["w_type"]:
        return entry["romaji"]
    if "other_readings" in entry:
        for w_type, reading in entry["other_readings"].items():
            if "verb stem" in w_type:
                return reading
    return None
def prepare_kanjiblocks(kchar_list):
    """
    replace runs of characters that are keys of kanji_mapping with KanjiBlock objects
    for every start position the longest run is tried first, then shorter ones
    when the run has a verb stem reading and a known ending follows it, the run plus its ending
    become one "conjugated" KanjiBlock; otherwise the run becomes a KanjiBlock only when it has
    a reading that is not a verb stem
    kanji_mapping is loaded on first use
    :param kchar_list: sequence of kana and kanji characters
    :return: new list in which matched runs are replaced by KanjiBlock objects
    """
    if len(UnicodeRomajiMapping.kanji_mapping) == 0:
        UnicodeRomajiMapping.kanji_mapping = load_kanji_mappings_dict()

    kana_list = list(kchar_list)
    for start_pos in range(len(kchar_list)):
        # the list may shrink while blocks are inserted; the window sizes to try
        # are fixed from the list length at the time this position is reached
        for char_len in range(len(kana_list) - start_pos, 0, -1):
            end_pos = start_pos + char_len
            curr_chars = "".join(kana_list[start_pos:end_pos])
            if curr_chars not in UnicodeRomajiMapping.kanji_mapping:
                continue

            verb_stem_type = get_type_if_verb_stem(curr_chars)
            ending_match_found = False
            if verb_stem_type is not None:
                ending_kana, ending_romaji = check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len)
                if ending_kana is not None and ending_romaji is not None:
                    ending_match_found = True
                    conjugated_val = {
                        "romaji": get_verb_stem_romaji(curr_chars) + ending_romaji,
                        "w_type": "conjugated " + verb_stem_type
                    }
                    # stem and ending collapse in to a single block
                    kana_list[start_pos:end_pos + len(ending_kana)] = [
                        KanjiBlock(curr_chars + ending_kana, conjugated_val)
                    ]

            if not ending_match_found and has_non_verb_stem_reading(curr_chars):
                kana_list[start_pos:end_pos] = [
                    KanjiBlock(curr_chars, UnicodeRomajiMapping.kanji_mapping[curr_chars])
                ]
    return kana_list
def translate_kanji(kana_list):
    """
    replace every KanjiBlock in kana_list with its romaji (in place) and join the list
    :param kana_list: list of str elements and KanjiBlock objects
    :return: str made of all elements joined together
    """
    for index, element in enumerate(kana_list):
        if type(element) is KanjiBlock:
            kana_list[index] = element.romaji
    return "".join(kana_list)
def prep_kanji(kana):
    """
    split kana in to a list and, when it contains kanji, turn known kanji in to KanjiBlock
    objects and translate kanji iteration marks
    :param kana: str of kana and kanji characters
    :return: list of characters and KanjiBlock objects
    """
    if not any(is_kanji(k) for k in kana):
        return list(kana)
    kana_list = prepare_kanjiblocks(kana)
    translate_kanji_iteration_mark(kana_list)
    return kana_list
def translate_to_romaji(kana):
    """
    translate hiragana, katakana, typographic, and fhw latin
    two character keys (e.g きゃ) are translated before single character keys
    afterwards runs of spaces are collapsed and every line is stripped
    kana_mapping is loaded on first use
    :param kana: unicode kana(+kanji) characters
    :return: translated base kana characters to romaji as well as typographic, and fhw latin
    """
    if len(UnicodeRomajiMapping.kana_mapping) == 0:
        UnicodeRomajiMapping.kana_mapping = load_kana_mappings_dict()
    kana_mapping = UnicodeRomajiMapping.kana_mapping

    max_char_len = 2
    for char_len in range(max_char_len, 0, -1):
        start_pos = 0
        while start_pos < len(kana) - char_len + 1:
            curr_chars = kana[start_pos: (start_pos + char_len)]
            if curr_chars in kana_mapping:
                romaji = kana_mapping[curr_chars]
                kana = kana.replace(curr_chars, romaji, 1)
                if len(romaji) == 0:
                    # the key was removed, so the next character now sits at start_pos
                    start_pos -= 1
            start_pos += 1

    # Collapse runs of spaces to one space. The test has to look for a
    # double space: looking for a single space and replacing it with a
    # single space never terminates once the text contains any space.
    while "  " in kana:
        kana = kana.replace("  ", " ")
    kana = kana.strip()
    return "\n".join(line.strip() for line in kana.split("\n"))
def translate_soukon(partial_kana):
    """
    translate both hiragana and katakana soukon: っ, ッ; repeats the character that follows it
    e.g:
    ちょっと will be choっto by the time it is passed to this method and then becomes chotto
    a soukon at the very end has nothing to repeat and is removed (previously an IndexError)
    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with soukon translated
    """
    following_char = ""
    # walk backwards so the character after each soukon is already known
    for c in reversed(partial_kana):
        if c == hirgana_soukon_unicode_char or c == katakana_soukon_unicode_char:
            # following_char[:1] is "" for a trailing soukon
            partial_kana = following_char[:1].join(partial_kana.rsplit(c, 1))
        following_char = c
    return partial_kana
def translate_long_vowel(partial_kana):
    """
    translate katakana long vowel ー; repeats previous vowel
    e.g:
    メール will be meーru by the time it is passed to this method and then becomes meeru
    a mark that does not follow a vowel is removed; this includes a mark at the very
    start of the text (previously an IndexError)
    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with long vowel translated
    """
    vowels = ("a", "e", "i", "o", "u")
    prev_c = ""
    for c in partial_kana:
        if c == katakana_long_vowel_mark:
            # tuple membership on purpose: "" in "aeiou" would be True
            repeated = prev_c if prev_c in vowels else ""
            partial_kana = partial_kana.replace(c, repeated, 1)
        prev_c = c
    return partial_kana
def translate_soukon_ch(kana):
    """
    if soukon(mini-tsu) is followed by chi then soukon romaji becomes 't' sound
    e.g: ko-soukon-chi -> kotchi instead of kocchi
    only the soukon directly in front of ち / チ is rewritten; the earlier rsplit based version
    rewrote the last soukon of the text, which is the wrong one when another soukon comes later
    (e.g こっちにいって)
    :param kana: str of kana that has not been translated to romaji yet
    :return: kana with every soukon that precedes chi replaced by "t"
    """
    soukon_chars = (hirgana_soukon_unicode_char, katakana_soukon_unicode_char)
    chi_chars = ("\u3061", "\u30C1")  # ち, チ
    chars = list(kana)
    for i in range(len(chars) - 1):
        if chars[i] in soukon_chars and chars[i + 1] in chi_chars:
            chars[i] = "t"
    return "".join(chars)
# Unvoiced kana -> voiced (dakuten) kana for the k, s, t and h rows of both scripts.
# NOTE(review): the table had lost all of its characters (every key and value was "");
# it is rebuilt from the 8 x 5 layout and the examples in translate_dakuten_equivalent.
_DAKUTEN_MAPPING = {
    "か": "が", "き": "ぎ", "く": "ぐ", "け": "げ", "こ": "ご",
    "さ": "ざ", "し": "じ", "す": "ず", "せ": "ぜ", "そ": "ぞ",
    "た": "だ", "ち": "ぢ", "つ": "づ", "て": "で", "と": "ど",
    "は": "ば", "ひ": "び", "ふ": "ぶ", "へ": "べ", "ほ": "ぼ",
    "カ": "ガ", "キ": "ギ", "ク": "グ", "ケ": "ゲ", "コ": "ゴ",
    "サ": "ザ", "シ": "ジ", "ス": "ズ", "セ": "ゼ", "ソ": "ゾ",
    "タ": "ダ", "チ": "ヂ", "ツ": "ヅ", "テ": "デ", "ト": "ド",
    "ハ": "バ", "ヒ": "ビ", "フ": "ブ", "ヘ": "ベ", "ホ": "ボ"
}


def _translate_dakuten_equivalent_char(kana_char):
    """
    look up the dakuten equivalent of a single kana character
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    return _DAKUTEN_MAPPING.get(kana_char, "")


def translate_dakuten_equivalent(kana_char):
    """
    translate hiragana and katakana character to their dakuten equivalent
    e.g:
    ヒ: ビ
    く: ぐ
    み: ""
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    return _translate_dakuten_equivalent_char(kana_char)
def translate_kana_iteration_mark(kana):
    """
    translate hiragana and katakana iteration marks: ゝ, ゞ, ヽ, ヾ
    a plain mark repeats the last regular character, a voiced mark repeats its dakuten equivalent
    (nothing when there is none); the result is still kana
    e.g:
    こゝ: ここ
    タヾ: タダ
    かゞみち: かがみち
    :param kana: unicode consisting of kana chars
    :return: unicode with kana iteration marks replaced
    """
    plain_marks = (hiragana_iter_mark, katakana_iter_mark)
    voiced_marks = (hiragana_voiced_iter_mark, katakana_voiced_iter_mark)
    pieces = []
    last_regular_char = ""
    for c in kana:
        if c in plain_marks:
            pieces.append(last_regular_char)
        elif c in voiced_marks:
            pieces.append(translate_dakuten_equivalent(last_regular_char))
        else:
            # marks themselves never become the character to repeat
            last_regular_char = c
            pieces.append(c)
    return "".join(pieces)
def kanji_to_romaji(kana):
    """
    run the whole translation pipeline on kana
    order: kana iteration marks, soukon before chi, kanji blocks, particles, kanji readings,
    kana to romaji, remaining soukon, long vowels
    :param kana: str of kana and/or kanji characters
    :return: bytes - the romaji encoded with unicode_escape
    """
    partial = translate_soukon_ch(translate_kana_iteration_mark(kana))
    blocks = prep_kanji(partial)
    translate_particles(blocks)
    partial = translate_to_romaji(translate_kanji(blocks))
    romaji = translate_long_vowel(translate_soukon(partial))
    return romaji.replace("\\\\", "\\").encode("unicode_escape")
if __name__ == "__main__":
    if len(sys.argv) > 1:
        raw_arg = "".join(sys.argv[1:])
        # Arguments are str on Python 3 and str has no .decode(). Going
        # through ASCII with backslashreplace keeps characters that were typed
        # directly and still expands escapes such as \u30D2.
        text = raw_arg.encode("ascii", "backslashreplace").decode("unicode_escape")
        result = kanji_to_romaji(text)
        if isinstance(result, bytes):
            # unicode_escape output is plain ASCII; avoid printing b'...'
            result = result.decode("ascii")
        print(result)
    else:
        print("Missing Kanji/Kana character argument\n"
              "e.g: kanji_to_romaji.py \\u30D2")
@@ -0,0 +1,29 @@
class KanjiBlock(str):
    """
    Placeholder for a run of kanji that was found in the kanji mapping.

    The string value of every instance is "@"; the data lives in attributes:
        kanji  - the matched kanji text
        romaji - the reading with a leading space; a trailing space is added
                 too unless the block is a single character or a verb stem
        w_type - the word type from the mapping entry, or a list holding it
                 followed by the keys of "other_readings" when those exist

    Positional arguments: kanji (str), kanji_dict (mapping entry with
    "romaji", "w_type" and optionally "other_readings").
    """

    def __new__(cls, *args, **kwargs):
        obj = str.__new__(cls, "@")
        kanji = args[0]
        kanji_dict = args[1]
        obj.kanji = kanji
        if len(kanji) == 1 or "verb stem" in kanji_dict["w_type"]:
            obj.romaji = " " + kanji_dict["romaji"]
        else:
            obj.romaji = " " + kanji_dict["romaji"] + " "
        if "other_readings" in kanji_dict:
            obj.w_type = [kanji_dict["w_type"]]
            obj.w_type.extend(kanji_dict["other_readings"].keys())
        else:
            obj.w_type = kanji_dict["w_type"]
        return obj

    def __repr__(self):
        # __repr__ must return str on Python 3; returning the encoded bytes
        # raised TypeError. The escaped form stays pure ASCII.
        return self.kanji.encode("unicode_escape").decode("ascii")

    def __str__(self):
        # __str__ must return str on Python 3, so the romaji is not encoded.
        return self.romaji
@@ -0,0 +1,6 @@
class Particle(str):
    """
    A particle reading as a str subclass.

    The string value is the romaji padded with one space on each side;
    ``pname`` keeps the bare romaji passed as the first positional argument.
    """

    def __new__(cls, *args, **kwargs):
        name = args[0]
        padded = " " + name + " "
        instance = str.__new__(cls, padded)
        instance.pname = name
        return instance
@@ -0,0 +1,4 @@
# noinspection PyClassHasNoInit
class UnicodeRomajiMapping:
    """Class-level cache for the mapping tables; both start out empty."""
    kana_mapping = dict()  # kana / typographic / latin key -> romaji
    kanji_mapping = dict()  # kanji key -> entry dict
@@ -0,0 +1,5 @@
from .UnicodeRomajiMapping import UnicodeRomajiMapping
from .KanjiBlock import KanjiBlock
from .Particle import Particle
__all__ = ["UnicodeRomajiMapping", "KanjiBlock", "Particle"]
Binary file not shown.
+34
View File
@@ -0,0 +1,34 @@
from .base import (
CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet)
from .dfxp import DFXPWriter, DFXPReader
from .sami import SAMIReader, SAMIWriter
from .srt import SRTReader, SRTWriter
from .scc import SCCReader, SCCWriter
from .webvtt import WebVTTReader, WebVTTWriter
from .exceptions import (
CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError)
__all__ = [
'CaptionConverter', 'DFXPReader', 'DFXPWriter',
'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter',
'SCCReader', 'SCCWriter', 'WebVTTReader', 'WebVTTWriter',
'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError',
'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet'
]
SUPPORTED_READERS = (
DFXPReader, WebVTTReader, SAMIReader, SRTReader, SCCReader)
def detect_format(caps):
    """
    Detect the format of the provided caption string.

    Each class in SUPPORTED_READERS is instantiated and asked in order;
    the first one whose ``detect`` accepts the content wins.

    :returns: the reader class for the detected format, or None.
    """
    return next(
        (reader for reader in SUPPORTED_READERS if reader().detect(caps)),
        None,
    )
+409
View File
@@ -0,0 +1,409 @@
from datetime import timedelta
from numbers import Number
from six import text_type
from .exceptions import CaptionReadError, CaptionReadTimingError
DEFAULT_LANGUAGE_CODE = 'en-US'
def force_byte_string(content):
    """
    Return ``content`` as a UTF-8 byte string.

    Byte strings are returned unchanged. On Python 3 ``bytes`` has no
    ``encode`` method, so without the explicit check a byte string raised
    AttributeError instead of being passed through.

    :param content: text or byte string
    :returns: UTF-8 encoded bytes
    :raises RuntimeError: if the text cannot be encoded as UTF-8
    """
    if isinstance(content, bytes):
        return content
    try:
        return content.encode('UTF-8')
    except UnicodeEncodeError:
        raise RuntimeError('Invalid content encoding')
    except UnicodeDecodeError:
        # Python 2 only: implicit ASCII decoding of a non-ASCII byte string
        return content
class CaptionConverter(object):
    """Holds captions and passes them between a reader and a writer."""

    def __init__(self, captions=None):
        # any falsy value (None, empty list, ...) is replaced by a new list
        self.captions = captions or []

    def read(self, content, caption_reader):
        """
        Store ``caption_reader.read(content)`` as the current captions.

        An AttributeError raised by the call is re-raised as a plain Exception.

        :returns: this converter, so calls can be chained
        """
        try:
            parsed = caption_reader.read(content)
        except AttributeError as error:
            raise Exception(error)
        self.captions = parsed
        return self

    def write(self, caption_writer):
        """
        Return ``caption_writer.write(...)`` applied to the current captions.

        An AttributeError raised by the call is re-raised as a plain Exception.
        """
        try:
            return caption_writer.write(self.captions)
        except AttributeError as error:
            raise Exception(error)
class BaseReader(object):
    """Base class for caption readers; accepts and ignores any constructor arguments."""

    def __init__(self, *args, **kwargs):
        pass

    def detect(self, content):
        """Return True when ``content`` is truthy, False otherwise."""
        return bool(content)

    def read(self, content):
        """Return an empty CaptionSet; ``content`` is not used here."""
        return CaptionSet()
class BaseWriter(object):
    """Base class for caption writers; stores the layout conversion settings."""

    def __init__(self, relativize=True, video_width=None, video_height=None,
                 fit_to_screen=True):
        """
        Initialize writer with the given parameters.

        :param relativize: If True (default), converts absolute positioning
            values (e.g. px) to percentage. ATTENTION: WebVTT does not support
            absolute positioning. If relativize is set to False and it finds
            an absolute positioning parameter for a given caption, it will
            ignore all positioning for that cue and show it in the default
            position.
        :param video_width: The width of the video for which the captions being
            converted were made. This is necessary for relativization.
        :param video_height: The height of the video for which the captions
            being converted were made. This is necessary for relativization.
        :param fit_to_screen: If extent is not set or
            if origin + extent > 100%, (re)calculate it based on origin.
            It is a pycaption fix for caption files that are technically valid
            but contain inconsistent settings that may cause long captions to
            be cut out of the screen.
        """
        self.relativize = relativize
        self.video_width = video_width
        self.video_height = video_height
        self.fit_to_screen = fit_to_screen

    def _relativize_and_fit_to_screen(self, layout_info):
        """
        Apply the configured conversions to ``layout_info``.

        A falsy ``layout_info`` is returned as is.
        """
        if not layout_info:
            return layout_info
        converted = layout_info
        if self.relativize:
            # Transform absolute values (e.g. px) into percentages
            converted = converted.as_percentage_of(
                self.video_width, self.video_height)
        if self.fit_to_screen:
            # Make sure origin + extent <= 100%
            converted = converted.fit_to_screen()
        return converted

    def write(self, content):
        """Return ``content`` unchanged."""
        return content
class Style(object):
    """Empty style object; the constructor takes no arguments and sets nothing."""

    def __init__(self):
        """Create the object without setting any attributes."""
class CaptionNode(object):
    """
    One node of a caption: a piece of text, a style marker or a linebreak.

    Rules:
    1. All nodes should have the property layout_info set.
    The value None means specifically that no positioning information
    should be specified. Each reader is to supply its own default
    values (if necessary) when reading their respective formats.
    """

    TEXT = 1
    # When and if this is extended, it might be better to turn it into a
    # property of the node, not a type of node itself.
    STYLE = 2
    BREAK = 3

    def __init__(self, type_, layout_info=None):
        """
        :type type_: int
        :type layout_info: Layout
        """
        self.layout_info = layout_info
        self.type_ = type_
        # Boolean. Marks the beginning/ end of a Style node.
        self.start = None
        self.content = None

    def __repr__(self):
        """Describe the node by type; unknown types raise RuntimeError."""
        kind = self.type_
        if kind == CaptionNode.TEXT:
            return repr(self.content)
        if kind == CaptionNode.BREAK:
            return repr('BREAK')
        if kind == CaptionNode.STYLE:
            return repr('STYLE: %s %s' % (self.start, self.content))
        raise RuntimeError('Unknown node type: ' + str(kind))

    @staticmethod
    def create_text(text, layout_info=None):
        """Build a TEXT node whose content is ``text``."""
        node = CaptionNode(CaptionNode.TEXT, layout_info=layout_info)
        node.content = text
        return node

    @staticmethod
    def create_style(start, content, layout_info=None):
        """Build a STYLE node.

        :param start: marks the beginning (truthy) or end of the style
        :param content: the style payload stored on the node
        """
        node = CaptionNode(CaptionNode.STYLE, layout_info=layout_info)
        node.start = start
        node.content = content
        return node

    @staticmethod
    def create_break(layout_info=None):
        """Build a BREAK node."""
        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
class Caption(object):
    """
    A single caption, including the time and styling information
    for its display.
    """

    def __init__(self, start, end, nodes, style=None, layout_info=None):
        """
        Initialize the Caption object

        :param start: The start time in microseconds
        :type start: Number
        :param end: The end time in microseconds
        :type end: Number
        :param nodes: A list of CaptionNodes
        :type nodes: list
        :param style: A dictionary with CSS-like styling rules; a fresh
            empty dict is used when omitted
        :type style: dict
        :param layout_info: A Layout object with the necessary positioning
            information
        :type layout_info: Layout
        :raises CaptionReadTimingError: if start or end is not a Number
        :raises CaptionReadError: if nodes is empty
        """
        if not isinstance(start, Number):
            raise CaptionReadTimingError("Captions must be initialized with a"
                                         " valid start time")
        if not isinstance(end, Number):
            raise CaptionReadTimingError("Captions must be initialized with a"
                                         " valid end time")
        if not nodes:
            raise CaptionReadError("Node list cannot be empty")
        self.start = start
        self.end = end
        self.nodes = nodes
        # A ``{}`` default would be one dict shared by every caption created
        # without an explicit style.
        self.style = {} if style is None else style
        self.layout_info = layout_info

    def is_empty(self):
        """Return True when the caption holds no nodes."""
        return len(self.nodes) == 0

    def format_start(self, msec_separator=None):
        """
        Format the start time (stored in microseconds) as an
        ``HH:MM:SS.mmm`` string suitable for some of the supported output
        formats (ex. SRT, DFXP).

        :param msec_separator: replacement for the ``.`` in front of the
            milliseconds
        """
        return self._format_timestamp(self.start, msec_separator)

    def format_end(self, msec_separator=None):
        """
        Format the end time (stored in microseconds) as an
        ``HH:MM:SS.mmm`` string suitable for some of the supported output
        formats (ex. SRT, DFXP).

        :param msec_separator: replacement for the ``.`` in front of the
            milliseconds
        """
        return self._format_timestamp(self.end, msec_separator)

    def __repr__(self):
        return repr(
            '{start} --> {end}\n{text}'.format(
                start=self.format_start(),
                end=self.format_end(),
                text=self.get_text()
            )
        )

    def get_text(self):
        """
        Get the text of the caption: TEXT nodes contribute their content,
        BREAK nodes a newline, any other node nothing. The result is
        stripped.
        """
        pieces = []
        for node in self.nodes:
            if node.type_ == CaptionNode.TEXT:
                pieces.append(node.content)
            elif node.type_ == CaptionNode.BREAK:
                pieces.append('\n')
        return ''.join(pieces).strip()

    def _format_timestamp(self, value, msec_separator=None):
        """Render ``value`` (microseconds) as ``HH:MM:SS<sep>mmm``.

        Sub-millisecond precision is dropped. The fields are computed
        arithmetically: slicing ``str(timedelta)`` to a fixed width lost a
        millisecond digit (and prepended a stray "0") from 10 hours on.

        :param value: time in microseconds; expected to be non-negative
        :param msec_separator: text placed before the milliseconds, ``.``
            when None
        """
        total_msec = int(value / 1000)
        hours, remainder = divmod(total_msec, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        seconds, msec = divmod(remainder, 1000)
        separator = '.' if msec_separator is None else msec_separator
        return '%02d:%02d:%02d%s%03d' % (
            hours, minutes, seconds, separator, msec)
class CaptionList(list):
    """ A list of captions with a layout object attached to it """

    def __init__(self, iterable=None, layout_info=None):
        """
        :param iterable: An iterator used to populate the caption list
        :param Layout layout_info: A Layout object with the positioning info
        """
        self.layout_info = layout_info
        args = [iterable] if iterable else []
        super(CaptionList, self).__init__(*args)

    def __getslice__(self, i, j):
        # Python 2 only: plain ``a[i:j]`` slices are routed here.
        return CaptionList(
            list.__getslice__(self, i, j), layout_info=self.layout_info)

    def __getitem__(self, y):
        """Return one element for an index, or a CaptionList (keeping
        ``layout_info``) for a slice.
        """
        item = list.__getitem__(self, y)
        # Decide on the kind of key, not on the type of the result:
        # otherwise any stored element that is not a Caption would get
        # wrapped in a CaptionList when accessed by index.
        if isinstance(y, slice):
            return CaptionList(item, layout_info=self.layout_info)
        return item

    def __add__(self, other):
        """Concatenate, keeping this list's ``layout_info``.

        :raises ValueError: if ``other`` carries a truthy ``layout_info``
            different from this list's
        """
        add_is_safe = (
            not hasattr(other, 'layout_info') or
            not other.layout_info or
            self.layout_info == other.layout_info
        )
        if not add_is_safe:
            raise ValueError(
                "Cannot add CaptionList objects with different layout_info")
        return CaptionList(
            list.__add__(self, other), layout_info=self.layout_info)

    def __mul__(self, other):
        """Repeat the list, keeping ``layout_info``."""
        return CaptionList(
            list.__mul__(self, other), layout_info=self.layout_info)

    __rmul__ = __mul__
class CaptionSet(object):
    """
    A set of captions in potentially multiple languages,
    all representing the same underlying content.

    The .layout_info attribute, keeps information that should be inherited
    by all the children.
    """

    def __init__(self, captions, styles=None, layout_info=None):
        """
        :param captions: A dictionary of the format {'language': CaptionList}
        :param styles: A dictionary with CSS-like styling rules; a fresh
            empty dict is used when omitted
        :param Layout layout_info: A Layout object with the positioning info
        """
        self._captions = captions
        # ``add_style`` mutates this dict, so a ``{}`` default would make
        # every set created without styles share the same rules.
        self._styles = {} if styles is None else styles
        self.layout_info = layout_info

    def set_captions(self, lang, captions):
        """Store ``captions`` as the caption list of ``lang``."""
        self._captions[lang] = captions

    def get_languages(self):
        """Return the languages that have a caption list, as a list."""
        return list(self._captions.keys())

    def get_captions(self, lang):
        """Return the caption list of ``lang``, or ``[]`` if unknown."""
        return self._captions.get(lang, [])

    def add_style(self, selector, rules):
        """
        :param selector: The selector indicating the elements to which the
            rules should be applied.
        :param rules: A dictionary with CSS-like styling rules.
        """
        self._styles[selector] = rules

    def get_style(self, selector):
        """
        Returns a dictionary with CSS-like styling rules for a given selector.

        :param selector: The selector whose rules should be returned (e.g. an
            element or class name).
        """
        return self._styles.get(selector, {})

    def get_styles(self):
        """Return the (selector, rules) pairs, sorted."""
        return sorted(self._styles.items())

    def set_styles(self, styles):
        """Replace the whole styles dictionary."""
        self._styles = styles

    def is_empty(self):
        """Return True when every language has an empty caption list."""
        return all(
            len(captions) == 0 for captions in self._captions.values()
        )

    def set_layout_info(self, lang, layout_info):
        """Attach ``layout_info`` to the caption list of ``lang``."""
        self._captions[lang].layout_info = layout_info

    def get_layout_info(self, lang):
        """Return the layout of ``lang``'s caption list, or None when that
        list is missing or empty.
        """
        caption_list = self._captions.get(lang)
        if caption_list:
            return caption_list.layout_info
        return None

    def adjust_caption_timing(self, offset=0, rate_skew=1.0):
        """
        Adjust the timing according to offset and rate_skew.
        Skew is applied first, then offset.

        e.g. if skew == 1.1, and offset is 5, a caption originally
        displayed from 10-11 seconds would instead be at 16-17.1

        Captions whose adjusted start is negative are dropped.
        """
        for lang in self.get_languages():
            captions = self.get_captions(lang)
            out_captions = CaptionList()
            for caption in captions:
                caption.start = caption.start * rate_skew + offset
                caption.end = caption.end * rate_skew + offset
                if caption.start >= 0:
                    out_captions.append(caption)
            self.set_captions(lang, out_captions)
# Functions
def merge_concurrent_captions(caption_set):
    """Merge runs of consecutive captions sharing the same start and end.

    Each language of ``caption_set`` is processed independently and its
    caption list is replaced by the merged one (unless nothing was
    produced).

    :param caption_set: the set to process; it is modified in place
    :return: the same ``caption_set``
    """
    for lang in caption_set.get_languages():
        merged = CaptionList()
        group = CaptionList()
        previous = None
        for caption in caption_set.get_captions(lang):
            if previous:
                same_span = (
                    (caption.start, caption.end) ==
                    (previous.start, previous.end))
                if same_span:
                    group.append(caption)
                    previous = caption
                    continue
                # Timing changed: flush the group gathered so far
                merged.append(merge(group))
            group = [caption]
            previous = caption

        if group:
            merged.append(merge(group))
        if merged:
            caption_set.set_captions(lang, merged)
    return caption_set
def merge(captions):
    """
    Combine a list of captions into a single caption.

    The nodes of all captions are concatenated, with a break node inserted
    between captions. Start, end and style are taken from the first
    caption.
    """
    combined_nodes = []
    for caption in captions:
        if combined_nodes:
            combined_nodes.append(CaptionNode.create_break())
        combined_nodes.extend(caption.nodes)
    first = captions[0]
    return Caption(first.start, first.end, combined_nodes, first.style)
+2
View File
@@ -0,0 +1,2 @@
from .base import *
from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter
File diff suppressed because it is too large Load Diff
+248
View File
@@ -0,0 +1,248 @@
# We thought about making pycaption.base objects immutable. This would be nice
# in a lot of cases, but since the transformations on them could be quite
# complex, the deepcopy method is good enough sometimes.
from copy import deepcopy
from .base import DFXPWriter, DFXP_DEFAULT_REGION
from ..base import BaseWriter, CaptionNode, merge_concurrent_captions
from xml.sax.saxutils import escape
from bs4 import BeautifulSoup
# Empty TTML skeleton; LegacyDFXPWriter.write parses it and then fills in
# its <styling>, <layout> and <body> elements.
LEGACY_DFXP_BASE_MARKUP = '''
<tt xmlns="http://www.w3.org/ns/ttml"
xmlns:tts="http://www.w3.org/ns/ttml#styling">
<head>
<styling/>
<layout/>
</head>
<body/>
</tt>
'''
# Style rules written by LegacyDFXPWriter.write when the caption set
# provides no styles at all.
LEGACY_DFXP_DEFAULT_STYLE = {
'color': 'white',
'font-family': 'monospace',
'font-size': '1c',
}
# xml:id of the default style; also the 'class' given to captions that have
# no style of their own.
LEGACY_DFXP_DEFAULT_STYLE_ID = 'default'
# xml:id of the single region LegacyDFXPWriter.write assigns to every caption.
LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom'
# Style rules of that region.
LEGACY_DFXP_DEFAULT_REGION = {
'text-align': 'center',
'display-align': 'after'
}
class SinglePositioningDFXPWriter(DFXPWriter):
    """A dfxp writer that discards the captions' own positioning and applies
    one provided value everywhere.
    """

    def __init__(self, default_positioning=DFXP_DEFAULT_REGION,
                 *args, **kwargs):
        """
        :param default_positioning: the positioning applied to everything
            that is written
        """
        super(SinglePositioningDFXPWriter, self).__init__(*args, **kwargs)
        self.default_positioning = default_positioning

    def write(self, captions_set, force=''):
        """Writes a DFXP file using the positioning provided in the initializer

        :type captions_set: pycaption.base.CaptionSet
        :param force: only write this language, if available in the CaptionSet
        :rtype: unicode
        """
        repositioned = self._create_single_positioning_caption_set(
            captions_set, self.default_positioning)
        return super(SinglePositioningDFXPWriter, self).write(repositioned, force)  # noqa

    @staticmethod
    def _create_single_positioning_caption_set(caption_set, positioning):
        """Return a copy of the caption set in which every piece of
        positioning information was replaced with ``positioning``

        :type caption_set: pycaption.base.CaptionSet
        :rtype: pycaption.base.CaptionSet
        """
        # Work on a deep copy: if this writer modified the caption set it
        # received, any writer using the same caption_set afterwards would
        # be affected. We currently know of no other writer being used, but
        # this is important and mustn't be neglected.
        result = merge_concurrent_captions(deepcopy(caption_set))
        result.layout_info = positioning

        for lang in result.get_languages():
            result.set_layout_info(lang, positioning)
            for caption in result.get_captions(lang):
                caption.layout_info = positioning
                for node in caption.nodes:
                    if hasattr(node, 'layout_info'):
                        node.layout_info = positioning

        # The alignment now comes from the positioning only
        for _, style in result.get_styles():
            if 'text-align' in style:
                del style['text-align']

        return result
class LegacyDFXPWriter(BaseWriter):
    """Ported the legacy DFXPWriter from 0.4.5"""

    def __init__(self, *args, **kw):
        """Accept and ignore any arguments.

        NOTE: BaseWriter.__init__ is not invoked here, so the attributes it
        would set are not defined on instances of this class.
        """
        self.open_span = False
        self.p_style = False

    def write(self, caption_set, force=''):
        """Serialize ``caption_set`` to a DFXP document.

        The caption set is deep-copied first, so the caller's object is not
        modified.

        :param force: when truthy, write a single language: the one equal
            to ``force`` if present, otherwise the last available one
        :return: the prettified markup
        """
        caption_set = merge_concurrent_captions(deepcopy(caption_set))

        dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml')
        dfxp.find('tt')['xml:lang'] = "en"

        for style_id, style in caption_set.get_styles():
            if style != {}:
                dfxp = self._recreate_styling_tag(style_id, style, dfxp)
        if not caption_set.get_styles():
            dfxp = self._recreate_styling_tag(
                LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp)

        # XXX For now we will always use this default region. In the future
        # if regions are provided, they will be kept
        dfxp = self._recreate_region_tag(
            LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp)

        body = dfxp.find('body')

        available = caption_set.get_languages()
        langs = [self._force_language(force, available)] if force else available

        for lang in langs:
            div = dfxp.new_tag('div')
            div['xml:lang'] = '%s' % lang

            for caption in caption_set.get_captions(lang):
                caption_style = caption.style
                if caption_style:
                    caption_style['region'] = LEGACY_DFXP_DEFAULT_REGION_ID
                else:
                    caption_style = {
                        'class': LEGACY_DFXP_DEFAULT_STYLE_ID,
                        'region': LEGACY_DFXP_DEFAULT_REGION_ID,
                    }
                div.append(self._recreate_p_tag(caption, caption_style, dfxp))

            body.append(div)

        return dfxp.prettify(formatter=None)

    # force the DFXP to only have one language, trying to match on "force"
    def _force_language(self, force, langs):
        """Return the entry of ``langs`` equal to ``force``, else the last."""
        fallback = langs[-1]
        return next((lang for lang in langs if force == lang), fallback)

    def _append_styled_tag(self, tag_name, parent_name, tag_id, styling, dfxp):
        """Append a ``tag_name`` element carrying ``tag_id`` and the
        attributes derived from ``styling`` to the ``parent_name`` element,
        unless it equals a tag carrying the id only.
        """
        styled = dfxp.new_tag(tag_name)
        styled.attrs['xml:id'] = tag_id
        styled.attrs.update(self._recreate_style(styling, dfxp))

        bare = dfxp.new_tag(tag_name)
        bare.attrs['xml:id'] = tag_id
        if styled != bare:
            dfxp.find(parent_name).append(styled)
        return dfxp

    def _recreate_region_tag(self, region_id, styling, dfxp):
        """Add a <region> for ``region_id`` to <layout>; returns ``dfxp``."""
        return self._append_styled_tag(
            'region', 'layout', region_id, styling, dfxp)

    def _recreate_styling_tag(self, style, content, dfxp):
        """Add a <style> with id ``style`` to <styling>; returns ``dfxp``."""
        return self._append_styled_tag(
            'style', 'styling', style, content, dfxp)

    def _recreate_p_tag(self, caption, caption_style, dfxp):
        """Build the <p> element of one caption."""
        p = dfxp.new_tag(
            "p", begin=caption.format_start(), end=caption.format_end())
        p.string = self._recreate_text(caption, dfxp)

        if dfxp.find("style", {"xml:id": "p"}):
            p['style'] = 'p'

        p.attrs.update(self._recreate_style(caption_style, dfxp))
        return p

    def _recreate_text(self, caption, dfxp):
        """Render the nodes of ``caption`` as (already escaped) markup."""
        markup = ''
        for node in caption.nodes:
            kind = node.type_
            if kind == CaptionNode.TEXT:
                markup += escape(node.content) + ' '
            elif kind == CaptionNode.BREAK:
                markup = markup.rstrip() + '<br/>\n '
            elif kind == CaptionNode.STYLE:
                markup = self._recreate_span(markup, node, dfxp)
        return markup.rstrip()

    def _recreate_span(self, line, node, dfxp):
        """Open or close a <span> at the end of ``line`` for a style node.

        ``self.open_span`` remembers whether a span is currently open.
        """
        if not node.start:
            if self.open_span:
                line = line.rstrip() + '</span> '
                self.open_span = False
            return line

        attributes = self._recreate_style(node.content, dfxp)
        styles = ''.join(
            ' %s="%s"' % (name, value) for name, value in attributes.items())
        if styles:
            if self.open_span:
                # Close the previous span before opening the new one
                line = line.rstrip() + '</span> '
            line += '<span%s>' % styles
            self.open_span = True
        return line

    def _recreate_style(self, content, dfxp):
        """Translate the style rules in ``content`` to DFXP attributes.

        'region' and 'class' are only kept when the document already holds
        an element with the referenced id.
        """
        dfxp_style = {}

        if 'region' in content:
            if dfxp.find('region', {'xml:id': content['region']}):
                dfxp_style['region'] = content['region']
        if 'class' in content:
            if dfxp.find("style", {"xml:id": content['class']}):
                dfxp_style['style'] = content['class']
        if 'text-align' in content:
            dfxp_style['tts:textAlign'] = content['text-align']
        if 'italics' in content:
            dfxp_style['tts:fontStyle'] = 'italic'
        # The remaining rules are copied over verbatim, in this order
        copied = (
            ('font-family', 'tts:fontFamily'),
            ('font-size', 'tts:fontSize'),
            ('color', 'tts:color'),
            ('display-align', 'tts:displayAlign'),
        )
        for rule, attribute in copied:
            if rule in content:
                dfxp_style[attribute] = content[rule]

        return dfxp_style
File diff suppressed because it is too large Load Diff
+40
View File
@@ -0,0 +1,40 @@
class CaptionReadError(Exception):
    """
    Generic error raised when reading a caption file fails.
    """

    def __str__(self):
        """Render as ``ClassName(args)``."""
        name = self.__class__.__name__
        return "%s(%s)" % (name, self.args)
class CaptionReadNoCaptions(CaptionReadError):
    """
    Error raised when the provided caption file did not contain any
    actual captions.
    """
class CaptionReadSyntaxError(CaptionReadError):
    """
    Error raised when the provided caption file has syntax errors and
    cannot be parsed.
    """
class CaptionReadTimingError(CaptionReadError):
    """
    Error raised when a Caption is created with invalid timings.
    """
class RelativizationError(Exception):
    """
    Error raised when absolute positioning cannot be converted to a
    percentage.
    """
class InvalidInputError(RuntimeError):
    """Error raised when the input is invalid (i.e. a unicode string)."""
+916
View File
@@ -0,0 +1,916 @@
"""
This module implements the classes used to represent positioning information.
CONVENTIONS:
* None of the methods should modify the state of the objects on which they're
called. If the values of an object need to be recalculated, the method
responsible for the recalculation should return a new object with the
necessary modifications.
"""
import six
from enum import Enum
from .exceptions import RelativizationError
class UnitEnum(Enum):
    """Units of measure for lengths; each value is the unit's suffix.

    Usage:
        unit = UnitEnum.PIXEL
        unit = UnitEnum.EM
        if unit == UnitEnum.CELL :
            ...
    """
    PIXEL = 'px'
    EM = 'em'
    PERCENT = '%'
    CELL = 'c'
    PT = 'pt'
class VerticalAlignmentEnum(Enum):
    """The allowed vertical alignment options.

    Usage:
        alignment = VerticalAlignmentEnum.TOP
        if alignment == VerticalAlignmentEnum.BOTTOM:
            ...
    """
    TOP = 'top'
    CENTER = 'center'
    BOTTOM = 'bottom'
class HorizontalAlignmentEnum(Enum):
    """The allowed horizontal alignment options."""
    LEFT = 'left'
    CENTER = 'center'
    RIGHT = 'right'
    START = 'start'
    END = 'end'
class Alignment(object):
    """Pairs a horizontal with a vertical alignment."""

    def __init__(self, horizontal, vertical):
        """
        :type horizontal: HorizontalAlignmentEnum
        :param horizontal: HorizontalAlignmentEnum member
        :type vertical: VerticalAlignmentEnum
        :param vertical: VerticalAlignmentEnum member
        """
        self.horizontal = horizontal
        self.vertical = vertical

    def __hash__(self):
        return hash(
            hash(self.horizontal) * 83 +
            hash(self.vertical) * 89 +
            97
        )

    def __eq__(self, other):
        # bool(): a bare ``other and ...`` hands back ``other`` itself
        # (e.g. None) instead of False when ``other`` is falsy.
        return bool(
            other and
            type(self) == type(other) and
            self.horizontal == other.horizontal and
            self.vertical == other.vertical
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __repr__(self):
        return "<Alignment ({horizontal} {vertical})>".format(
            horizontal=self.horizontal, vertical=self.vertical
        )

    def serialized(self):
        """Returns a tuple of the useful information regarding this object
        """
        return self.horizontal, self.vertical

    @classmethod
    def from_horizontal_and_vertical_align(cls, text_align=None,
                                           display_align=None):
        """Build an Alignment from the textual alignment values.

        :param text_align: 'left', 'start', 'center', 'right' or 'end'
        :param display_align: 'before', 'center' or 'after'
        :return: an Alignment, or None when neither value is recognized
        """
        horizontal_names = (
            ('left', HorizontalAlignmentEnum.LEFT),
            ('start', HorizontalAlignmentEnum.START),
            ('center', HorizontalAlignmentEnum.CENTER),
            ('right', HorizontalAlignmentEnum.RIGHT),
            ('end', HorizontalAlignmentEnum.END),
        )
        vertical_names = (
            ('before', VerticalAlignmentEnum.TOP),
            ('center', VerticalAlignmentEnum.CENTER),
            ('after', VerticalAlignmentEnum.BOTTOM),
        )

        horizontal_obj = None
        for name, member in horizontal_names:
            if text_align == name:
                horizontal_obj = member

        vertical_obj = None
        for name, member in vertical_names:
            if display_align == name:
                vertical_obj = member

        if not any([horizontal_obj, vertical_obj]):
            return None
        return cls(horizontal_obj, vertical_obj)
class TwoDimensionalObject(object):
    """Base class giving its subclasses a constructor that parses a
    two-valued xml attribute.
    """

    # TODO - highly cachable. Should use WeakValueDictionary here to return
    # flyweights, not new objects.
    @classmethod
    def from_xml_attribute(cls, attribute):
        """Instantiate the class from two space-separated sizes.

        Each size has the form "4px" or "5%": a number concatenated with a
        measuring unit (member of UnitEnum).

        :type attribute: unicode
        """
        first, second = six.text_type(attribute).split(' ')
        return cls(Size.from_string(first), Size.from_string(second))
class Stretch(TwoDimensionalObject):
    """Used for specifying the extent of a rectangle (how much it stretches),
    or the padding in a rectangle (how much space should be left empty until
    text can be displayed)
    """

    def __init__(self, horizontal, vertical):
        """Use the .from_xxx methods. They know what's best for you.

        :type horizontal: Size
        :type vertical: Size
        :raises ValueError: if either argument is not a Size
        """
        for parameter in [horizontal, vertical]:
            if not isinstance(parameter, Size):
                raise ValueError("Stretch must be initialized with two valid "
                                 "Size objects.")
        self.horizontal = horizontal
        self.vertical = vertical

    def is_measured_in(self, measure_unit):
        """Whether the stretch is only measured in the provided units

        :param measure_unit: a UnitEnum member
        :return: True/False
        """
        return (
            self.horizontal.unit == measure_unit and
            self.vertical.unit == measure_unit
        )

    def __repr__(self):
        return '<Stretch ({horizontal}, {vertical})>'.format(
            horizontal=self.horizontal, vertical=self.vertical
        )

    def serialized(self):
        """Returns a tuple of the useful attributes of this object"""
        return (
            None if not self.horizontal else self.horizontal.serialized(),
            None if not self.vertical else self.vertical.serialized()
        )

    def __eq__(self, other):
        # bool(): a bare ``other and ...`` hands back ``other`` itself
        # (e.g. None) instead of False when ``other`` is falsy.
        return bool(
            other and
            type(self) == type(other) and
            self.horizontal == other.horizontal and
            self.vertical == other.vertical
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(
            hash(self.horizontal) * 59 +
            hash(self.vertical) * 61 +
            67
        )

    def __bool__(self):
        return True if self.horizontal or self.vertical else False

    def to_xml_attribute(self, **kwargs):
        """Returns a unicode representation of this object as an xml attribute
        """
        return '{horizontal} {vertical}'.format(
            horizontal=self.horizontal.to_xml_attribute(),
            vertical=self.vertical.to_xml_attribute()
        )

    def is_relative(self):
        """
        Returns True if all dimensions are expressed as percentages,
        False otherwise.
        """
        is_relative = True
        if self.horizontal:
            is_relative &= self.horizontal.is_relative()
        if self.vertical:
            is_relative &= self.vertical.is_relative()
        return is_relative

    def as_percentage_of(self, video_width, video_height):
        """
        Converts absolute units (e.g. px, pt etc) to percentage

        The horizontal size is taken relative to ``video_width``, the
        vertical one relative to ``video_height``.
        """
        return Stretch(
            self.horizontal.as_percentage_of(video_width=video_width),
            self.vertical.as_percentage_of(video_height=video_height)
        )
class Region(object):
    """Represents the spatial coordinates of a rectangle

    Don't instantiate by hand. use Region.from_points or Region.from_extent
    """

    @classmethod
    def from_points(cls, p1, p2):
        """Create a rectangle, knowing 2 points on the plane.
        We assume that p1 is in the upper left (closer to the origin)

        :param p1: Point instance
        :param p2: Point instance
        :return: a Region instance
        """
        inst = cls()
        inst._p1 = p1
        inst._p2 = p2
        return inst

    @classmethod
    def from_extent(cls, extent, origin):
        """Create a rectangle, knowing its upper left origin, and
        spatial extension

        :type extent: Stretch
        :type origin: Point
        :return: a Region instance
        """
        inst = cls()
        inst._extent = extent
        inst._origin = origin
        return inst

    @property
    def extent(self):
        """How wide this rectangle stretches (horizontally and vertically)
        """
        if hasattr(self, '_extent'):
            return self._extent
        # Built from two points: their difference is the extent
        return self._p1 - self._p2

    @property
    def origin(self):
        """Out of its 4 points, returns the one closest to the origin
        """
        if hasattr(self, '_origin'):
            return self._origin
        return Point.align_from_origin(self._p1, self._p2)[0]

    upper_left_point = origin

    @property
    def lower_right_point(self):
        """The point furthest from the origin from the rectangle's 4 points
        """
        if hasattr(self, '_p2'):
            return Point.align_from_origin(self._p1, self._p2)[1]
        # Point offers ``add_stretch``; the ``add_extent`` called here
        # before does not exist and raised AttributeError.
        return self.origin.add_stretch(self.extent)

    def __eq__(self, other):
        # bool(): a bare ``other and ...`` hands back ``other`` itself
        # (e.g. None) instead of False when ``other`` is falsy.
        return bool(
            other and
            type(self) == type(other) and
            self.extent == other.extent and
            self.origin == other.origin
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(
            hash(self.origin) * 71 +
            hash(self.extent) * 73 +
            79
        )
class Point(TwoDimensionalObject):
    """Represent a point in 2d space.
    """

    def __init__(self, x, y):
        """
        :type x: Size
        :type y: Size
        :raises ValueError: if either coordinate is not a Size
        """
        for parameter in [x, y]:
            if not isinstance(parameter, Size):
                raise ValueError("Point must be initialized with two valid "
                                 "Size objects.")
        self.x = x
        self.y = y

    def __sub__(self, other):
        """Returns an Stretch object, if the other point's units are compatible
        """
        return Stretch(abs(self.x - other.x), abs(self.y - other.y))

    def add_stretch(self, stretch):
        """Returns another Point instance, whose coordinates are the sum of the
        current Point's, and the Stretch instance's.
        """
        return Point(self.x + stretch.horizontal, self.y + stretch.vertical)

    def is_relative(self):
        """
        Returns True if all dimensions are expressed as percentages,
        False otherwise.
        """
        is_relative = True
        if self.x:
            is_relative &= self.x.is_relative()
        if self.y:
            is_relative &= self.y.is_relative()
        return is_relative

    def as_percentage_of(self, video_width, video_height):
        """
        Converts absolute units (e.g. px, pt etc) to percentage

        ``x`` is taken relative to ``video_width``, ``y`` relative to
        ``video_height``.
        """
        return Point(
            self.x.as_percentage_of(video_width=video_width),
            self.y.as_percentage_of(video_height=video_height)
        )

    @classmethod
    def align_from_origin(cls, p1, p2):
        """Returns a tuple of 2 points. The first is closest to the origin
        on both axes than the second.

        If the 2 points fulfill this condition, returns them (ordered), if not,
        creates 2 new points.

        :rtype: tuple
        """
        # Always return a pair: callers index the result with [0] / [1].
        # Only ``<`` is used for the comparisons (``min``/``max`` also work
        # with it alone), as it is the ordering operator Size defines.
        if not (p2.x < p1.x) and not (p2.y < p1.y):
            return p1, p2
        if not (p1.x < p2.x) and not (p1.y < p2.y):
            return p2, p1
        return (Point(min(p1.x, p2.x), min(p1.y, p2.y)),
                Point(max(p1.x, p2.x), max(p1.y, p2.y)))

    def __repr__(self):
        return '<Point ({x}, {y})>'.format(
            x=self.x, y=self.y
        )

    def serialized(self):
        """Returns the "useful" values of this object.
        """
        return (
            None if not self.x else self.x.serialized(),
            None if not self.y else self.y.serialized()
        )

    def __eq__(self, other):
        # bool(): a bare ``other and ...`` hands back ``other`` itself
        # (e.g. None) instead of False when ``other`` is falsy.
        return bool(
            other and
            type(self) == type(other) and
            self.x == other.x and
            self.y == other.y
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(
            hash(self.x) * 51 +
            hash(self.y) * 53 +
            57
        )

    def __bool__(self):
        return True if self.x or self.y else False

    def to_xml_attribute(self, **kwargs):
        """Returns a unicode representation of this object as an xml attribute
        """
        return '{x} {y}'.format(
            x=self.x.to_xml_attribute(), y=self.y.to_xml_attribute())
@six.python_2_unicode_compatible
class Size(object):
    """Ties together a number with a unit, to represent a size.

    Use as value objects! (don't change after creation)
    """

    def __init__(self, value, unit):
        """
        :param value: A number (float or int will do)
        :param unit: A UnitEnum member
        :raises ValueError: if value is None or unit is not a UnitEnum member
        """
        if value is None:
            raise ValueError("Size must be initialized with a value.")
        if not isinstance(unit, UnitEnum):
            raise ValueError("Size must be initialized with a valid unit.")

        self.value = float(value)
        self.unit = unit

    def __sub__(self, other):
        if self.unit == other.unit:
            return Size(self.value - other.value, self.unit)
        else:
            raise ValueError("The sizes should have the same measure units.")

    def __abs__(self):
        return Size(abs(self.value), self.unit)

    def __cmp__(self, other):
        # Only consulted by Python 2
        if self.unit == other.unit:
            # python3 does not have cmp
            return (self.value > other.value) - (self.value < other.value)
        else:
            raise ValueError("The sizes should have the same measure units.")

    def __lt__(self, other):
        return self.value < other.value

    # Python 3 ignores __cmp__, so without the three methods below ``<=``
    # and ``>=`` raised TypeError. They mirror __lt__, which compares the
    # raw values without checking the units.
    def __le__(self, other):
        return self.value <= other.value

    def __gt__(self, other):
        return self.value > other.value

    def __ge__(self, other):
        return self.value >= other.value

    def __add__(self, other):
        if self.unit == other.unit:
            return Size(self.value + other.value, self.unit)
        else:
            raise ValueError("The sizes should have the same measure units.")

    def is_relative(self):
        """
        Returns True if value is expressed as percentage, False otherwise.
        """
        return self.unit == UnitEnum.PERCENT

    def as_percentage_of(self, video_width=None, video_height=None):
        """Convert this size to a percentage of exactly one video dimension.

        :param video_width: An integer representing a width in pixels
        :param video_height: An integer representing a height in pixels
        :raises RelativizationError: unless exactly one of the two
            references is given (not checked for sizes already in percent)
        :rtype: Size
        """
        value = self.value
        unit = self.unit

        if unit == UnitEnum.PERCENT:
            return self  # Nothing to do here

        # The input must be valid so that any conversion can be done
        if not (video_width or video_height):
            raise RelativizationError(
                "Either video width or height must be given as a reference")
        elif video_width and video_height:
            raise RelativizationError(
                "Only video width or height can be given as reference")

        if unit == UnitEnum.EM:
            # TODO: Implement proper conversion of em in function of font-size
            # The em unit is relative to the font-size, to which we currently
            # have no access. As a workaround, we presume the font-size is
            # 16px, which is a common default value but not guaranteed.
            value *= 16
            unit = UnitEnum.PIXEL

        if unit == UnitEnum.PT:
            # XXX: we will convert first to "px" and from "px" this will be
            # converted to percent. we don't take into consideration the
            # font-size
            value = value / 72.0 * 96.0
            unit = UnitEnum.PIXEL

        if unit == UnitEnum.PIXEL:
            value = value * 100.0 / (video_width or video_height)
            unit = UnitEnum.PERCENT

        if unit == UnitEnum.CELL:
            # TODO: Implement proper cell resolution
            # (w3.org/TR/ttaf1-dfxp/#parameter-attribute-cellResolution)
            # For now we will use the default values (32 columns and 15 rows)
            cell_reference = 32 if video_width else 15
            value = value * 100.0 / cell_reference
            unit = UnitEnum.PERCENT

        return Size(value, unit)

    # TODO - this also looks highly cachable. Should use a WeakValueDict here
    # to return flyweights
    @classmethod
    def from_string(cls, string):
        """Given a string of the form "46px" or "5%" etc., returns the proper
        size object

        :param string: a number concatenated to any of the UnitEnum members.
        :type string: unicode
        :raises ValueError: if the unit or the number is not recognized
        :rtype: Size
        """
        raw_number = string
        for unit in list(UnitEnum):
            if raw_number.endswith(unit.value):
                # Cut off exactly the suffix; str.rstrip would treat the
                # unit as a set of characters to strip.
                raw_number = raw_number[:-len(unit.value)]
                break
        else:
            unit = None

        if unit is not None:
            value = None
            try:
                # Falls back to the float when the text is not an integer
                value = float(raw_number)
                value = int(raw_number)
            except ValueError:
                pass

            if value is None:
                raise ValueError(
                    """Couldn't recognize the value "{value}" as a number"""
                    .format(value=raw_number)
                )
            instance = cls(value, unit)
            return instance
        else:
            raise ValueError(
                "The specified value is not valid because its unit "
                "is not recognized: {value}. "
                "The only supported units are: {supported}"
                .format(value=raw_number, supported=', '.join(UnitEnum._member_map_))
            )

    def __repr__(self):
        return '<Size ({value} {unit})>'.format(
            value=self.value, unit=self.unit.value
        )

    def __str__(self):
        # At most two decimals, without trailing zeros, followed by the unit
        value = round(self.value, 2)
        if value.is_integer():
            s = "{}".format(int(value))
        else:
            s = "{:.2f}".format(value).rstrip('0').rstrip('.')
        return "{}{}".format(s, self.unit.value)

    def to_xml_attribute(self, **kwargs):
        """Returns a unicode representation of this object, as an xml attribute
        """
        return six.text_type(self)

    def serialized(self):
        """Returns the "useful" values of this object"""
        return self.value, self.unit

    def __eq__(self, other):
        # bool(): a bare ``other and ...`` hands back ``other`` itself
        # (e.g. None) instead of False when ``other`` is falsy.
        return bool(
            other and
            type(self) == type(other) and
            self.value == other.value and
            self.unit == other.unit
        )

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(
            hash(self.value) * 41 +
            hash(self.unit) * 43 +
            47
        )

    def __bool__(self):
        return self.unit in UnitEnum and self.value is not None
class Padding(object):
    """Represents padding information. Consists of 4 Size objects, representing
    padding from (in this order): before (up), after (down), start (left) and
    end (right).

    A valid Padding object must always have all paddings set and different from
    None. If this is not true Writers may fail for they rely on this assumption.
    """

    def __init__(self, before=None, after=None, start=None, end=None):
        """
        Any argument that is not a Size instance (including None) is replaced
        by a default padding of 0%.

        :type before: Size
        :type after: Size
        :type start: Size
        :type end: Size
        """
        self.before = before  # top
        self.after = after  # bottom
        self.start = start  # left
        self.end = end  # right

        for attr in ['before', 'after', 'start', 'end']:
            # Ensure that a Padding object always explicitly defines all
            # four possible paddings
            if not isinstance(getattr(self, attr), Size):
                # Sets default padding (0%)
                setattr(self, attr, Size(0, UnitEnum.PERCENT))

    @classmethod
    def from_xml_attribute(cls, attribute):
        """As per the docs, the style attribute can contain 1,2,3 or 4 values.

        If 1 value: apply to all edges
        If 2: first applies to before and after, second to start and end
        If 3: first applies to before, second to start and end, third to after
        If 4: before, end, after, start;

        http://www.w3.org/TR/ttaf1-dfxp/#style-attribute-padding

        :param attribute: a string like object, representing a dfxp attr. value
        :return: a Padding object
        :raises ValueError: if the attribute does not hold 1 to 4 values
        """
        # split() without a separator collapses runs of whitespace and drops
        # leading/trailing blanks, so "1px  2px" or " 1px " never produce
        # empty tokens that would be handed to Size.from_string.
        sizes = [
            Size.from_string(value)
            for value in six.text_type(attribute).split()
        ]

        if len(sizes) == 1:
            return cls(sizes[0], sizes[0], sizes[0], sizes[0])
        elif len(sizes) == 2:
            return cls(sizes[0], sizes[0], sizes[1], sizes[1])
        elif len(sizes) == 3:
            return cls(sizes[0], sizes[2], sizes[1], sizes[1])
        elif len(sizes) == 4:
            return cls(sizes[0], sizes[2], sizes[3], sizes[1])
        else:
            raise ValueError('The provided value "{value}" could not be '
                             "parsed into the a padding. Check out "
                             "http://www.w3.org/TR/ttaf1-dfxp/"
                             "#style-attribute-padding for the definition "
                             "and examples".format(value=attribute))

    def __repr__(self):
        return (
            "<Padding (before: {before}, after: {after}, start: {start}, "
            "end: {end})>".format(
                before=self.before, after=self.after, start=self.start,
                end=self.end
            )
        )

    def serialized(self):
        """Returns a tuple containing the useful values of this object, in
        the order (before, after, start, end). Falsy members become None.
        """
        return (
            None if not self.before else self.before.serialized(),
            None if not self.after else self.after.serialized(),
            None if not self.start else self.start.serialized(),
            None if not self.end else self.end.serialized()
        )

    def __eq__(self, other):
        """Two paddings are equal when they have the exact same type and all
        four sides compare equal. Always returns a real bool.
        """
        return bool(
            type(self) is type(other) and
            self.before == other.before and
            self.after == other.after and
            self.start == other.start and
            self.end == other.end
        )

    def __ne__(self, other):
        # Defined explicitly (like Layout does) so that != stays the exact
        # negation of == on every interpreter.
        return not self == other

    def __hash__(self):
        return hash(
            hash(self.before) * 19 +
            hash(self.after) * 23 +
            hash(self.start) * 29 +
            hash(self.end) * 31 +
            37
        )

    def to_xml_attribute(
            self, attribute_order=('before', 'end', 'after', 'start'),
            **kwargs):
        """Returns a unicode representation of this object as an xml attribute

        TODO - should extend the attribute_order tuple to contain 4 tuples,
        so we can reduce the output length to 3, 2 or 1 element.

        Names in attribute_order that are not attributes of this object are
        skipped silently.

        :type attribute_order: tuple
        :param attribute_order: the order that the attributes should be
            serialized
        :raises ValueError: if one of the requested paddings has no
            to_xml_attribute method (e.g. it was set to None)
        """
        try:
            string_list = []
            for attrib in attribute_order:
                if hasattr(self, attrib):
                    string_list.append(
                        getattr(self, attrib).to_xml_attribute())
        except AttributeError:
            # A Padding object with attributes set to None is considered
            # invalid. All four possible paddings must be set. If one of them
            # is not, this error is raised.
            raise ValueError("The attribute order specified is invalid.")

        return ' '.join(string_list)

    def as_percentage_of(self, video_width, video_height):
        """Returns a new Padding whose vertical sides (before/after) are
        converted against video_height and whose horizontal sides (start/end)
        are converted against video_width.
        """
        return Padding(
            self.before.as_percentage_of(video_height=video_height),
            self.after.as_percentage_of(video_height=video_height),
            self.start.as_percentage_of(video_width=video_width),
            self.end.as_percentage_of(video_width=video_width)
        )

    def is_relative(self):
        """Returns a truthy value only if every truthy side reports itself
        as relative; sides that are falsy are ignored.
        """
        is_relative = True
        if self.before:
            is_relative &= self.before.is_relative()
        if self.after:
            is_relative &= self.after.is_relative()
        if self.start:
            is_relative &= self.start.is_relative()
        if self.end:
            is_relative &= self.end.is_relative()
        return is_relative
class Layout(object):
    """Should encapsulate all the information needed to determine (as correctly
    as possible) the layout (positioning) of elements on the screen.

    Inheritance of this property, from the CaptionSet to its children is
    specific for each caption type.
    """

    def __init__(self, origin=None, extent=None, padding=None, alignment=None,
                 webvtt_positioning=None, inherit_from=None):
        """
        :type origin: Point
        :param origin: The point on the screen which is the top left vertex
            of a rectangular region where the captions should be placed

        :type extent: Stretch
        :param extent: The width and height of the rectangle where the caption
            should be placed on the screen.

        :type padding: Padding
        :param padding: The padding of the text inside the region described
            by the origin and the extent

        :type alignment: Alignment

        :type webvtt_positioning: unicode
        :param webvtt_positioning: A string with the raw WebVTT cue settings.
            This is used so that WebVTT positioning isn't lost on conversion
            from WebVTT to WebVTT. It is needed only because pycaption
            currently doesn't support reading positioning from WebVTT.

        :type inherit_from: Layout
        :param inherit_from: A Layout with the positioning parameters to be
            used if not specified by the positioning arguments,
        """
        self.origin = origin
        self.extent = extent
        self.padding = padding
        self.alignment = alignment
        self.webvtt_positioning = webvtt_positioning

        if inherit_from:
            # Only falsy attributes are filled in from the parent;
            # webvtt_positioning is never inherited.
            for attr_name in ['origin', 'extent', 'padding', 'alignment']:
                attr = getattr(self, attr_name)
                if not attr:
                    setattr(self, attr_name, getattr(inherit_from, attr_name))

    def __bool__(self):
        """A Layout is truthy as soon as any of its settings is truthy."""
        return any([
            self.origin, self.extent, self.padding, self.alignment,
            self.webvtt_positioning
        ])

    # Python 2 looks up __nonzero__ for truth testing; alias it so that
    # "if layout:" gives the same answer there.
    __nonzero__ = __bool__

    def __repr__(self):
        return (
            "<Layout (origin: {origin}, extent: {extent}, "
            "padding: {padding}, alignment: {alignment})>".format(
                origin=self.origin, extent=self.extent, padding=self.padding,
                alignment=self.alignment
            )
        )

    def serialized(self):
        """Returns nested tuple containing the "useful" values of this object,
        in the order (origin, extent, padding, alignment). Falsy members
        become None.
        """
        return (
            None if not self.origin else self.origin.serialized(),
            None if not self.extent else self.extent.serialized(),
            None if not self.padding else self.padding.serialized(),
            None if not self.alignment else self.alignment.serialized()
        )

    def __eq__(self, other):
        # webvtt_positioning deliberately takes no part in equality
        # (nor in __hash__ below).
        return (
            type(self) == type(other) and
            self.origin == other.origin and
            self.extent == other.extent and
            self.padding == other.padding and
            self.alignment == other.alignment
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(
            hash(self.origin) * 7
            + hash(self.extent) * 11
            + hash(self.padding) * 13
            + hash(self.alignment) * 5
            + 17
        )

    def is_relative(self):
        """
        Returns True if all positioning values are expressed as percentages,
        False otherwise.
        """
        is_relative = True
        if self.origin:
            is_relative &= self.origin.is_relative()
        if self.extent:
            is_relative &= self.extent.is_relative()
        if self.padding:
            is_relative &= self.padding.is_relative()
        return is_relative

    def as_percentage_of(self, video_width, video_height):
        """Returns a new Layout whose origin, extent and padding (when set)
        are converted through their own as_percentage_of methods. The
        alignment is carried over unchanged.

        :param video_width: forwarded as first argument to each conversion
        :param video_height: forwarded as second argument to each conversion
        :rtype: Layout
        """
        params = {'alignment': self.alignment}
        # We don't need to preserve webvtt_positioning on Layout
        # transformations because, if it is set, the WebVTT writer
        # returns as soon as it's found and the transformations are
        # never triggered.
        for attr_name in ['origin', 'extent', 'padding']:
            attr = getattr(self, attr_name)
            if attr:
                params[attr_name] = attr.as_percentage_of(video_width,
                                                          video_height)
        return Layout(**params)

    def fit_to_screen(self):
        """
        If extent is not set or if origin + extent > 100%, (re)calculate it
        based on origin. It is a pycaption fix for caption files that are
        technically valid but contain inconsistent settings that may cause
        long captions to be cut out of the screen.

        ATTENTION: This must be called on relativized objects (such as the one
        returned by as_percentage_of). All units are presumed to be percentages.

        :returns: a new Layout with the corrected extent, or self when no
            origin is set
        :raises ValueError: if origin + extent is not expressed in
            percentages on both axes
        """
        if not self.origin:
            return self

        # Calculated values to be used if replacement is needed
        diff_horizontal = Size(100 - self.origin.x.value, UnitEnum.PERCENT)
        diff_vertical = Size(100 - self.origin.y.value, UnitEnum.PERCENT)

        if not self.extent:
            # Extent is not set, use the calculated values
            new_extent = Stretch(diff_horizontal, diff_vertical)
        else:
            # Extent is set but may have inconsistent values,
            # e.g. origin="35% 25%" extent="80% 80%", which would cause
            # captions to end horizontally at 115% and vertically at 105%,
            # which would result in them being cut out of the screen.
            # In this case, the horizontal and vertical values are
            # corrected so that origin + extent = 100%.
            bottom_right = self.origin.add_stretch(self.extent)

            # Both coordinates must be percentages. (The vertical check used
            # to re-test the x unit, so an absolute y unit went unnoticed.)
            if (bottom_right.x.unit != UnitEnum.PERCENT
                    or bottom_right.y.unit != UnitEnum.PERCENT):
                raise ValueError("Units must be relativized before extent "
                                 "can be calculated based on origin.")

            new_horizontal = self.extent.horizontal
            new_vertical = self.extent.vertical
            # If extent is set but it's inconsistent, replace with
            # calculated values
            if bottom_right.x.value > 100:
                new_horizontal = diff_horizontal
            if bottom_right.y.value > 100:
                new_vertical = diff_vertical

            new_extent = Stretch(new_horizontal, new_vertical)

        return Layout(
            origin=self.origin,
            extent=new_extent,
            padding=self.padding,
            alignment=self.alignment
            # We don't need to preserve webvtt_positioning on Layout
            # transformations because, if it is set, the WebVTT writer
            # returns as soon as it's found and the transformations are
            # never triggered.
        )
+805
View File
@@ -0,0 +1,805 @@
"""
The classes in this module handle SAMI reading and writing. It supports several
CSS attributes, some of which are handled as positioning settings (and applied
to Layout objects) and others as simple styling (applied to legacy style nodes).
The following attributes are handled as positioning:
'text-align' # Converted to Alignment
'margin-top'
'margin-right'
'margin-bottom'
'margin-left'
OBS:
* Margins are converted to Padding
* Margins defined inline are not supported
TODO: Add support for inline margins
Any other CSS the BeautifulSoup library manages to parse is handled as simple
styling and applied to style nodes. However, apparently only these are actually
used by writers on conversion:
'font-family'
'font-size'
'font-style'
'color'
OBS:
* Other parameters are preserved, but not if they're specified inline.
TODO:
Make this less confusing. Confirm whether these really are the only
supported styling attributes and make it more clear, perhaps by listing
them in constants in the beginning of the file and using them to filter
out unneeded attributes either everywhere in the code or not at all, but
most importantly regardless of whether they're defined inline or not,
because this is irrelevant.
"""
import re
import six
from logging import FATAL
from collections import deque
from copy import deepcopy
from future.backports.html.parser import HTMLParseError
from html.parser import HTMLParser
from html.entities import name2codepoint
from xml.sax.saxutils import escape
from bs4 import BeautifulSoup, NavigableString
from cssutils import parseString, log, css as cssutils_css
from .base import (
BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
DEFAULT_LANGUAGE_CODE)
from .exceptions import (
CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError)
from .geometry import Layout, Alignment, Padding, Size
# Change cssutils' default logging: raise the threshold of its logger to
# FATAL so that lower-severity messages are not emitted.
log.setLevel(FATAL)
# Skeleton SAMI document. SAMIWriter.write parses it and then fills in the
# <style> element and the <body>.
SAMI_BASE_MARKUP = '''
<sami>
<head>
<style type="text/css"/>
</head>
<body/>
</sami>'''
class SAMIReader(BaseReader):
    """Reads a SAMI document (unicode string) into a CaptionSet."""

    # Skips leading line breaks plus the indentation that follows them and
    # captures the text up to the next line break. Raw string, compiled once.
    _TEXT_PATTERN = re.compile(r"^(?:[\n\r]+\s*)?(.+)")

    def __init__(self, *args, **kw):
        super(SAMIReader, self).__init__(*args, **kw)
        # Nodes of the caption currently being translated
        self.line = []
        # First inline text-align found in the caption being translated
        self.first_alignment = None

    def detect(self, content):
        """Returns True if the content contains a "<sami" tag (any case)."""
        return '<sami' in content.lower()

    def read(self, content):
        """Parses the SAMI content and returns the resulting CaptionSet.

        :param content: unicode string with the SAMI document
        :rtype: CaptionSet
        :raises InvalidInputError: if content is not a unicode string
        :raises CaptionReadNoCaptions: if no caption could be extracted
        """
        if not isinstance(content, six.text_type):
            raise InvalidInputError('The content is not a unicode string.')

        content, doc_styles, doc_langs = (
            self._get_sami_parser_class()().feed(content))
        sami_soup = self._get_xml_parser_class()(content)

        # Get the global layout that applies to all <p> tags
        global_layout = self._build_layout(doc_styles.get('p', {}))

        caption_dict = {}
        for language in doc_langs:
            lang_layout = None
            for target, styling in list(doc_styles.items()):
                if target not in ['p', 'sync', 'span']:
                    if styling.get('lang', None) == language:
                        lang_layout = self._build_layout(
                            doc_styles.get(target, {}),
                            inherit_from=global_layout
                        )
                        break
            lang_layout = lang_layout or global_layout
            lang_captions = self._translate_lang(
                language, sami_soup, lang_layout)

            caption_dict[language] = lang_captions

        caption_set = CaptionSet(
            caption_dict,
            layout_info=global_layout
        )

        # Convert styles from CSS to internal representation.
        # _translate_parsed_style updates each styling dict in place, which
        # is why doc_styles itself can be handed over afterwards.
        for styling in list(doc_styles.values()):
            self._translate_parsed_style(styling)

        caption_set.set_styles(doc_styles)

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    @staticmethod
    def _get_sami_parser_class():
        """Hook method for providing custom SAMIParser classes"""
        return SAMIParser

    @staticmethod
    def _get_xml_parser_class():
        """Hook method for providing a custom XML parser class"""
        return BeautifulSoup

    def _build_layout(self, styles, inherit_from=None):
        """
        Builds a Layout from the 'text-align' and margin rules in styles.

        :type styles: dict
        :param styles: a dictionary with CSS-like styling rules

        :type inherit_from: Layout
        :param inherit_from: The Layout with values to be used in case the
            positioning settings in the styles parameter don't specify
            something.
        """
        alignment = Alignment.from_horizontal_and_vertical_align(
            text_align=styles.get('text-align')
        )
        return self._get_layout_class()(
            origin=None,
            extent=None,
            padding=self._get_padding(styles),
            alignment=alignment,
            inherit_from=inherit_from
        )

    @staticmethod
    def _get_layout_class():
        """Hook method for providing a custom Layout class"""
        return Layout

    def _get_padding(self, styles):
        """Converts the margin-* rules into a Padding, or returns None when
        none of the four margins yields a truthy size.
        """
        margin_before = self._get_size(styles, 'margin-top')
        margin_after = self._get_size(styles, 'margin-bottom')
        margin_start = self._get_size(styles, 'margin-left')
        margin_end = self._get_size(styles, 'margin-right')
        if not any([margin_before, margin_after, margin_start, margin_end]):
            return None
        return Padding(
            before=margin_before,  # top
            after=margin_after,  # bottom
            start=margin_start,  # left
            end=margin_end  # right
        )

    def _get_size(self, styles, style_label):
        """Returns the Size parsed from styles[style_label], or None when the
        rule is missing or empty.
        """
        value_from_style = styles.get(style_label, None)
        if not value_from_style:
            return None
        return Size.from_string(value_from_style)

    def _translate_lang(self, language, sami_soup, parent_layout):
        """
        For a given language, translate the SAMI XML to internal list of
        captions.

        :rtype: list
        """
        captions = CaptionList(layout_info=parent_layout)
        milliseconds = 0
        # Resolve the layout hook once; it is used for the caption and for
        # each of its nodes.
        layout_class = self._get_layout_class()

        for p in sami_soup.select('p[lang|=%s]' % language):
            milliseconds = int(float(p.parent['start']))
            start = milliseconds * 1000
            end = 0

            # A caption that was never explicitly ended stops where the next
            # <p> of this language starts.
            if captions != [] and captions[-1].end == 0:
                captions[-1].end = milliseconds * 1000

            if p.get_text().strip():
                self.first_alignment = None
                styles = self._translate_attrs(p)
                layout_info = self._build_layout(styles,
                                                 inherit_from=parent_layout)
                self.line = []

                self._translate_tag(p, layout_info)
                caption_layout = layout_class(
                    alignment=self.first_alignment,
                    inherit_from=layout_info
                )
                for node in self.line:
                    node.layout_info = layout_class(
                        alignment=self.first_alignment,
                        inherit_from=node.layout_info
                    )
                self.first_alignment = None

                caption = Caption(start, end, self.line, styles, caption_layout)
                captions.append(caption)

        if captions and captions[-1].end == 0:
            # Arbitrarily make this last 4 seconds. Not ideal...
            captions[-1].end = (milliseconds + 4000) * 1000

        return captions

    def _get_style_name_from_tag(self, tag):
        """Maps the tag names 'i', 'b' and 'u' to internal style names.

        :raises RuntimeError: for any other tag name
        """
        if tag == 'i':
            return 'italics'
        elif tag == 'b':
            return 'bold'
        elif tag == 'u':
            return 'underline'
        else:
            raise RuntimeError("Unknown style tag")

    def _translate_tag(self, tag, inherit_from=None):
        """
        Appends to self.line the nodes equivalent to the given tag and,
        recursively, to its children.

        :param inherit_from: A Layout object extracted from an ancestor tag
                to be attached to leaf nodes
        """
        # convert text
        if isinstance(tag, NavigableString):
            # BeautifulSoup apparently handles unescaping character codes
            # (e.g. &amp;) automatically. The following variable, therefore,
            # should contain a plain unicode string.
            # strips indentation whitespace only
            result = self._TEXT_PATTERN.search(tag)
            if not result:
                return
            tag_text = result.groups()[0]
            self.line.append(CaptionNode.create_text(tag_text, inherit_from))
        # convert line breaks
        elif tag.name == 'br':
            self.line.append(CaptionNode.create_break(inherit_from))
        # convert italics, bold, and underline
        elif tag.name == 'i' or tag.name == 'b' or tag.name == 'u':
            style_name = self._get_style_name_from_tag(tag.name)
            self.line.append(
                CaptionNode.create_style(True, {style_name: True})
            )
            # recursively call function for any children elements
            for a in tag.contents:
                self._translate_tag(a, inherit_from)
            self.line.append(
                CaptionNode.create_style(False, {style_name: True}))
        elif tag.name == 'span':
            self._translate_span(tag, inherit_from)
        else:
            # recursively call function for any children elements
            for a in tag.contents:
                self._translate_tag(a, inherit_from)

    def _translate_span(self, tag, inherit_from=None):
        """Translates a span: when it carries attributes its children are
        wrapped between an opening and a closing style node, otherwise the
        children are translated as if the span was not there.
        """
        # convert tag attributes
        args = self._translate_attrs(tag)
        # only include span tag if attributes returned
        if args:
            layout_info = self._build_layout(args, inherit_from)
            # OLD: Create legacy style node
            # NEW: But pass new layout object
            node = CaptionNode.create_style(True, args, layout_info)
            self.line.append(node)
            # recursively call function for any children elements
            for a in tag.contents:
                # NEW: Pass the layout along so that it's eventually attached
                # to leaf nodes (e.g. text or break)
                self._translate_tag(a, layout_info)
            node = CaptionNode.create_style(False, args, layout_info)
            self.line.append(node)
        else:
            for a in tag.contents:
                self._translate_tag(a, inherit_from)

    def _translate_attrs(self, tag):
        """Builds the internal style dict from the tag's class, id and
        inline style attributes. An id, when present, overrides the class.
        """
        attrs = {}
        css_attrs = tag.attrs

        if 'class' in css_attrs:
            attrs['class'] = css_attrs['class'][0].lower()
        if 'id' in css_attrs:
            attrs['class'] = css_attrs['id'].lower()
        if 'style' in css_attrs:
            styles = css_attrs['style'].split(';')
            attrs.update(self._translate_style(attrs, styles))

        return attrs

    # convert attributes from inline CSS
    def _translate_style(self, attrs, styles):
        """Applies inline CSS declarations to attrs and returns it.

        :param attrs: dict updated in place with the translated rules
        :param styles: list of "property:value" strings; entries that do not
            split into exactly two parts on ':' are ignored
        """
        for style in styles:
            parts = style.split(':')
            if len(parts) != 2:
                continue
            css_property, value = parts
            # Declarations are usually separated by "; ", which leaves
            # whitespace in front of the property name; without stripping it
            # every declaration after the first one would be ignored.
            css_property = css_property.strip()

            if css_property == 'text-align':
                self._save_first_alignment(value.strip())
            else:
                self._translate_css_property(attrs, css_property, value)

        return attrs

    def _translate_parsed_style(self, styles):
        """Adds the internal equivalents of the parsed stylesheet rules to
        the styles dict itself (in place) and returns it.
        """
        # Keep unknown styles by default
        attrs = styles
        for css_property in list(styles.keys()):
            value = styles[css_property]
            self._translate_css_property(attrs, css_property, value)

        return attrs

    def _translate_css_property(self, attrs, css_property, value):
        """Stores in attrs the internal representation of one CSS rule;
        properties that are not listed below are left alone.
        """
        if css_property == 'font-family':
            attrs['font-family'] = value.strip()
        elif css_property == 'font-size':
            attrs['font-size'] = value.strip()
        elif css_property == 'font-style' and value.strip() == 'italic':
            attrs['italics'] = True
        elif css_property == 'text-decoration' and value.strip() == 'underline':
            attrs['underline'] = True
        elif css_property == 'font-weight' and value.strip() == 'bold':
            attrs['bold'] = True
        elif css_property == 'lang':
            attrs['lang'] = value.strip()
        elif css_property == 'color':
            attrs['color'] = value.strip()

    def _save_first_alignment(self, align):
        """
        Unlike the other inline CSS attributes parsed in _translate_styles, the
        'text-align' setting must be applied to a Layout and not to a style
        because it affects positioning. This Layout must be assigned to the
        Caption object, and not a Node, because it doesn't make sense to have
        spans in the same caption with different alignments. Even though the
        SAMI format seems to in principle accept it, pycaption normalizes to
        something it can make sense of internally and convert to other formats.

        If there are multiple elements (span, div, etc) in the same line with
        different alignments, only the first alignment is taken into account.

        If the root element of the caption (sync's first child) has an inline
        text-align, it is preserved and any children alignment is ignored.

        :param align: A unicode string representing a CSS text-align value
        """
        if not self.first_alignment:
            self.first_alignment = Alignment.from_horizontal_and_vertical_align(  # noqa
                text_align=align
            )
class SAMIWriter(BaseWriter):
    """Writes a CaptionSet as a SAMI document."""

    def __init__(self, *args, **kwargs):
        super(SAMIWriter, self).__init__(*args, **kwargs)
        # True while a <span> opened by _recreate_span awaits its </span>
        self.open_span = False
        # End time (caption.end / 1000) of the previously written caption
        self.last_time = None

    def write(self, caption_set):
        """Returns the SAMI document for the given caption set as a string.

        The caption set is deep-copied first, so the caller's object is not
        modified by the layout transformations applied here.

        :type caption_set: CaptionSet
        """
        caption_set = deepcopy(caption_set)
        sami = BeautifulSoup(SAMI_BASE_MARKUP, "lxml-xml")

        caption_set.layout_info = self._relativize_and_fit_to_screen(
            caption_set.layout_info)

        primary = None

        for lang in caption_set.get_languages():
            self.last_time = None
            # The first language returned is treated as the primary one
            if primary is None:
                primary = lang

            caption_set.set_layout_info(
                lang,
                self._relativize_and_fit_to_screen(
                    caption_set.get_layout_info(lang))
            )

            for caption in caption_set.get_captions(lang):
                # Loop through all captions/nodes and apply transformations to
                # layout in function of the provided or default settings
                caption.layout_info = self._relativize_and_fit_to_screen(
                    caption.layout_info)
                for node in caption.nodes:
                    node.layout_info = self._relativize_and_fit_to_screen(
                        node.layout_info)
                sami = self._recreate_p_tag(
                    caption, sami, lang, primary, caption_set)

        stylesheet = self._recreate_stylesheet(caption_set)
        sami.find('style').append(stylesheet)

        # The first line of the prettified output is dropped
        a = sami.prettify(formatter=None).split('\n')
        caption_content = '\n'.join(a[1:])

        return caption_content

    def _recreate_p_tag(self, caption, sami, lang, primary, captions):
        """
        Creates a p tag for the given caption, attach it to the sami object
        and return it.

        :type caption: Caption
        :type sami: BeautifulSoup
        :type lang: unicode
        :type primary: unicode
        :type captions: CaptionSet

        :rtype: BeautifulSoup
        """
        time = caption.start / 1000

        # A gap between the previous caption's end and this one's start is
        # filled with a blank tag.
        if self.last_time and time != self.last_time:
            sami = self._recreate_blank_tag(
                sami, caption, lang, primary, captions)

        self.last_time = caption.end / 1000

        sami, sync = self._recreate_sync(sami, lang, primary, time)

        p = sami.new_tag("p")

        p_style = ''
        for attr, value in list(self._recreate_style(caption.style).items()):
            p_style += '%s:%s;' % (attr, value)
        if p_style:
            # NOTE(review): the inline rules are stored under a 'p_style'
            # attribute rather than 'style'; kept as-is since existing
            # output may rely on it - confirm.
            p['p_style'] = p_style

        p['class'] = self._recreate_p_lang(caption, lang, captions)
        p.string = self._recreate_text(caption.nodes)

        sync.append(p)

        return sami

    def _recreate_sync(self, sami, lang, primary, time):
        """
        Creates a sync tag for a given language and timing (if it doesn't
        already exist), attach it to the sami body and return the sami
        BeautifulSoup object.

        :type sami: BeautifulSoup
        :type lang: unicode
        :type primary: unicode
        :type time: int

        :rtype: BeautifulSoup
        """
        if lang == primary:
            sync = sami.new_tag("sync", start="%d" % time)
            sami.body.append(sync)
        else:
            sync = sami.find("sync", start="%d" % time)
            if sync is None:
                sami, sync = self._find_closest_sync(sami, time)

        return sami, sync

    def _find_closest_sync(self, sami, time):
        """Creates a sync tag for the given time and inserts it right after
        the last sync that starts earlier or, failing that, right before the
        first sync that starts later. When the document has neither, the new
        tag is appended to the body.

        :returns: tuple (sami, sync)
        """
        sync = sami.new_tag("sync", start="%d" % time)

        earlier = sami.find_all("sync", start=lambda x: int(x) < time)
        if earlier:
            last_sync = earlier[-1]
            last_sync.insert_after(sync)
        else:
            def later_syncs(start):
                return int(start) > time
            later = sami.find_all("sync", start=later_syncs)
            if later:
                last_sync = later[0]
                last_sync.insert_before(sync)
            else:
                # No neighbour to anchor to: attach the tag anyway, otherwise
                # it would stay detached and its caption would be lost.
                sami.body.append(sync)
        return sami, sync

    def _recreate_blank_tag(self, sami, caption, lang, primary, captions):
        """Adds, at self.last_time, a sync holding a p tag whose only
        content is '&nbsp;'.
        """
        sami, sync = self._recreate_sync(sami, lang, primary, self.last_time)

        p = sami.new_tag("p")
        p['class'] = self._recreate_p_lang(caption, lang, captions)
        p.string = '&nbsp;'

        sync.append(p)

        return sami

    def _recreate_p_lang(self, caption, lang, captions):
        """Returns the caption's own class when the style registered under
        that class defines a 'lang' rule; otherwise returns lang.
        """
        try:
            if 'lang' in captions.get_style(caption.style['class']):
                return caption.style['class']
        except KeyError:
            pass
        return lang

    def _recreate_stylesheet(self, caption_set):
        """Returns the stylesheet text (wrapped in '<!--' and ' -->') with
        one block per non-empty style plus one block for every language not
        yet mentioned as 'lang: <code>'.
        """
        stylesheet = '<!--'

        for attr, value in caption_set.get_styles():
            if value != {}:
                stylesheet += self._recreate_style_block(
                    attr, value, caption_set.layout_info)

        for lang in caption_set.get_languages():
            lang_string = 'lang: {}'.format(lang)
            if lang_string not in stylesheet:
                stylesheet += self._recreate_style_block(
                    lang, {'lang': lang}, caption_set.get_layout_info(lang))

        return stylesheet + ' -->'

    def _recreate_style_block(self, target, rules, layout_info):
        """
        :param target: A unicode string representing the target of the styling
            rules.
        :param rules: A dictionary with CSS-like styling rules. It is not
            modified.
        :param layout_info: A Layout object providing positioning information
            to be converted to CSS
        """
        if target not in ['p', 'sync', 'span']:
            # If it's not a valid SAMI element, then it's a custom class name
            selector = '.{}'.format(target)
        else:
            selector = target

        sami_style = '\n {} {{\n '.format(selector)

        # Work on a copy so that the margins added below do not leak into
        # the dictionary owned by the caller.
        rules = dict(rules)
        if layout_info and layout_info.padding:
            rules.update({
                'margin-top': six.text_type(layout_info.padding.before),
                'margin-right': six.text_type(layout_info.padding.end),
                'margin-bottom': six.text_type(layout_info.padding.after),
                'margin-left': six.text_type(layout_info.padding.start),
            })

        for attr, value in sorted(self._recreate_style(rules).items()):
            sami_style += ' {}: {};\n '.format(attr, value)

        return sami_style + '}\n'

    def _recreate_text(self, caption):
        """Returns the markup for the given iterable of caption nodes."""
        line = ''

        for node in caption:
            if node.type_ == CaptionNode.TEXT:
                line += self._encode(node.content) + ' '
            elif node.type_ == CaptionNode.BREAK:
                line = line.rstrip() + '<br/>\n '
            elif node.type_ == CaptionNode.STYLE:
                line = self._recreate_line_style(line, node)

        return line.rstrip()

    def _recreate_line_style(self, line, node):
        """Closes the span that is currently open (if any) and, for an
        opening style node, starts a new one.
        """
        if node.start:
            if self.open_span:
                line = line.rstrip() + '</span> '
            line = self._recreate_span(line, node.content)
        else:
            if self.open_span:
                line = line.rstrip() + '</span> '
                self.open_span = False

        return line

    def _recreate_span(self, line, content):
        """Appends an opening span tag carrying the class and/or inline
        style found in content; appends nothing when there is neither.
        """
        style = ''
        klass = ''
        if 'class' in content:
            klass += ' class="%s"' % content['class']

        for attr, value in list(self._recreate_style(content).items()):
            style += '%s:%s;' % (attr, value)

        if style or klass:
            if style:
                style = ' style="%s"' % style
            line += '<span%s%s>' % (klass, style)
            self.open_span = True

        return line

    def _recreate_style(self, rules):
        """
        :param rules: A dictionary with CSS-like styling rules
        :returns: a new dict where the internal italics/bold/underline flags
            are turned back into CSS rules; any other entry is copied as is
        """
        sami_style = {}

        for key, value in list(rules.items()):
            # Recreate original CSS rules from internal style.
            # "== True" (rather than "is True") is kept on purpose so that
            # values that merely compare equal to True still match.
            if key == 'italics' and value == True:  # noqa: E712
                sami_style['font-style'] = 'italic'
            elif key == 'bold' and value == True:  # noqa: E712
                sami_style['font-weight'] = 'bold'
            elif key == 'underline' and value == True:  # noqa: E712
                sami_style['text-decoration'] = 'underline'
            else:
                sami_style[key] = value

        return sami_style

    def _encode(self, s):
        """
        Encodes plain unicode string to proper SAMI file escaping special
        characters in case they appear in the string.

        :type s: unicode
        """
        return escape(s)
class SAMIParser(HTMLParser):
    """HTMLParser subclass that rewrites a SAMI document into well-nested
    markup (accumulated in self.sami) while collecting the stylesheet rules
    and the caption languages it finds.
    """

    def __init__(self, *args, **kw):
        HTMLParser.__init__(self, *args, **kw)
        self.sami = ''  # rewritten markup
        self.line = ''
        self.styles = {}  # parsed stylesheet, keyed by lower-cased selector
        self.queue = deque()  # currently open tags, innermost last
        self.langs = set()  # languages found on <p> tags
        self.last_element = ''  # last start tag seen; '' after data/entity
        self.name2codepoint = name2codepoint.copy()
        self.name2codepoint['apos'] = 0x0027
        # Keep entity/char references flowing through handle_entityref and
        # handle_charref instead of being converted by the base class.
        self.convert_charrefs = False

    def handle_starttag(self, tag, attrs):
        """
        Override the parser's handling of starttags

        :param tag: unicode string indicating the tag type (e.g. "head" or "p")
        :param attrs: list of attribute tuples of type (u'name', u'value')
        """
        self.last_element = tag

        # treat divs as spans
        if tag == 'div':
            tag = 'span'

        # figure out the caption language of P tags
        if tag == 'p':
            lang = self._find_lang(attrs)

            # if no language detected, set it as the default
            lang = lang or DEFAULT_LANGUAGE_CODE
            attrs.append(('lang', lang))
            self.langs.add(lang)

        # clean-up line breaks
        if tag == 'br':
            self.sami += "<br/>"
        # add tag to queue
        else:
            # if already in queue, first close tags off in LIFO order
            while tag in self.queue:
                closer = self.queue.pop()
                self.sami += "</%s>" % closer
            # open new tag in queue
            self.queue.append(tag)
            # add tag with attributes
            for attr, value in attrs:
                tag += ' %s="%s"' % (attr.lower(), value)
            self.sami += "<%s>" % tag

    # override the parser's handling of endtags
    def handle_endtag(self, tag):
        """Closes every tag opened since (and including) the matching start
        tag; does nothing if no such tag is open.
        """
        # treat divs as spans
        if tag == 'div':
            tag = 'span'

        # handle incorrectly formatted sync/p tags
        if tag in ['p', 'sync'] and tag == self.last_element:
            return

        # close off tags in LIFO order, if matching starting tag in queue
        while tag in self.queue:
            closing_tag = self.queue.pop()
            self.sami += "</%s>" % closing_tag

    def handle_entityref(self, name):
        """Keeps &gt; and &lt; escaped, replaces any other known entity with
        its character and re-emits unknown ones as '&name'.
        """
        if name in ['gt', 'lt']:
            self.sami += '&%s;' % name
        else:
            try:
                self.sami += chr(self.name2codepoint[name])
            except (KeyError, ValueError):
                self.sami += '&%s' % name

        self.last_element = ''

    def handle_charref(self, name):
        """Replaces a numeric character reference with its character.

        :param name: the reference without '&#' and ';', e.g. '65', 'x41'
            or 'X41'
        """
        try:
            # The hexadecimal prefix may be written in either case
            if name[0] in ('x', 'X'):
                self.sami += chr(int(name[1:], 16))
            else:
                self.sami += chr(int(name))
        except (ValueError, OverflowError):
            # Not a number or not a valid code point: keep the reference as
            # text instead of aborting the whole parse.
            self.sami += '&#%s;' % name

    # override the parser's handling of data
    def handle_data(self, data):
        self.sami += data
        self.last_element = ''

    # override the parser's feed function
    def feed(self, data):
        """
        :param data: Raw SAMI unicode string
        :returns: tuple (unicode, dict, set)
        :raises CaptionReadSyntaxError: if the data looks like an HTML file,
            contains the "no closed captioning available" notice or cannot
            be parsed
        """
        no_cc = 'no closed captioning available'

        if '<html' in data.lower():
            raise CaptionReadSyntaxError(
                'SAMI File seems to be an HTML file.')
        elif no_cc in data.lower():
            raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)

        # try to find style tag in SAMI
        try:
            # prevent BS4 error with huge SAMI files with unclosed tags
            index = data.lower().find("</head>")
            # find() returns -1 when there is no </head>; slicing with it
            # would silently drop the last character, so use the whole text.
            head = data if index == -1 else data[:index]

            self.styles = self._css_parse(
                BeautifulSoup(head, "lxml").find('style').get_text())
        except AttributeError:
            # No <style> tag found
            self.styles = {}

        # fix erroneous italics tags
        data = data.replace('<i/>', '<i>')

        # fix awkward tags found in some SAMIs
        data = data.replace(';>', '>')
        try:
            HTMLParser.feed(self, data)
        except HTMLParseError as e:
            raise CaptionReadSyntaxError(e)

        # close any tags that remain in the queue
        while self.queue:
            closing_tag = self.queue.pop()
            self.sami += "</%s>" % closing_tag

        return self.sami, self.styles, self.langs

    # parse the SAMI's stylesheet
    def _css_parse(self, css):
        """
        Parse styling via cssutils modules

        :param css: the stylesheet text
        :rtype: dict
        :returns: {selector: {property: value}}, with selectors lower-cased
            and stripped of a leading '#' or '.'
        """
        sheet = parseString(css)
        style_sheet = {}

        for rule in sheet:
            # Rules without a selector (presumably comments and at-rules -
            # TODO confirm against cssutils) are skipped; letting them raise
            # AttributeError would make feed() discard every style.
            selector = getattr(rule, 'selectorText', None)
            if not selector:
                continue

            new_style = {}
            selector = selector.lower()
            if selector[0] in ['#', '.']:
                selector = selector[1:]
            # keep any style attributes that are needed
            for prop in rule.style:
                if prop.name == 'color':
                    cv = cssutils_css.ColorValue(prop.value)
                    # Code for RGB to hex conversion comes from
                    # http://bit.ly/1kwfBnQ
                    new_style['color'] = "#%02x%02x%02x" % (
                        cv.red, cv.green, cv.blue)
                else:
                    new_style[prop.name] = prop.value
            if new_style:
                style_sheet[selector] = new_style

        return style_sheet

    def _find_lang(self, attrs):
        """Returns the language of a tag given its attribute tuples, or None.

        A 'lang' attribute yields its first two characters; a 'class'
        attribute yields the 'lang' rule of the matching stylesheet entry.
        """
        for attr, value in attrs:
            # if lang is an attribute of the tag
            if attr.lower() == 'lang':
                return value[:2]

            # if the P tag has a class, try and find the language
            if attr.lower() == 'class':
                try:
                    return self.styles[value.lower()]['lang']
                except KeyError:
                    pass

        return None
+696
View File
@@ -0,0 +1,696 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
3 types of SCC captions:
Roll-Up
Paint-On
Pop-On
Commands:
94ae - [ENM] - Erase Non-displayed(buffer) Memory
942c - [EDM] - Erase Displayed Memory
9420 - [RCL] - Resume Caption Loading
9429 - [RDC] - Resume Direct Captioning
9425, 9426, 94a7 - [RU2], [RU3], [RU4] (roll up captions 2,3 or 4 rows)
- these commands set the number of expected lines
94ad - (in CEA-608-E: 142d) - [CR] carriage return.
- This actually rolls the captions up as many rows as specified by
[RU2], [RU3], or [RU4]
80 - no-op char. Doesn't do anything, but must be used with other
characters, to make a 2 byte word
97a1, 97a2, 9723 - [TO] move 1, 2 or 3 columns - Tab Over command
- this moves the positioning 1, 2, or 3 columns to the right
- Nothing regarding this is implemented.
942f - [EOC] - display the buffer on the screen - End Of Caption
... - [PAC] - Preamble address code (can set positioning and style)
- All the PACs are specified by the first and second byte combined
from pycaption.scc.constants.PAC_BYTES_TO_POSITIONING_MAP
9429 - [RDC] - Resume Direct Captioning
94a4 - (in CEA-608-E: 1424) - [DER] Delete to End of Row
Pop-On:
The commands should usually appear in this order. Not strict though, and
the commands don't necessarily have to be on the same row.
1. 94ae [ENM] (erase non displayed memory)
2. 9420 [RCL] (resume caption loading => this command here means we're using Pop-On captions)
2.1? [ENM] - if step 1 was skipped?
3. [PAC] Positioning/ styling command (can position on columns divisible by 4)
The control code is called a Preamble Address Code [PAC].
4. If positioning needs to be on columns not divisible by 4, use a [TO] command
5. text
6. 942c [EDM] - optionally, erase the currently displayed caption
7. 942f [EOC] display the caption
Roll-Up:
1. [RU2], [RU3] or [RU4] - sets Roll-Up style and depth
- these set the Roll-Up style: (characteristic command)
2. [CR] to roll the display up 1 row...lol?
3. [PAC] - sets the indent of the base row
Paint-On:
1. [RDC] - sets the Paint-On style (characteristic command)
2. [PAC]
3. text
4. [PAC]
5. text or [DER]
There are some rules regarding the parity of the commands.
This resource:
http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML
specifies that there are interpreters which only work if the commands have an
odd parity. This however is not consistent, and we might not handle well
these cases. Odd parity of a command means that converting the word into
binary should result in an odd number of '1's. The PAC commands obey this
rule, but some do not. Some commands that do not are found in the COMMANDS
dictionary. This is legacy logic, that I didn't know how to handle, and
just carried over when implementing positioning.
"""
import re
import math
import textwrap
from copy import deepcopy
import six
from pycaption.base import (
BaseReader, BaseWriter, CaptionSet, CaptionNode,
)
from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError
from .constants import (
HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS,
MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
)
from .specialized_collections import (
TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
InstructionNodeCreator)
from .state_machines import DefaultProvidingPositionTracker
class NodeCreatorFactory(object):
    """Builds node creators that all receive the same position tracker.

    The creators need to share state that outlives any single instance, but
    keeping that state at class level is not an option either, because it
    has to be discarded once the reader's .read() operation completes. The
    factory therefore holds the shared tracker and hands it to every creator
    it builds.
    """

    def __init__(self, position_tracker,
                 node_creator=InstructionNodeCreator):
        """
        :param position_tracker: object passed to every creator as its
            ``position_tracker`` keyword argument
        :param node_creator: class used to build the creators; it must accept
            a ``position_tracker`` keyword and provide a ``from_list`` method
        """
        self.node_creator = node_creator
        self.position_tracker = position_tracker

    def new_creator(self):
        """Returns a fresh instance of self.node_creator, initialized with
        the shared position_tracker
        """
        tracker = self.position_tracker
        return self.node_creator(position_tracker=tracker)

    def from_list(self, roll_rows):
        """Delegates to the node_creator's method with the same name

        :param roll_rows: list of node_creator instances
        :return: a node_creator instance
        """
        combine = self.node_creator.from_list
        return combine(roll_rows, position_tracker=self.position_tracker)
def get_corrected_end_time(caption):
    """Return the caption's end time, falling back to start + 4 seconds.

    Used for the last caption, which may never have been explicitly ended;
    any falsy ``end`` (None, 0) is treated as "not ended".

    :param Caption caption: the last caption
    :return: ``caption.end`` when it is truthy, otherwise ``caption.start``
        plus 4 seconds expressed in microseconds
    """
    fallback_duration = 4 * 1000 * 1000  # 4 seconds, in microseconds
    return caption.end or caption.start + fallback_duration
class SCCReader(BaseReader):
    """Converts a given unicode string to a CaptionSet.

    This can be then later used for converting into any other supported formats

    The reader keeps one node buffer per caption style ('pop', 'paint' and
    'roll') in ``self.buffer_dict``; ``self.buffer`` always refers to the
    active one. Finished buffers are turned into captions through
    ``self.caption_stash``.
    """

    # Splits a line into (timestamp, whitespace, code words, last char).
    # Compiled once here instead of once per translated line.
    _LINE_PATTERN = re.compile(r"([0-9:;]*)([\s\t]*)((.)*)")

    # Roll-Up style commands and the number of rows each one sets
    _ROLL_UP_DEPTH = {'9425': 2, '9426': 3, '94a7': 4}

    def __init__(self, *args, **kw):
        self.caption_stash = CaptionCreator()
        self.time_translator = _SccTimeTranslator()

        self.node_creator_factory = NodeCreatorFactory(
            DefaultProvidingPositionTracker()
        )

        # Last command/special char seen, used to drop doubled code words
        self.last_command = ''

        self.buffer_dict = NotifyingDict()

        self.buffer_dict['pop'] = self.node_creator_factory.new_creator()
        self.buffer_dict['paint'] = self.node_creator_factory.new_creator()
        self.buffer_dict['roll'] = self.node_creator_factory.new_creator()

        # Call this method when the active key changes
        self.buffer_dict.add_change_observer(self._flush_implicit_buffers)
        self.buffer_dict.set_active('pop')

        self.roll_rows = []
        self.roll_rows_expected = 0
        self.simulate_roll_up = False

        # Start time used for the caption currently being built
        self.time = 0

    def detect(self, content):
        """Checks whether the given content is a proper SCC file

        Only the first line is inspected; empty content is reported as
        "not SCC" instead of raising IndexError.

        :type content: unicode
        :rtype: bool
        """
        lines = content.splitlines()
        return bool(lines) and lines[0] == HEADER

    def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
        """Converts the unicode string into a CaptionSet

        :type content: six.text_type
        :param content: The SCC content to be converted to a CaptionSet

        :type lang: six.text_type
        :param lang: The language of the caption

        :type simulate_roll_up: bool
        :param simulate_roll_up: If True, when converting to other formats,
            the resulting captions will contain all the rows that were visible
            on the screen when the captions were rolling up.

        :type offset: int
        :param offset: number of seconds to subtract from every calculated
            time (it is converted to microseconds for the time translator)

        :rtype: CaptionSet

        :raises InvalidInputError: if content is not a unicode string
        :raises ValueError: if a caption is shorter than 0.05 seconds
        :raises CaptionReadNoCaptions: if no caption could be read
        """
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the text type
        if not isinstance(content, six.text_type):
            raise InvalidInputError('The content is not a unicode string.')

        self.simulate_roll_up = simulate_roll_up
        self.time_translator.offset = offset * 1000000

        # split lines
        lines = content.splitlines()

        # loop through each line except the first (the header)
        for line in lines[1:]:
            self._translate_line(line)

        # Whatever is still waiting in an implicit buffer becomes a caption
        self._flush_implicit_buffers()

        captions = CaptionSet({lang: self.caption_stash.get_all()})

        # check captions for incorrect lengths
        for cap in captions.get_captions(lang):
            # if there's an end time on a caption and the difference is
            # less than .05s reject the input (this is likely caused by a
            # standalone EOC marker in the SCC file)
            if 0 < cap.end - cap.start < 50000:
                raise ValueError('unsupported length found in SCC input file: ' + str(cap))

        if captions.is_empty():
            raise CaptionReadNoCaptions("empty caption file")
        else:
            last_caption = captions.get_captions(lang)[-1]
            last_caption.end = get_corrected_end_time(last_caption)

        return captions

    def _fix_last_timing(self, timing):
        """HACK HACK: Certain Paint-On captions don't specify the 942f [EOC]
        (End Of Caption) command on the same line.
        If this is a 942f line, also simulate a 942c (Erase Displayed Memory)
        to properly set the timing on the last caption.

        This method needs some serious attention, because it proves the timing
        calculation is not done well for Pop-On captions

        :param timing: the timestamp found at the start of the current line
        """
        # Calculate the end time from the current line
        time_translator = _SccTimeTranslator()
        time_translator.start_at(timing)
        time_translator.offset = self.time_translator.offset

        # But use the current time translator for the start time
        self.caption_stash.create_and_store(
            self.buffer, self.time_translator.get_time())

        self.caption_stash.correct_last_timing(time_translator.get_time())

        # Use new_creator() like every other buffer reset in this class, so
        # the replacement buffer receives the shared position tracker
        # (calling node_creator() directly built one without it).
        self.buffer = self.node_creator_factory.new_creator()

    def _flush_implicit_buffers(self, old_key=None, *args):
        """Convert to Captions those buffers whose behavior is implicit.

        The Pop-On buffer is explicit. New captions are created from it
        with the command 'End Of Caption' [EOC], '942f'

        The other 2 buffers, Roll-Up and Paint-On we treat as "more" implicit,
        meaning that they can be displayed by a command on the next row.
        If they're on the last row however, or if the caption type is changing,
        we make sure to convert the buffers to text, so we don't lose any info.

        :param old_key: the buffer key that was active before the change, or
            None when called directly at the end of .read()
        """
        if old_key == 'pop':
            return

        elif old_key is None or old_key == 'roll':
            if not self.buffer.is_empty():
                self._roll_up()

        # NOTE: old_key None never reaches this branch; it is consumed by the
        # branch above, which flushes whichever buffer is currently active.
        elif old_key == 'paint':
            # xxx - perhaps the self.buffer property is sufficient
            if not self.buffer_dict['paint'].is_empty():
                self.caption_stash.create_and_store(
                    self.buffer_dict['paint'], self.time)

    def _translate_line(self, line):
        """Translate one SCC line: a timestamp followed by 4-character words

        :param line: a single line of the SCC content
        """
        # ignore blank lines
        if line.strip() == '':
            return

        # split line in timestamp and words; every group may match the empty
        # string, so the pattern always matches at the start of the line
        match = self._LINE_PATTERN.match(line.lower())
        timestamp, words = match.group(1), match.group(3)

        # XXX!!!!!! THESE 2 LINES ARE A HACK
        if words.strip() == '942f':
            self._fix_last_timing(timing=timestamp)

        self.time_translator.start_at(timestamp)

        # loop through each word
        for word in words.split(' '):
            # ignore empty results
            if word.strip() != '':
                self._translate_word(word)

    def _translate_word(self, word):
        """Dispatch a single code word to the matching handler

        :param word: one code word from the line
        """
        # count frames for timing
        self.time_translator.increment_frames()

        # first check if word is a command
        # TODO - check that all the positioning commands are here, or use
        # some other strategy to determine if the word is a command.
        if word in COMMANDS or _is_pac_command(word):
            self._translate_command(word)

        # second, check if word is a special character
        elif word in SPECIAL_CHARS:
            self._translate_special_char(word)

        elif word in EXTENDED_CHARS:
            self._translate_extended_char(word)

        # third, try to convert word into 2 characters
        else:
            self._translate_characters(word)

    def _handle_double_command(self, word):
        """Report whether the word repeats the previous command

        :return: True if the word equals the last remembered command (the
            memory is then cleared, so a third occurrence counts as new);
            False otherwise, after remembering the word
        :rtype: bool
        """
        # ensure we don't accidentally use the same command twice
        if word == self.last_command:
            self.last_command = ''
            return True
        else:
            self.last_command = word
            return False

    def _translate_special_char(self, word):
        """Add the special character encoded by the word to the buffer"""
        # XXX - this looks highly buggy. Why should special chars be ignored
        # when printed 2 times one after another?
        if self._handle_double_command(word):
            return

        self.buffer.add_chars(SPECIAL_CHARS[word])

    def _translate_extended_char(self, word):
        """Add the extended character encoded by the word to the buffer"""
        # XXX - this looks highly buggy. Why would a special char be ignored
        # if it's printed 2 times one after another?
        if self._handle_double_command(word):
            return

        # add to buffer
        self.buffer.add_chars(EXTENDED_CHARS[word])

    def _translate_command(self, word):
        """Apply a control command to the reader state

        Buffer-switching and display commands are handled here; any other
        command is forwarded to the active buffer.

        :param word: the command code word
        """
        if self._handle_double_command(word):
            return

        # if command is pop_up
        if word == '9420':
            self.buffer_dict.set_active('pop')

        # command is paint_on [Resume Direct Captioning]
        elif word == '9429':
            self.buffer_dict.set_active('paint')

            self.roll_rows_expected = 1
            if not self.buffer.is_empty():
                self.caption_stash.create_and_store(
                    self.buffer, self.time
                )
                self.buffer = self.node_creator_factory.new_creator()

            self.time = self.time_translator.get_time()

        # if command is roll_up 2, 3 or 4 rows
        elif word in self._ROLL_UP_DEPTH:
            self.buffer_dict.set_active('roll')

            # count how many lines are expected
            self.roll_rows_expected = self._ROLL_UP_DEPTH[word]

            # if content is in the queue, turn it into a caption
            if not self.buffer.is_empty():
                self.caption_stash.create_and_store(
                    self.buffer, self.time)
                self.buffer = self.node_creator_factory.new_creator()

            # set rows to empty, configure start time for caption
            self.roll_rows = []
            self.time = self.time_translator.get_time()

        # clear pop_on buffer
        elif word == '94ae':
            self.buffer = self.node_creator_factory.new_creator()

        # display pop_on buffer [End Of Caption]
        elif word == '942f':
            self.time = self.time_translator.get_time()
            self.caption_stash.create_and_store(self.buffer, self.time)
            self.buffer = self.node_creator_factory.new_creator()

        # roll up captions [Carriage Return]
        elif word == '94ad':
            # display roll-up buffer
            if not self.buffer.is_empty():
                self._roll_up()

        # clear screen
        elif word == '942c':
            self.roll_rows = []

            # XXX - The 942c command has nothing to do with paint-ons
            # This however is legacy code, and will break lots of tests if
            # the proper buffer (self.buffer) is used.
            # Most likely using `self.buffer` instead of the paint buffer
            # is the right thing to do, but this needs some further attention.
            if not self.buffer_dict['paint'].is_empty():
                self.caption_stash.create_and_store(
                    self.buffer_dict['paint'], self.time)
                self.buffer = self.node_creator_factory.new_creator()

            # attempt to add proper end time to last caption(s)
            self.caption_stash.correct_last_timing(
                self.time_translator.get_time())

        # if command not one of the aforementioned, add to buffer
        else:
            self.buffer.interpret_command(word)

    def _translate_characters(self, word):
        """Add the two plain characters encoded by the word to the buffer

        The word is silently skipped if either half is not a known character.
        """
        # split word into the 2 bytes
        byte1 = word[:2]
        byte2 = word[2:]

        # check to see if the bytes are recognized characters
        if byte1 not in CHARACTERS or byte2 not in CHARACTERS:
            return

        self.buffer.add_chars(CHARACTERS[byte1], CHARACTERS[byte2])

    @property
    def buffer(self):
        """Returns the currently active buffer
        """
        return self.buffer_dict.get_active()

    @buffer.setter
    def buffer(self, value):
        """Sets a new value to the active key

        :param value: any object
        """
        try:
            key = self.buffer_dict.active_key
            self.buffer_dict[key] = value
        except TypeError:
            # Deliberate best effort. NOTE(review): presumably raised when
            # no active key has been set yet and the placeholder key is
            # unhashable - confirm against NotifyingDict.
            pass

    def _roll_up(self):
        """Turn the active buffer into a caption and start a new one

        With simulate_roll_up enabled and more than one row expected, the
        caption is built from the last ``roll_rows_expected`` rows rather
        than from the current row alone.
        """
        # We expect the active buffer to be the roll buffer
        if self.simulate_roll_up:
            if self.roll_rows_expected > 1:
                # Drop the oldest row once the visible window is full
                if len(self.roll_rows) >= self.roll_rows_expected:
                    self.roll_rows.pop(0)

                self.roll_rows.append(self.buffer)
                self.buffer = self.node_creator_factory.from_list(
                    self.roll_rows)

        # convert buffer and empty
        self.caption_stash.create_and_store(self.buffer, self.time)
        self.buffer = self.node_creator_factory.new_creator()

        # configure time
        self.time = self.time_translator.get_time()

        # try to insert the proper ending time for the previous caption
        self.caption_stash.correct_last_timing(self.time, force=True)
class SCCWriter(BaseWriter):
    """Writes a CaptionSet as SCC text.

    Every caption is emitted with the Pop-On command sequence
    (94ae, 9420, positioning + text, 942c, 942f), each command doubled.
    Only the first language of the caption set is written.
    """

    def __init__(self, *args, **kw):
        super(SCCWriter, self).__init__(*args, **kw)

    def write(self, caption_set):
        """Serialize the caption set to an SCC document

        :param caption_set: the CaptionSet to convert; it is deep-copied, so
            the argument itself is left untouched
        :return: the SCC content; only the header if the set is empty
        """
        header = HEADER + '\n\n'

        if caption_set.is_empty():
            return header

        caption_set = deepcopy(caption_set)

        # Only support one language.
        lang = list(caption_set.get_languages())[0]
        captions = caption_set.get_captions(lang)

        # PASS 1: compute codes for each caption
        codes = [(self._text_to_code(caption), caption.start, caption.end)
                 for caption in captions]

        # PASS 2:
        # Advance start times so as to have time to write to the pop-on
        # buffer; possibly remove the previous clear-screen command
        for index, (code, start, end) in enumerate(codes):
            # The first caption keeps its original start time
            if index == 0:
                continue
            # Each code word takes 5 characters of `code`; 8 more words
            # account for the fixed commands written around it in PASS 3
            code_words = len(code) / 5 + 8
            code_time_microseconds = code_words * MICROSECONDS_PER_CODEWORD
            code_start = start - code_time_microseconds
            previous_code, previous_start, previous_end = codes[index-1]
            # Loading this caption would overlap the previous clear-screen:
            # drop that clear-screen (end=None suppresses it in PASS 3)
            if previous_end + 3 * MICROSECONDS_PER_CODEWORD >= code_start:
                codes[index-1] = (previous_code, previous_start, None)
            codes[index] = (code, code_start, end)

        # PASS 3:
        # Write captions. Pieces are collected and joined once, instead of
        # growing one string with repeated concatenation.
        pieces = [header]
        for (code, start, end) in codes:
            pieces.append('%s\t' % self._format_timestamp(start))
            pieces.append('94ae 94ae 9420 9420 ')
            pieces.append(code)
            pieces.append('942c 942c 942f 942f\n\n')
            if end is not None:
                pieces.append(
                    '%s\t942c 942c\n\n' % self._format_timestamp(end))

        return ''.join(pieces)

    # Wrap lines at 32 chars
    @staticmethod
    def _layout_line(caption):
        """Return the caption's text with every line wrapped at 32 characters

        :param caption: object whose ``nodes`` are CaptionNode instances
        :rtype: six.text_type
        """
        def caption_node_to_text(caption_node):
            if caption_node.type_ == CaptionNode.TEXT:
                return six.text_type(caption_node.content)
            elif caption_node.type_ == CaptionNode.BREAK:
                return '\n'
            # Any other node type contributes no text. Falling through used
            # to return None, which made the join below raise TypeError.
            return ''

        caption_text = ''.join(
            [caption_node_to_text(node) for node in caption.nodes])
        inner_lines = caption_text.split('\n')
        inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines]
        return '\n'.join(inner_lines_laid_out)

    @staticmethod
    def _maybe_align(code):
        """Pad a trailing half-word so the code ends on a full word"""
        # Finish a half-word with a no-op so we can move to a full word
        if len(code) % 5 == 2:
            code += '80 '
        return code

    @staticmethod
    def _maybe_space(code):
        """Append the separating space once a full 4-character word is done"""
        if len(code) % 5 == 4:
            code += ' '
        return code

    def _print_character(self, code, char):
        """Append the SCC encoding of one character to the code string

        :param code: the code string built so far
        :param char: the character to encode
        :return: the extended code string
        """
        try:
            char_code = CHARACTER_TO_CODE[char]
        except KeyError:
            try:
                char_code = SPECIAL_OR_EXTENDED_CHAR_TO_CODE[char]
            except KeyError:
                char_code = '91b6'  # Use £ as "unknown character" symbol

        if len(char_code) == 2:
            return code + char_code
        elif len(char_code) == 4:
            # A 4-character code needs a word of its own
            return self._maybe_align(code) + char_code
        else:
            # This should not happen!
            return code

    def _text_to_code(self, s):
        """Encode a caption as a string of space-separated SCC code words

        :param s: the caption to encode
        :return: the code words, each followed by a space
        """
        code = ''
        lines = self._layout_line(s).split('\n')
        for row, line in enumerate(lines):
            # Offset the row so that the last line lands on row 15
            row += 16 - len(lines)
            # Move cursor to column 0 of the destination row
            # (the positioning command is written twice)
            pac = '%s%s ' % (PAC_HIGH_BYTE_BY_ROW[row],
                             PAC_LOW_BYTE_BY_ROW_RESTRICTED[row])
            code += pac * 2
            # Print the line using the SCC encoding
            for char in line:
                code = self._print_character(code, char)
                code = self._maybe_space(code)
            code = self._maybe_align(code)
        return code

    @staticmethod
    def _format_timestamp(microseconds):
        """Format a time in microseconds as an HH:MM:SS:FF timecode

        :param microseconds: the time to format
        :return: the timecode, with a frame field counted at 30 per second
        """
        seconds_float = microseconds / 1000.0 / 1000.0
        # Convert to non-drop-frame timecode
        seconds_float *= 1000.0 / 1001.0
        hours = math.floor(seconds_float / 3600)
        seconds_float -= hours * 3600
        minutes = math.floor(seconds_float / 60)
        seconds_float -= minutes * 60
        seconds = math.floor(seconds_float)
        seconds_float -= seconds
        frames = math.floor(seconds_float * 30)
        return '%02d:%02d:%02d:%02d' % (hours, minutes, seconds, frames)
class _SccTimeTranslator(object):
"""Converts SCC time to microseconds, keeping track of frames passed
"""
def __init__(self):
self._time = '00:00:00;00'
# microseconds. The offset from which we begin the time calculation
self.offset = 0
self._frames = 0
def get_time(self):
"""Returns the time, in microseconds. Takes into account the number of
frames passed, and the offset
:rtype: int
"""
return self._translate_time(
self._time[:-2] + six.text_type(int(self._time[-2:]) + self._frames),
self.offset
)
@staticmethod
def _translate_time(stamp, offset):
"""
:param stamp:
:type offset: int
:param offset: Subtract this many microseconds from the calculated time
Helpful for when the captions are off by some time interval.
:rtype: int
"""
if ';' in stamp:
# Drop-frame timebase runs at the same rate as wall clock
seconds_per_timestamp_second = 1.0
else:
# Non-drop-frame timebase runs "slow"
# 1 second of timecode is longer than an actual second (1.001s)
seconds_per_timestamp_second = 1001.0 / 1000.0
time_split = stamp.replace(';', ':').split(':')
timestamp_seconds = (int(time_split[0]) * 3600 +
int(time_split[1]) * 60 +
int(time_split[2]) +
int(time_split[3]) / 30.0)
seconds = timestamp_seconds * seconds_per_timestamp_second
microseconds = seconds * 1000 * 1000 - offset
if microseconds < 0:
microseconds = 0
return microseconds
def start_at(self, timespec):
"""Reset the counter to the given time
:type timespec: unicode
"""
self._time = timespec
self._frames = 0
def increment_frames(self):
"""After a command was processed, we'd increment the number of frames
"""
self._frames += 1
def _is_pac_command(word):
"""Checks whether the given word is a Preamble Address Code [PAC] command
:type word: unicode
:param word: 4 letter unicode command
:rtype: bool
"""
if not word or len(word) != 4:
return False
byte1, byte2 = word[:2], word[2:]
try:
PAC_BYTES_TO_POSITIONING_MAP[byte1][byte2]
except KeyError:
return False
else:
return True

Some files were not shown because too many files have changed in this diff Show More