# Netflix-videos-downloader/utils/modules/kanji_to_romaji/kanji_to_romaji_module.py
#
# 647 lines
# 24 KiB
# Python

# coding=utf-8
import os
import sys
from collections import OrderedDict
try:
# noinspection PyPackageRequirements
import simplejson as json
except ImportError:
import json
from .models import UnicodeRomajiMapping
from .models import KanjiBlock
from .models import Particle
# directory of this module and of the json mapping tables shipped next to it
PATH_TO_MODULE = os.path.dirname(__file__)
JP_MAPPINGS_PATH = os.path.join(PATH_TO_MODULE, "jp_mappings")

# special marks that need dedicated handling; kept as escapes so that the values
# can not be lost when the file is re-encoded or passed through text tooling
hiragana_iter_mark = "\u309D"  # ゝ repeats the previous hiragana
hiragana_voiced_iter_mark = "\u309E"  # ゞ repeats the previous hiragana, voiced
katakana_iter_mark = "\u30FD"  # ヽ repeats the previous katakana
katakana_voiced_iter_mark = "\u30FE"  # ヾ repeats the previous katakana, voiced
kanji_iteration_mark = "\u3005"  # 々 repeats the previous kanji
hirgana_soukon_unicode_char = "\u3063"  # っ small tsu, doubles the next consonant
katakana_soukon_unicode_char = "\u30C3"  # ッ small tsu, doubles the next consonant
katakana_long_vowel_mark = "\u30FC"  # ー lengthens the previous vowel
def load_kana_mappings_dict():
    """
    merge every json file in JP_MAPPINGS_PATH whose filename does not contain "kanji" in to one dictionary
    files are merged in the order os.listdir returns them; a key loaded later overwrites an earlier one
    :return: dict - kana to romaji mapping
    """
    merged_mapping = {}
    for file_name in os.listdir(JP_MAPPINGS_PATH):
        extension = os.path.splitext(file_name)[1]
        if extension != ".json" or "kanji" in file_name:
            continue
        mapping_path = os.path.join(JP_MAPPINGS_PATH, file_name)
        # utf-8-sig transparently drops a byte order mark if the file has one
        with open(mapping_path, encoding='utf-8-sig') as data_file:
            merged_mapping.update(json.load(data_file))
    return merged_mapping
def load_kanji_mappings_dict():
    """
    build the kanji to romaji mapping from every json file in JP_MAPPINGS_PATH with "kanji" in its filename
    files with "conjugated" in their filename are read last so that their entries get lower priority
    the first entry loaded for a key becomes its main reading
    an entry loaded later for the same key:
        with a different w_type is stored under "other_readings", keyed by that w_type
        with the same w_type replaces the earlier entry
    e.g:
        {u"係り": {
            'w_type': 'noun',
            'romaji': 'kakari',
            'other_readings': {'godan verb stem': 'kakawari'}
        }}
    :return: dict - kanji to romaji mapping
    """
    dir_entries = os.listdir(JP_MAPPINGS_PATH)
    # keep the listing order within each group, conjugated files go to the end
    regular_files = [name for name in dir_entries if "conjugated" not in name]
    conjugated_files = [name for name in dir_entries if "conjugated" in name]

    kanji_romaji_mapping = {}
    for file_name in regular_files + conjugated_files:
        if os.path.splitext(file_name)[1] != ".json" or "kanji" not in file_name:
            continue
        with open(os.path.join(JP_MAPPINGS_PATH, file_name), encoding='utf-8-sig') as data_file:
            loaded_entries = json.load(data_file)
        for kanji, entry in list(loaded_entries.items()):
            if kanji in kanji_romaji_mapping and \
                    entry["w_type"] != kanji_romaji_mapping[kanji]["w_type"]:
                other_readings = kanji_romaji_mapping[kanji].setdefault("other_readings", {})
                other_readings[entry["w_type"]] = entry["romaji"]
            else:
                kanji_romaji_mapping[kanji] = entry
    return kanji_romaji_mapping
def _convert_hira_kata_char(hira_or_kata_char, h_to_k=True):
    """
    shift a single kana character between the hiragana and katakana unicode blocks
    both blocks are laid out in parallel, 0x60 code points apart
    e.g hiragana u3041 -> 0x3041 + 0x60 = 0x30A1 -> katakana u30A1
    :param hira_or_kata_char: single unicode hiragana (h_to_k=True) or katakana (h_to_k=False) character
    :param h_to_k: True converts hiragana to katakana, False converts katakana to hiragana
    :return: converted character as str
    """
    # the previous implementation edited the escaped text of the character, which relied on
    # python 2 byte strings; plain code point arithmetic gives the same result on python 3
    block_offset = 0x60 if h_to_k else -0x60
    return chr(ord(hira_or_kata_char) + block_offset)


def convert_hiragana_to_katakana(hiragana):
    """
    convert every hiragana character, hiragana iteration mark and hiragana soukon to its katakana counterpart
    all other characters are copied unchanged
    :param hiragana: unicode string to convert
    :return: converted unicode string
    """
    convertible_marks = [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char]
    return "".join(
        _convert_hira_kata_char(c) if is_hiragana(c) or c in convertible_marks else c
        for c in hiragana
    )


def convert_katakana_to_hiragana(katakana):
    """
    convert every katakana character, katakana iteration mark and katakana soukon to its hiragana counterpart
    all other characters (including the long vowel mark) are copied unchanged
    :param katakana: unicode string to convert
    :return: converted unicode string
    """
    convertible_marks = [katakana_iter_mark, katakana_voiced_iter_mark, katakana_soukon_unicode_char]
    return "".join(
        _convert_hira_kata_char(c, h_to_k=False) if is_katakana(c) or c in convertible_marks else c
        for c in katakana
    )
def is_hiragana(c):
    """
    check if c is a regular hiragana character (u3041 to u3096)
    the hiragana iteration marks and the soukon are not counted as hiragana
    :param c: character to check
    :return: True if c is hiragana, False otherwise
    """
    excluded_marks = [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char]
    if c in excluded_marks:
        return False
    return "\u3041" <= c <= "\u3096"
def is_katakana(c):
    """
    check if c is a regular katakana character (u30A1 to u30F6)
    the katakana iteration marks, the soukon and the long vowel mark are not counted as katakana
    :param c: character to check
    :return: True if c is katakana, False otherwise
    """
    excluded_marks = [katakana_iter_mark, katakana_voiced_iter_mark,
                      katakana_soukon_unicode_char, katakana_long_vowel_mark]
    if c in excluded_marks:
        return False
    return "\u30A1" <= c <= "\u30F6"
def is_kanji(c):
    """
    check if c is a kanji: either a KanjiBlock or a character in the range u4E00 to u9FD5
    the kanji iteration mark is not counted as kanji
    :param c: character or KanjiBlock to check
    :return: True if c is kanji, False otherwise
    """
    if isinstance(c, KanjiBlock):
        return True
    not_iteration_mark = c != kanji_iteration_mark
    return not_iteration_mark and "\u4E00" <= c <= "\u9FD5"
def get_char_type(c):
    """
    classify the passed character by the unicode range it belongs to
    :param c: kana or kanji character
    :return: "hiragana", "katakana" or "kanji"; None if c is none of them
    """
    # checked in this order, first match wins
    type_checks = (
        ("hiragana", is_hiragana),
        ("katakana", is_katakana),
        ("kanji", is_kanji),
    )
    for type_name, belongs_to_type in type_checks:
        if belongs_to_type(c):
            return type_name
    return None
def translate_particles(kana_list):
    """
    try to find particles which are in hiragana and turn them in to Particle objects
    Particle will provide spacing and will be translated in to appropriate romaji (e.g wa instead of ha for は)
    rules (varies depending on the hiragana char):
        char between two KanjiBlocks(that can be nouns) then assume to be a particle
            e.g: 私は嬉 -> KanjiBlock(私), は, KanjiBlock(嬉) -> は is particle use wa instead of ha
        type(Kanji, Hiragana, Katakana) changes adjacent to the char
            e.g: アパートへくる -> ト, へ, く -> katakana, へ, hiragana -> へ is a particle, use e instead of he
        char is last char and previous char is a noun
            e.g: 会いました友達に -> KanjiBlock(友達) which is a noun, に
    the first item of kana_list is never examined because every rule needs a previous item
    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    def is_noun(k_block):
        return hasattr(k_block, "w_type") and ("noun" in k_block.w_type or "pronoun" in k_block.w_type)

    def type_changes(p, n):
        p_type = get_char_type(p)
        if p_type is None:
            return False
        n_type = get_char_type(n)
        return n_type is not None and p_type != n_type

    def particle_imm_follows(prev_c_, valid_prev_particles):
        """
        check if prev_c is a Particle object
        check that prev_c is one of the valid_prev_particles
        e.g: wa particle can't be followed by wa particle again but ni particle can be followed by wa.
        :param prev_c_: previous character compared to current character in the iteration
        :param valid_prev_particles: list of previous particles that can be followed by current character.
        :return: True if prev_c_ is a Particle and is listed in valid_prev_particles
        """
        return isinstance(prev_c_, Particle) and prev_c_ in valid_prev_particles

    def in_particle_position(prev_c_, next_c_, is_last_char_):
        # rule set shared by every particle except no, which needs a noun on both sides
        return (is_noun(prev_c_) and isinstance(next_c_, KanjiBlock)) or \
            type_changes(prev_c_, next_c_) or \
            (is_noun(prev_c_) and is_last_char_)

    no_hira_char = "\u306E"
    ha_hira_char = "\u306F"
    he_hira_char = "\u3078"
    to_hira_char = "\u3068"
    ni_hira_char = "\u306B"
    de_hira_char = "\u3067"
    mo_hira_char = "\u3082"
    ga_hira_char = "\u304C"

    no_prtcle = Particle("no")
    wa_prtcle = Particle("wa")
    e_prtcle = Particle("e")
    to_prtcle = Particle("to")
    ni_prtcle = Particle("ni")
    de_prtcle = Particle("de")
    mo_prtcle = Particle("mo")
    ga_prtcle = Particle("ga")

    # particles that only use the shared rule set, paired with their hiragana char
    # compared with == one by one so that list items never need to be hashable
    plain_particles = (
        (he_hira_char, e_prtcle),
        (to_hira_char, to_prtcle),
        (ni_hira_char, ni_prtcle),
        (de_hira_char, de_prtcle),
        (ga_hira_char, ga_prtcle),
    )

    for i in range(1, len(kana_list)):
        curr_c = kana_list[i]
        # read from the list on every pass: the previous item may already have become a Particle
        prev_c = kana_list[i - 1]
        is_last_char = i == len(kana_list) - 1
        next_c = "" if is_last_char else kana_list[i + 1]

        if curr_c == no_hira_char:
            if (is_noun(prev_c) and is_noun(next_c)) or \
                    type_changes(prev_c, next_c) or \
                    (is_noun(prev_c) and is_last_char):
                kana_list[i] = no_prtcle

        elif curr_c == ha_hira_char:
            if in_particle_position(prev_c, next_c, is_last_char) or \
                    particle_imm_follows(prev_c, [e_prtcle, to_prtcle, ni_prtcle, de_prtcle]):
                kana_list[i] = wa_prtcle

        elif curr_c == mo_hira_char:
            if in_particle_position(prev_c, next_c, is_last_char) or \
                    particle_imm_follows(prev_c, [ni_prtcle, de_prtcle]):
                kana_list[i] = mo_prtcle

        else:
            # the position rules are only evaluated for the listed characters; the previous
            # un-parenthesised "a and b or c or d" guard also let unrelated characters through
            for hira_char, prtcle in plain_particles:
                if curr_c == hira_char:
                    if in_particle_position(prev_c, next_c, is_last_char):
                        kana_list[i] = prtcle
                    break
def translate_kanji_iteration_mark(kana_list):
    """
    translate kanji_iteration_mark: 々
    the mark is replaced by the stripped romaji of the item right before it
    e.g:
        在々: zaizai
    a mark that has no previous item, or whose previous item has no romaji attribute,
    is left in the list unchanged instead of raising AttributeError
    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    prev_c = ""
    for i, curr_c in enumerate(kana_list):
        if curr_c == kanji_iteration_mark:
            prev_romaji = getattr(prev_c, "romaji", None)
            if prev_romaji is not None:
                kana_list[i] = prev_romaji.strip()
        # remember the item as it is now stored, i.e. after a possible replacement
        prev_c = kana_list[i]
def get_type_if_verb_stem(curr_chars):
    """
    look up the verb type of curr_chars if it has a verb stem reading in kanji_mapping
    main reading is a verb stem: its w_type is returned as is
    verb stem only found in "other_readings": "godan verb" or "ichidan verb" is returned (godan checked first)
    :param curr_chars: kanji chars that must be a key of kanji_mapping
    :return: type of verb stem; None if curr_chars has no verb stem reading
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    if "verb stem" in entry["w_type"]:
        return entry["w_type"]
    if "other_readings" not in entry:
        return None

    other_readings = entry["other_readings"]
    if "godan verb stem" in other_readings:
        return "godan verb"
    if "ichidan verb stem" in other_readings:
        return "ichidan verb"
    return None
def check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len):
    """
    if the given curr_chars has a verb stem reading then try to match it with one of the listed verb endings
    this function only reports the match, it does not modify kana_list
    e.g:
        kana_list = [灯, り, ま, し, た]
            curr_chars = 灯り can be verb stem reading
            灯り + ました matches what follows in kana_list
            returns (ました, "mashita ")
        kana_list = [灯, り, を, 見, ま, す]
            curr_chars = 灯り can be verb stem reading
            no listed ending follows 灯り
            returns (None, None)
    :param kana_list: list of kana characters and KanjiBlock objects being parsed
    :param curr_chars: kanji chars to check, must be a key of kanji_mapping
    :param start_pos: index in kana_list where curr_chars starts
    :param char_len: number of kana_list items that make up curr_chars
    :return: ending kana, ending romaji (with a trailing space); both will be None if ending not found
    """
    # tried in this order, first match wins: an ending must be listed before any ending it starts with
    # the single kana endings are written as escapes so that they can not be lost in transit
    endings = (
        ("ませんでした", "masen deshita"),
        ("ませんで", "masende"),
        ("なさるな", "nasaruna"),
        ("なかった", "nakatta"),
        ("れて", "rete"),
        ("ましょう", "mashou"),
        ("ました", "mashita"),
        ("まして", "mashite"),
        ("ません", "masen"),
        ("ないで", "naide"),
        ("なさい", "nasai"),
        ("ます", "masu"),
        ("よう", "you"),  # ichidan
        ("ない", "nai"),
        ("\u305F", "ta"),  # た ichidan
        ("\u3066", "te"),  # て ichidan
        ("\u308D", "ro"),  # ろ ichidan
        ("\u3046", "u"),  # う
    )

    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    other_readings = entry["other_readings"] if "other_readings" in entry else {}
    has_verb_stem_reading = "verb stem" in entry["w_type"] or \
        "godan verb stem" in other_readings or \
        "ichidan verb stem" in other_readings
    if not has_verb_stem_reading:
        return None, None

    for ending_kana, ending_romaji in endings:
        possible_conj = curr_chars + ending_kana
        actual_conj = "".join(kana_list[start_pos: (start_pos + char_len + len(ending_kana))])
        if possible_conj == actual_conj:
            return ending_kana, ending_romaji + " "
    return None, None
def has_non_verb_stem_reading(curr_chars):
    """
    check if curr_chars has at least one reading that is not a verb stem
    either its main w_type is not a verb stem or one of its "other_readings" is not a verb stem
    :param curr_chars: unicode kanji chars to check, must be a key of kanji_mapping
    :return: True if a non verb stem reading exists, False otherwise
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    if "verb stem" not in entry["w_type"]:
        return True
    if "other_readings" in entry:
        for reading_type in entry["other_readings"]:
            if "verb stem" not in reading_type:
                return True
    return False
def get_verb_stem_romaji(verb_stem_kanji):
    """
    find the verb stem romaji of verb_stem_kanji within kanji_mapping
    the main romaji is used if the main w_type is a verb stem
    otherwise the first entry of "other_readings" whose key contains "verb stem" is used
    :param verb_stem_kanji: unicode verb stem kanji, must be a key of kanji_mapping
    :return: romaji for verb stem kanji; None if it has no verb stem reading
    """
    entry = UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]
    if "verb stem" in entry["w_type"]:
        return entry["romaji"]
    if "other_readings" in entry:
        for reading_type, reading_romaji in entry["other_readings"].items():
            if "verb stem" in reading_type:
                return reading_romaji
    return None
def prepare_kanjiblocks(kchar_list):
    """
    create and replace matched Kanji characters that are within kanji_mapping with KanjiBlock
    KanjiBlock will be used for spacing and particle translation later
    if the kanji found is a verb stem then try to find an ending to match it with what's in kchar_list
    candidates are tried from every start position, longest candidate first
    :param kchar_list: list (or string) containing kana and kanji characters
    :return: new list based on kchar_list with all found Kanji characters turned in to KanjiBlock objects
    """
    # the kanji mapping is loaded from the json files on first use only
    if len(UnicodeRomajiMapping.kanji_mapping) == 0:
        UnicodeRomajiMapping.kanji_mapping = load_kanji_mappings_dict()
    # the loop bound is the length of the input; kana_list itself shrinks while blocks are merged
    max_char_len = len(kchar_list)
    kana_list = list(kchar_list)
    start_pos = 0
    while start_pos < max_char_len:
        # start with everything from start_pos to the end, then drop one trailing item per pass
        # once start_pos is past the end of the shrunken list char_len is <= 0 and nothing is tried
        char_len = len(kana_list) - start_pos
        while char_len > 0:
            curr_chars = "".join(kana_list[start_pos: (start_pos + char_len)])
            if curr_chars in UnicodeRomajiMapping.kanji_mapping:
                verb_stem_type = get_type_if_verb_stem(curr_chars)
                ending_match_found = False
                if verb_stem_type is not None:
                    ending_kana, ending_romaji = check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len)
                    if ending_kana is not None and ending_romaji is not None:
                        ending_match_found = True
                        conjugated_val = {
                            "romaji": get_verb_stem_romaji(curr_chars) + ending_romaji,
                            "w_type": "conjugated " + verb_stem_type
                        }
                        # remove stem and ending back to front so that the remaining indexes stay valid
                        for i in range(start_pos + char_len - 1 + len(ending_kana), start_pos - 1, -1):
                            del kana_list[i]
                        # a single KanjiBlock takes the place of the removed stem + ending
                        kana_list.insert(start_pos,
                                         KanjiBlock(curr_chars + ending_kana, conjugated_val))
                # no conjugation matched: use the plain reading, unless the only readings are verb stems
                if ending_match_found is False and has_non_verb_stem_reading(curr_chars):
                    for i in range(start_pos + char_len - 1, start_pos - 1, -1):
                        del kana_list[i]
                    kana_list.insert(start_pos,
                                     KanjiBlock(curr_chars, UnicodeRomajiMapping.kanji_mapping[curr_chars]))
            # the scan keeps going with shorter candidates even after a block was inserted
            char_len -= 1
        start_pos += 1
    return kana_list
def translate_kanji(kana_list):
    """
    replace every KanjiBlock in kana_list with its romaji and join the list in to one string
    :param kana_list: list of strings and KanjiBlock objects; KanjiBlock items are replaced in place
    :return: joined string of all items
    """
    for idx, item in enumerate(kana_list):
        # exact type match on purpose, subclasses of KanjiBlock are not replaced
        if type(item) == KanjiBlock:
            kana_list[idx] = item.romaji
    return "".join(kana_list)
def prep_kanji(kana):
    """
    split kana in to a list and, if it contains any kanji, group the kanji in to KanjiBlock objects
    kanji iteration marks are translated right after the KanjiBlocks were created
    :param kana: unicode kana(+kanji) characters
    :return: list of characters, with KanjiBlock objects if kana contains kanji
    """
    contains_kanji = any([is_kanji(ch) for ch in kana])
    if not contains_kanji:
        return list(kana)
    kanji_blocks = prepare_kanjiblocks(kana)
    translate_kanji_iteration_mark(kanji_blocks)
    return kanji_blocks
def translate_to_romaji(kana):
    """
    translate hiragana, katakana, typographic, and fhw latin
    two character sequences are translated before single characters
    afterwards runs of spaces are collapsed to one space and every line is stripped
    :param kana: unicode kana(+kanji) characters
    :return: translated base kana characters to romaji as well as typographic, and fhw latin
    """
    # the kana mapping is loaded from the json files on first use only
    if len(UnicodeRomajiMapping.kana_mapping) == 0:
        UnicodeRomajiMapping.kana_mapping = load_kana_mappings_dict()
    kana_mapping = UnicodeRomajiMapping.kana_mapping

    max_char_len = 2
    for char_len in range(max_char_len, 0, -1):
        start_pos = 0
        while start_pos < len(kana) - char_len + 1:
            curr_chars = kana[start_pos: (start_pos + char_len)]
            if curr_chars in kana_mapping:
                kana = kana.replace(curr_chars, kana_mapping[curr_chars], 1)
                if len(kana_mapping[curr_chars]) == 0:
                    # the match was removed entirely, look at the same position again
                    start_pos -= 1
            start_pos += 1

    # collapse runs of spaces; looping on a single space would never terminate
    # built with * so that the two spaces can not be merged by whitespace normalising tools
    double_space = " " * 2
    while double_space in kana:
        kana = kana.replace(double_space, " ")
    kana = kana.strip()
    return "\n".join(line.strip() for line in kana.split("\n"))
def translate_soukon(partial_kana):
    """
    translate both hiragana and katakana soukon: っ, ッ; repeats next consonant
    e.g:
        ちょっと will be choっto by the time it is passed to this method and then becomes chotto
    every soukon is replaced at its own position by the character that follows it in partial_kana
    a soukon at the very end has nothing to repeat and is left unchanged instead of raising IndexError
    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with soukon translated
    """
    soukon_chars = (hirgana_soukon_unicode_char, katakana_soukon_unicode_char)
    last_idx = len(partial_kana) - 1
    translated = []
    for idx, c in enumerate(partial_kana):
        if c in soukon_chars and idx < last_idx:
            translated.append(partial_kana[idx + 1])
        else:
            translated.append(c)
    return "".join(translated)
def translate_long_vowel(partial_kana):
    """
    translate katakana long vowel ー; repeats previous vowel
    e.g:
        メール will be meーru by the time it is passed to this method and then becomes meeru
    a long vowel mark that does not directly follow one of a, e, i, o, u is removed
    this includes a mark at the very start, which used to raise IndexError
    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with long vowel translated
    """
    vowels = ("a", "e", "i", "o", "u")
    translated = []
    prev_c = ""
    for c in partial_kana:
        if c == katakana_long_vowel_mark:
            if prev_c in vowels:
                translated.append(prev_c)
        else:
            translated.append(c)
        # compared against the original character, so a second mark in a row is removed
        prev_c = c
    return "".join(translated)
def translate_soukon_ch(kana):
    """
    if soukon(mini-tsu) is followed by chi then soukon romaji becomes 't' sound
    e.g: ko-soukon-chi -> kotchi instead of kocchi
    each soukon is checked and replaced at its own position, so a soukon further right
    that is not followed by chi is never replaced by mistake
    :param kana: unicode kana(+kanji) characters, not yet translated to romaji
    :return: kana with every soukon that precedes chi replaced by "t"
    """
    soukon_chars = (hirgana_soukon_unicode_char, katakana_soukon_unicode_char)
    hiragana_chi_unicode_char = "\u3061"
    katakana_chi_unicode_char = "\u30C1"
    chi_chars = (hiragana_chi_unicode_char, katakana_chi_unicode_char)

    last_idx = len(kana) - 1
    translated = []
    for idx, c in enumerate(kana):
        if c in soukon_chars and idx < last_idx and kana[idx + 1] in chi_chars:
            translated.append("t")
        else:
            translated.append(c)
    return "".join(translated)
# unvoiced kana of the ka, sa, ta and ha rows; in unicode the voiced (dakuten) form of each of
# these characters is the code point right after it
# written as escapes so that the characters can not be lost when the file is re-encoded
_DAKUTEN_UNVOICED_KANA = (
    "\u304B\u304D\u304F\u3051\u3053"  # hiragana ka ki ku ke ko
    "\u3055\u3057\u3059\u305B\u305D"  # hiragana sa shi su se so
    "\u305F\u3061\u3064\u3066\u3068"  # hiragana ta chi tsu te to
    "\u306F\u3072\u3075\u3078\u307B"  # hiragana ha hi fu he ho
    "\u30AB\u30AD\u30AF\u30B1\u30B3"  # katakana ka ki ku ke ko
    "\u30B5\u30B7\u30B9\u30BB\u30BD"  # katakana sa shi su se so
    "\u30BF\u30C1\u30C4\u30C6\u30C8"  # katakana ta chi tsu te to
    "\u30CF\u30D2\u30D5\u30D8\u30DB"  # katakana ha hi fu he ho
)
_DAKUTEN_MAPPING = {unvoiced: chr(ord(unvoiced) + 1) for unvoiced in _DAKUTEN_UNVOICED_KANA}


def _translate_dakuten_equivalent_char(kana_char):
    """
    look up the dakuten equivalent of a single kana character
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    return _DAKUTEN_MAPPING.get(kana_char, "")


def translate_dakuten_equivalent(kana_char):
    """
    translate hiragana and katakana character to their dakuten equivalent
    e.g:
        ヒ: ビ
        く: ぐ
        み: ""
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    return _translate_dakuten_equivalent_char(kana_char)
def translate_kana_iteration_mark(kana):
    """
    translate hiragana and katakana iteration marks: ゝ, ゞ, ヽ, ヾ
    a plain mark repeats the last character that was not a mark
    a voiced mark repeats the dakuten equivalent of that character (nothing if it has none)
    e.g:
        こゝ: koko
        タヾ: tada
        かゞみち: kagaみち
    :param kana: unicode consisting of kana chars
    :return: unicode with kana iteration marks translated
    """
    plain_marks = (hiragana_iter_mark, katakana_iter_mark)
    voiced_marks = (hiragana_voiced_iter_mark, katakana_voiced_iter_mark)

    translated = []
    last_regular_char = ""
    for c in kana:
        if c in plain_marks:
            translated.append(last_regular_char)
        elif c in voiced_marks:
            translated.append(translate_dakuten_equivalent(last_regular_char))
        else:
            translated.append(c)
            last_regular_char = c
    return "".join(translated)
def kanji_to_romaji(kana):
    """
    translate kana(+kanji) characters to romaji
    steps, in order: kana iteration marks, soukon before chi, kanji blocks and kanji iteration marks,
    particles, kanji romaji, base kana romaji, remaining soukon, long vowel marks
    :param kana: unicode kana(+kanji) characters
    :return: romaji encoded with unicode_escape (bytes)
    """
    marks_translated = translate_soukon_ch(translate_kana_iteration_mark(kana))

    kana_blocks = prep_kanji(marks_translated)
    translate_particles(kana_blocks)  # updates kana_blocks in place

    partial_romaji = translate_to_romaji(translate_kanji(kana_blocks))
    romaji = translate_long_vowel(translate_soukon(partial_romaji))
    return romaji.replace("\\\\", "\\").encode("unicode_escape")
# command line entry point: all arguments are joined and translated as one string
if __name__ == "__main__":
    if len(sys.argv) > 1:
        raw_arg = "".join(sys.argv[1:])
        # arguments may contain escapes such as \u30D2 as well as literal kana
        # str has no decode() on python 3: go through bytes, turning every character outside
        # latin-1 in to an escape first so that literal kana survives the round trip
        kana_arg = raw_arg.encode("latin-1", "backslashreplace").decode("unicode-escape")
        romaji = kanji_to_romaji(kana_arg)
        if isinstance(romaji, bytes):
            # unicode_escape output is pure ascii; decode so print does not show b'...'
            romaji = romaji.decode("ascii")
        print(romaji)
    else:
        print("Missing Kanji/Kana character argument\n"
              "e.g: kanji_to_romaji.py \\u30D2")