upload
This commit is contained in:
646
utils/modules/kanji_to_romaji/kanji_to_romaji_module.py
Normal file
646
utils/modules/kanji_to_romaji/kanji_to_romaji_module.py
Normal file
@@ -0,0 +1,646 @@
|
||||
# coding=utf-8
|
||||
import os
|
||||
import sys
|
||||
from collections import OrderedDict
|
||||
|
||||
try:
|
||||
# noinspection PyPackageRequirements
|
||||
import simplejson as json
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
from .models import UnicodeRomajiMapping
|
||||
from .models import KanjiBlock
|
||||
from .models import Particle
|
||||
|
||||
# Location of the JSON mapping tables, resolved relative to this module.
PATH_TO_MODULE = os.path.dirname(__file__)
JP_MAPPINGS_PATH = os.path.join(PATH_TO_MODULE, "jp_mappings")

# Iteration (repeat) marks: plain and voiced, for hiragana and katakana,
# plus the kanji iteration mark.
hiragana_iter_mark = "ゝ"
hiragana_voiced_iter_mark = "ゞ"
katakana_iter_mark = "ヽ"
katakana_voiced_iter_mark = "ヾ"
kanji_iteration_mark = "々"

# Small tsu ("soukon" in this module's naming) and the katakana long vowel mark.
# NOTE: "hirgana" is a historical typo; the name is kept because other
# functions in this module refer to it.
hirgana_soukon_unicode_char = "っ"
katakana_soukon_unicode_char = "ッ"
katakana_long_vowel_mark = "ー"
|
||||
|
||||
|
||||
def load_kana_mappings_dict():
    """
    merge every kana json table found in JP_MAPPINGS_PATH in to one dictionary
    a file is used when its extension is ".json" and its name does not contain "kanji"
    files are opened as 'utf-8-sig'
    when two files define the same key the file read later wins (dict.update)
    :return: dict - merged mapping loaded from the json files
    """
    merged_mapping = {}
    for file_name in os.listdir(JP_MAPPINGS_PATH):
        extension = os.path.splitext(file_name)[1]
        if extension != ".json" or "kanji" in file_name:
            continue

        mapping_path = os.path.join(JP_MAPPINGS_PATH, file_name)
        with open(mapping_path, encoding='utf-8-sig') as data_file:
            merged_mapping.update(json.load(data_file))
    return merged_mapping
|
||||
|
||||
|
||||
def load_kanji_mappings_dict():
    """
    read through all json files in JP_MAPPINGS_PATH that contain "kanji" in filename
    and merge them in to a single kanji to romaji dictionary

    files with "conjugated" in their name are read last, so their entries
    have lower priority (they only become the main entry for unseen keys)

    if a key(kanji char) was already added with a different "w_type" then the new
    reading is stored under that entry's "other_readings", keyed by the new w_type
    if the w_type is the same (or the key is new) the entry is stored/overwritten as is
    e.g:
    {u"係り": {
        'w_type': 'noun',
        'romaji': 'kakari',
        'other_readings': {'godan verb stem': 'kakawari'}
        }
    }
    :return: dict - kanji to romaji mapping
    """
    kanji_romaji_mapping = {}

    # stable sort on a boolean key: non-conjugated files keep their listing
    # order and come first, conjugated files follow in their listing order
    ordered_names = sorted(os.listdir(JP_MAPPINGS_PATH),
                           key=lambda name: "conjugated" in name)

    for file_name in ordered_names:
        if os.path.splitext(file_name)[1] != ".json" or "kanji" not in file_name:
            continue

        with open(os.path.join(JP_MAPPINGS_PATH, file_name), encoding='utf-8-sig') as data_file:
            file_entries = json.load(data_file)

        for kanji, entry in file_entries.items():
            is_alternate_reading = kanji in kanji_romaji_mapping and \
                entry["w_type"] != kanji_romaji_mapping[kanji]["w_type"]

            if is_alternate_reading:
                known_entry = kanji_romaji_mapping[kanji]
                if "other_readings" not in known_entry:
                    known_entry["other_readings"] = {}
                known_entry["other_readings"][entry["w_type"]] = entry["romaji"]
            else:
                kanji_romaji_mapping[kanji] = entry
    return kanji_romaji_mapping
|
||||
|
||||
|
||||
def _convert_hira_kata_char(hira_or_kata_char, h_to_k=True):
|
||||
"""
|
||||
take second last hex character from unicode and add/subtract 6 hex to it to get hiragana/katakana char
|
||||
e.g hiragana u3041 -> 0x3041 + 0x6 = 0x30A1 -> katakana u30A1
|
||||
|
||||
:param hira_or_kata_char: unicode hiragana character
|
||||
:return: converterd hiragana or katakana depending on h_to_k value
|
||||
"""
|
||||
if h_to_k:
|
||||
suffix_offset = 6
|
||||
else:
|
||||
suffix_offset = -6
|
||||
unicode_second_last_char = list(hira_or_kata_char.encode("unicode_escape"))[-2]
|
||||
suffix = hex(int(unicode_second_last_char, 16) + suffix_offset)
|
||||
char_list = list(hira_or_kata_char.encode("unicode_escape"))
|
||||
char_list[-2] = suffix[-1]
|
||||
result_char = "".join(char_list).decode('unicode-escape').encode('utf-8')
|
||||
return result_char
|
||||
|
||||
|
||||
def convert_hiragana_to_katakana(hiragana):
|
||||
converted_str = ""
|
||||
|
||||
for c in hiragana:
|
||||
if is_hiragana(c) or c in [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char]:
|
||||
converted_str += _convert_hira_kata_char(c)
|
||||
else:
|
||||
converted_str += c.encode('utf-8')
|
||||
return converted_str.decode("utf-8")
|
||||
|
||||
|
||||
def convert_katakana_to_hiragana(katakana):
|
||||
converted_str = ""
|
||||
|
||||
for c in katakana:
|
||||
if is_katakana(c) or c in [katakana_iter_mark, katakana_voiced_iter_mark,
|
||||
katakana_soukon_unicode_char]:
|
||||
converted_str += _convert_hira_kata_char(c, h_to_k=False)
|
||||
else:
|
||||
converted_str += c.encode('utf-8')
|
||||
return converted_str.decode("utf-8")
|
||||
|
||||
|
||||
def is_hiragana(c):
    """
    check if c falls in the hiragana range u3041 - u3096
    the hiragana iteration marks and the small tsu are never reported as hiragana
    :param c: character to check
    :return: True if c is a regular hiragana character, False otherwise
    """
    if c in [hiragana_iter_mark, hiragana_voiced_iter_mark, hirgana_soukon_unicode_char]:
        return False
    return "\u3041" <= c <= "\u3096"
|
||||
|
||||
|
||||
def is_katakana(c):
    """
    check if c falls in the katakana range u30A1 - u30F6
    the katakana iteration marks, the small tsu and the long vowel mark
    are never reported as katakana
    :param c: character to check
    :return: True if c is a regular katakana character, False otherwise
    """
    excluded_marks = [katakana_iter_mark, katakana_voiced_iter_mark,
                      katakana_soukon_unicode_char, katakana_long_vowel_mark]
    if c in excluded_marks:
        return False
    return "\u30A1" <= c <= "\u30F6"
|
||||
|
||||
|
||||
def is_kanji(c):
    """
    check if c is kanji
    a KanjiBlock always counts as kanji; any other value counts when it is not
    the kanji iteration mark and lies in the range u4E00 - u9FD5
    :param c: character or KanjiBlock to check
    :return: True if c is treated as kanji, False otherwise
    """
    if isinstance(c, KanjiBlock):
        return True
    return c != kanji_iteration_mark and "\u4E00" <= c <= "\u9FD5"
|
||||
|
||||
|
||||
def get_char_type(c):
    """
    classify the passed character using is_hiragana, is_katakana and is_kanji (in that order)
    :param c: kana or kanji character
    :return: "hiragana", "katakana", "kanji" or None when no check matches
    """
    if is_hiragana(c):
        return "hiragana"
    if is_katakana(c):
        return "katakana"
    if is_kanji(c):
        return "kanji"
    return None
|
||||
|
||||
|
||||
def translate_particles(kana_list):
    """
    try to find particles which are in hiragana and turn them in to Particle objects
    Particle will provide spacing and will be translated in to appropriate romaji (e.g wa instead of ha for は)

    rules (varies depending on the hiragana char):
        char between two KanjiBlocks(that can be nouns) then assume to be a particle
            e.g: 私は嬉 -> KanjiBlock(私), は, KanjiBlock(嬉) -> は is particle use wa instead of ha
        type(Kanji, Hiragana, Katakana) changes adjacent to the char
            e.g: アパートへくる -> ト, へ, く -> katakana, へ, hiragana -> へ is a particle, use e instead of he
        char is last char and previous char is a noun
            e.g: 会いました友達に -> KanjiBlock(友達) which is a noun, に
        は and も are also particles when they directly follow certain other particles

    the first element of kana_list is never considered (a particle needs a previous element)

    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    def is_noun(k_block):
        if not hasattr(k_block, "w_type"):
            return False
        return "noun" in k_block.w_type or "pronoun" in k_block.w_type

    def type_changes(p, n):
        # only a change between two recognised types counts
        p_type = get_char_type(p)
        if p_type is None:
            return False
        n_type = get_char_type(n)
        if n_type is None:
            return False
        return p_type != n_type

    def particle_imm_follows(prev_c_, valid_prev_particles):
        """
        check if prev_c is a Particle object
        check that prev_c is one of the valid_prev_particles
        e.g: wa particle can't be followed by wa particle again but ni particle can be followed by wa.
        :param prev_c_: previous character compared to current character in the iteration
        :param valid_prev_particles: list of previous particles that can be followed by current character.
        :return: True when prev_c_ is one of the allowed preceding particles
        """
        return isinstance(prev_c_, Particle) and prev_c_ in valid_prev_particles

    def in_particle_position(prev_c_, next_c_, is_last, valid_prev_particles=()):
        # shared rule used by every particle except の (which needs a noun after it)
        return (is_noun(prev_c_) and isinstance(next_c_, KanjiBlock)) or \
            type_changes(prev_c_, next_c_) or \
            particle_imm_follows(prev_c_, valid_prev_particles) or \
            (is_noun(prev_c_) and is_last)

    no_hira_char = "\u306E"
    ha_hira_char = "\u306F"
    he_hira_char = "\u3078"
    to_hira_char = "\u3068"
    ni_hira_char = "\u306B"
    de_hira_char = "\u3067"
    mo_hira_char = "\u3082"
    ga_hira_char = "\u304C"

    no_prtcle = Particle("no")
    wa_prtcle = Particle("wa")
    e_prtcle = Particle("e")
    to_prtcle = Particle("to")
    ni_prtcle = Particle("ni")
    de_prtcle = Particle("de")
    mo_prtcle = Particle("mo")
    ga_prtcle = Particle("ga")

    # particles that share the plain rule, without any "follows another particle" case
    simple_particles = [
        (he_hira_char, e_prtcle),
        (to_hira_char, to_prtcle),
        (ni_hira_char, ni_prtcle),
        (de_hira_char, de_prtcle),
        (ga_hira_char, ga_prtcle),
    ]

    last_index = len(kana_list) - 1
    for i in range(1, len(kana_list)):
        # prev_c is read from the list, so it may already be a Particle set by an earlier iteration
        prev_c = kana_list[i - 1]
        is_last_char = i == last_index
        next_c = "" if is_last_char else kana_list[i + 1]

        if kana_list[i] == no_hira_char:
            if (is_noun(prev_c) and is_noun(next_c)) or \
                    type_changes(prev_c, next_c) or \
                    (is_noun(prev_c) and is_last_char):
                kana_list[i] = no_prtcle

        elif kana_list[i] == ha_hira_char:
            if in_particle_position(prev_c, next_c, is_last_char,
                                    [e_prtcle, to_prtcle, ni_prtcle, de_prtcle]):
                kana_list[i] = wa_prtcle

        elif kana_list[i] == mo_hira_char:
            if in_particle_position(prev_c, next_c, is_last_char, [ni_prtcle, de_prtcle]):
                kana_list[i] = mo_prtcle

        else:
            # the original condition mixed `and`/`or` without parentheses, so the
            # membership test only guarded the first clause; here the character
            # test guards the whole rule, which is what the inner branches assumed
            for hira_char, particle in simple_particles:
                if kana_list[i] == hira_char:
                    if in_particle_position(prev_c, next_c, is_last_char):
                        kana_list[i] = particle
                    break
|
||||
|
||||
|
||||
def translate_kanji_iteration_mark(kana_list):
    """
    translate kanji_iteration_mark: 々
    the mark is replaced in place by the stripped romaji of the element before it
    e.g:
        在々: zaizai

    a mark with nothing translatable before it (first element, or previous element
    without a "romaji" attribute) is left untouched instead of raising AttributeError
    consecutive marks all repeat the same romaji

    :param kana_list: list of kana characters and KanjiBlock objects
    :return: None; update the kana_list that is passed
    """
    # romaji of the most recent element that can be repeated, None when there is none
    repeatable_romaji = None
    for i, c in enumerate(kana_list):
        if c == kanji_iteration_mark:
            if repeatable_romaji is not None:
                kana_list[i] = repeatable_romaji
            # keep repeatable_romaji so that a following mark repeats it as well
            continue

        romaji = getattr(c, "romaji", None)
        repeatable_romaji = romaji.strip() if romaji is not None else None
|
||||
|
||||
|
||||
def get_type_if_verb_stem(curr_chars):
    """
    get verb type for given verb stem, looked up in UnicodeRomajiMapping.kanji_mapping
    when the main w_type contains "verb stem" that w_type is returned unchanged
    otherwise "godan verb" / "ichidan verb" is returned when the matching
    "... verb stem" key exists in "other_readings" (godan is checked first)
    No stem for irregulars
    :param curr_chars: kanji chars that is a verb stem; must be a key of kanji_mapping
    :return: type of verb stem, None if there is no verb stem reading
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]

    if "verb stem" in entry["w_type"]:
        return entry["w_type"]

    if "other_readings" in entry:
        other_readings = entry["other_readings"]
        if "godan verb stem" in other_readings:
            return "godan verb"
        if "ichidan verb stem" in other_readings:
            return "ichidan verb"

    return None
|
||||
|
||||
|
||||
def check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len):
    """
    if the given curr_chars has a verb stem reading then try to match it with one of the listed verb endings
    this function only reports the match, it does not modify kana_list

    e.g:
        kana_list = [KanjiBlock(灯り), ま, し, た]
        curr_chars = 灯り can be verb stem reading
        try and match 灯り with an ending within kana_list
        灯り + ました matches
        returns ("ました", "mashita ")

        kana_list = [KanjiBlock(灯り), を, 見ます]
        curr_chars = 灯り can be verb stem reading
        try and match 灯り with an ending within kana_list
        no matching ending
        returns (None, None)

    :param kana_list: list of kana characters and KanjiBlock objects
    :param curr_chars: KanjiBlock current characters to parse out of entire kana_list
    :param start_pos: index in kana_list where curr_chars starts
    :param char_len: number of kana_list elements that make up curr_chars
    :return: ending kana, ending romaji (with a trailing space); both will be None if ending not found
    """
    # order matters: the first ending that matches wins
    endings = (
        ("ませんでした", "masen deshita"),
        ("ませんで", "masende"),
        ("なさるな", "nasaruna"),
        ("なかった", "nakatta"),
        ("れて", "rete"),
        ("ましょう", "mashou"),
        ("ました", "mashita"),
        ("まして", "mashite"),
        ("ません", "masen"),
        ("ないで", "naide"),
        ("なさい", "nasai"),
        ("ます", "masu"),
        ("よう", "you"),  # ichidan
        ("ない", "nai"),
        ("た", "ta"),  # ichidan
        ("て", "te"),  # ichidan
        ("ろ", "ro"),  # ichidan
        ("う", "u"),
    )

    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]
    has_verb_stem_reading = "verb stem" in entry["w_type"]
    if not has_verb_stem_reading and "other_readings" in entry:
        other_readings = entry["other_readings"]
        has_verb_stem_reading = "godan verb stem" in other_readings or \
            "ichidan verb stem" in other_readings

    if not has_verb_stem_reading:
        return None, None

    for ending_kana, ending_romaji in endings:
        window_end = start_pos + char_len + len(ending_kana)
        actual_conj = "".join(kana_list[start_pos:window_end])
        if actual_conj == curr_chars + ending_kana:
            return ending_kana, ending_romaji + " "

    return None, None
|
||||
|
||||
|
||||
def has_non_verb_stem_reading(curr_chars):
    """
    check if curr_chars has a reading that is not a verb stem
    either its main w_type is not a verb stem, or one of its "other_readings" is not
    :param curr_chars: unicode kanji chars to check; must be a key of kanji_mapping
    :return: True if a non verb stem reading exists, False otherwise
    """
    entry = UnicodeRomajiMapping.kanji_mapping[curr_chars]

    if "verb stem" not in entry["w_type"]:
        return True

    if "other_readings" in entry:
        return any("verb stem" not in reading_type
                   for reading_type in entry["other_readings"].keys())

    return False
|
||||
|
||||
|
||||
def get_verb_stem_romaji(verb_stem_kanji):
    """
    find romaji for verb stem within kanji_mapping
    the main "romaji" is used when the main w_type is a verb stem, otherwise the
    first entry of "other_readings" whose key contains "verb stem"
    :param verb_stem_kanji: unicode verb stem kanji; must be a key of kanji_mapping
    :return: romaji for verb stem kanji, None if no verb stem reading exists
    """
    entry = UnicodeRomajiMapping.kanji_mapping[verb_stem_kanji]

    if "verb stem" in entry["w_type"]:
        return entry["romaji"]

    if "other_readings" in entry:
        for reading_type, reading_romaji in entry["other_readings"].items():
            if "verb stem" in reading_type:
                return reading_romaji

    return None
|
||||
|
||||
|
||||
def prepare_kanjiblocks(kchar_list):
    """
    create and replace matched Kanji characters that are within kanji_mapping with KanjiBlock
    KanjiBlock will be used for spacing and particle translation later
    if the kanji found is a verb stem then try to find an ending to match it with what's in kchar_list

    for every start position the longest run of elements is tried first and then
    shortened one element at a time; a matched run is collapsed in to one KanjiBlock
    kanji_mapping is loaded on first use (when it is still empty)

    :param kchar_list: list containing kana and kanji characters
    :return: kchar_list with all found Kanji characters turned in to KanjiBlock objects
    """
    if len(UnicodeRomajiMapping.kanji_mapping) == 0:
        UnicodeRomajiMapping.kanji_mapping = load_kanji_mappings_dict()
    known_kanji = UnicodeRomajiMapping.kanji_mapping

    kana_list = list(kchar_list)

    # the outer bound is the original length; kana_list itself shrinks as runs are collapsed
    for start_pos in range(len(kchar_list)):
        for char_len in range(len(kana_list) - start_pos, 0, -1):
            run_end = start_pos + char_len
            curr_chars = "".join(kana_list[start_pos:run_end])
            if curr_chars not in known_kanji:
                continue

            verb_stem_type = get_type_if_verb_stem(curr_chars)
            ending_match_found = False

            if verb_stem_type is not None:
                ending_kana, ending_romaji = check_for_verb_stem_ending(kana_list, curr_chars, start_pos, char_len)
                if ending_kana is not None and ending_romaji is not None:
                    ending_match_found = True
                    conjugated_val = {
                        "romaji": get_verb_stem_romaji(curr_chars) + ending_romaji,
                        "w_type": "conjugated " + verb_stem_type
                    }

                    # drop the stem and its ending (back to front), then put one block in their place
                    for i in reversed(range(start_pos, run_end + len(ending_kana))):
                        del kana_list[i]

                    kana_list.insert(start_pos,
                                     KanjiBlock(curr_chars + ending_kana, conjugated_val))

            if ending_match_found is False and has_non_verb_stem_reading(curr_chars):
                for i in reversed(range(start_pos, run_end)):
                    del kana_list[i]
                kana_list.insert(start_pos,
                                 KanjiBlock(curr_chars, known_kanji[curr_chars]))

    return kana_list
|
||||
|
||||
|
||||
def translate_kanji(kana_list):
    """
    replace every KanjiBlock in kana_list (in place) with its romaji and join the list
    :param kana_list: list of kana characters and KanjiBlock objects
    :return: single string built by joining the updated kana_list
    """
    for position, element in enumerate(kana_list):
        # exact type match on purpose, same as the original check
        if type(element) == KanjiBlock:
            kana_list[position] = element.romaji

    return "".join(kana_list)
|
||||
|
||||
|
||||
def prep_kanji(kana):
    """
    split kana in to a list; when it contains kanji, turn known kanji in to
    KanjiBlock objects and translate kanji iteration marks
    :param kana: unicode kana(+kanji) characters
    :return: list of characters (and KanjiBlock objects when kanji was found)
    """
    if not any(is_kanji(k) for k in kana):
        return list(kana)

    kana_list = prepare_kanjiblocks(kana)
    translate_kanji_iteration_mark(kana_list)
    return kana_list
|
||||
|
||||
|
||||
def translate_to_romaji(kana):
    """
    translate hiragana, katakana, typographic, and fhw latin
    kana_mapping is loaded on first use (when it is still empty)
    two character sequences are translated before single characters
    afterwards runs of spaces are collapsed and every line is stripped
    :param kana: unicode kana(+kanji) characters
    :return: translated base kana characters to romaji as well as typographic, and fhw latin
    """
    if len(UnicodeRomajiMapping.kana_mapping) == 0:
        UnicodeRomajiMapping.kana_mapping = load_kana_mappings_dict()
    kana_mapping = UnicodeRomajiMapping.kana_mapping

    max_char_len = 2

    for char_len in range(max_char_len, 0, -1):
        start_pos = 0
        while start_pos < len(kana) - char_len + 1:
            curr_chars = kana[start_pos: (start_pos + char_len)]
            if curr_chars in kana_mapping:
                romaji = kana_mapping[curr_chars]
                kana = kana.replace(curr_chars, romaji, 1)
                if len(romaji) == 0:
                    # the match was removed entirely, so stay on the same position
                    start_pos -= 1
            start_pos += 1

    # collapse runs of spaces in to a single space; the pattern must be TWO spaces,
    # searching for a single space and replacing it with itself never terminates
    double_space = " " * 2
    while double_space in kana:
        kana = kana.replace(double_space, " ")
    kana = kana.strip()

    return "\n".join(line.strip() for line in kana.split("\n"))
|
||||
|
||||
|
||||
def translate_soukon(partial_kana):
    """
    translate both hiragana and katakana soukon: っ, ッ; repeats next consonant
    e.g:
        ちょっと will be choっto by the time it is passed to this method and then becomes chotto

    every soukon is replaced by the character that follows it in partial_kana
    a soukon that is the very last character has nothing to repeat and is left
    unchanged (previously this raised IndexError)

    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with soukon translated
    """
    soukon_chars = (hirgana_soukon_unicode_char, katakana_soukon_unicode_char)

    # replace by position instead of rsplit, so exactly the soukon being
    # looked at is replaced even when others remain in the string
    translated = list(partial_kana)
    for position in range(len(partial_kana) - 1):
        if partial_kana[position] in soukon_chars:
            translated[position] = partial_kana[position + 1]

    return "".join(translated)
|
||||
|
||||
|
||||
def translate_long_vowel(partial_kana):
    """
    translate katakana long vowel ー; repeats previous vowel
    e.g:
        メール will be meーru by the time it is passed to this method and then becomes meeru

    when the character before the mark is not one of a, e, i, o, u the mark is removed
    this includes a mark at the very start of the text (previously an IndexError)

    :param partial_kana: partially translated kana with base kana chars already translated to romaji
    :return: partial kana with long vowel translated
    """
    vowels = ("a", "e", "i", "o", "u")

    translated = []
    prev_c = ""  # previous character of the input, empty before the first one
    for c in partial_kana:
        if c == katakana_long_vowel_mark:
            if prev_c in vowels:
                translated.append(prev_c)
            # otherwise the mark is dropped
        else:
            translated.append(c)
        prev_c = c

    return "".join(translated)
|
||||
|
||||
|
||||
def translate_soukon_ch(kana):
    """
    if soukon(mini-tsu) is followed by chi then soukon romaji becomes 't' sound
    e.g: ko-soukon-chi -> kotchi instead of kocchi

    only a soukon directly followed by ち or チ is replaced
    the replacement is done by position: the previous rsplit based version replaced
    the LAST soukon in the text, which was the wrong one whenever another soukon
    came after the one in front of chi (e.g こっちにいった)

    :param kana: unicode kana(+kanji) characters
    :return: kana with every soukon that precedes chi replaced by "t"
    """
    hiragana_chi_unicode_char = "\u3061"
    katakana_chi_unicode_char = "\u30C1"
    soukon_chars = (hirgana_soukon_unicode_char, katakana_soukon_unicode_char)
    chi_chars = (hiragana_chi_unicode_char, katakana_chi_unicode_char)

    translated = list(kana)
    # the last character has no follower, so it can never precede chi
    for position in range(len(kana) - 1):
        if kana[position] in soukon_chars and kana[position + 1] in chi_chars:
            translated[position] = "t"

    return "".join(translated)
|
||||
|
||||
|
||||
def _translate_dakuten_equivalent_char(kana_char):
    """
    look up the dakuten (voiced) form of a single hiragana or katakana character
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    # parallel strings: the character at position n of "voiced" is the
    # dakuten form of the character at position n of "unvoiced"
    unvoiced = ("かきくけこ" "さしすせそ" "たちつてと" "はひふへほ"
                "タチツテト" "カキクケコ" "サシスセソ" "ハヒフヘホ")
    voiced = ("がぎぐげご" "ざじずぜぞ" "だぢづでど" "ばびぶべぼ"
              "ダヂヅデド" "ガギグゲゴ" "ザジズゼゾ" "バビブベボ")
    dakuten_mapping = dict(zip(unvoiced, voiced))

    return dakuten_mapping.get(kana_char, "")


def translate_dakuten_equivalent(kana_char):
    """
    translate hiragana and katakana character to their dakuten equivalent
    e.g:
        ヒ: ビ
        く: ぐ
        み: ""
    :param kana_char: unicode kana char
    :return: dakuten equivalent if it exists otherwise empty string
    """
    dakuten_equiv = _translate_dakuten_equivalent_char(kana_char)
    return dakuten_equiv
|
||||
|
||||
|
||||
def translate_kana_iteration_mark(kana):
    """
    translate hiragana and katakana iteration marks: ゝ, ゞ, ヽ, ヾ
    a plain mark repeats the last non-mark character, a voiced mark repeats
    its dakuten equivalent (or nothing when there is no equivalent)
    e.g:
        こゝ: ここ (later koko)
        タヾ: タダ (later tada)
        かゞみち: かがみち
    :param kana: unicode consisting of kana chars
    :return: unicode with kana iteration marks translated
    """
    plain_marks = (hiragana_iter_mark, katakana_iter_mark)
    voiced_marks = (hiragana_voiced_iter_mark, katakana_voiced_iter_mark)

    pieces = []
    last_regular_char = ""  # marks themselves never become the character to repeat
    for c in kana:
        if c in plain_marks:
            pieces.append(last_regular_char)
        elif c in voiced_marks:
            pieces.append(translate_dakuten_equivalent(last_regular_char))
        else:
            pieces.append(c)
            last_regular_char = c

    return "".join(pieces)
|
||||
|
||||
|
||||
def kanji_to_romaji(kana):
    """
    translate kana(+kanji) text to romaji by running every translation step in order:
    kana iteration marks, soukon before chi, kanji blocks, particles, kanji,
    base kana, remaining soukon and finally long vowel marks
    :param kana: unicode kana(+kanji) characters
    :return: romaji encoded with "unicode_escape" (a bytes object on Python 3);
             characters that could not be translated appear as escapes
    """
    text = translate_kana_iteration_mark(kana)
    text = translate_soukon_ch(text)

    blocks = prep_kanji(text)
    translate_particles(blocks)  # updates blocks in place

    romaji = translate_kanji(blocks)
    romaji = translate_to_romaji(romaji)
    romaji = translate_soukon(romaji)
    romaji = translate_long_vowel(romaji)

    return romaji.replace("\\\\", "\\").encode("unicode_escape")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # command line use: pass the text either directly or as unicode escapes (e.g \u30D2)
    if len(sys.argv) > 1:
        raw_argument = "".join(sys.argv[1:])
        # str has no .decode() on Python 3; going through latin-1 with
        # "backslashreplace" turns every character in to a byte or an escape, so
        # "unicode-escape" can resolve typed escapes without mangling real kana
        text = raw_argument.encode("latin-1", "backslashreplace").decode("unicode-escape")

        result = kanji_to_romaji(text)
        if isinstance(result, bytes):
            # unicode_escape output is pure ASCII; decode so print shows text, not b'...'
            result = result.decode("ascii")
        print(result)
    else:
        print("Missing Kanji/Kana character argument\n"
              "e.g: kanji_to_romaji.py \\u30D2")
|
||||
Reference in New Issue
Block a user