"""
mecab = MeCab.Tagger("-Ochasen")
mecab.parse('')  # an initial parse of the empty string is required before parseToNode
node = mecab.parseToNode(text)
ret = []
while node is not None:
    origin = node.surface  # the original surface form
    if not origin:
        node = node.next
        continue
    # if the surface form contains no kanji, no furigana is needed, so emit it unchanged
    if any(is_kanji(_) for _ in origin):
        kana = node.feature.split(",")[7]  # the katakana reading
        hiragana = jaconv.kata2hira(kana)
        for pair in split_okurigana(origin, hiragana):
            ret += [pair]
    else:
        ret += [(origin,)]
    node = node.next
return ret
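
# A minimal, self-contained sketch of the conversion step above (not from the
# original snippet): MeCab's ChaSen output carries a katakana reading, and
# jaconv.kata2hira folds it to hiragana before okurigana alignment. The
# reading string below is hard-coded for illustration.
import jaconv

reading = "カンジ"  # e.g. what MeCab would emit for the surface form "漢字"
print(jaconv.kata2hira(reading))  # -> かんじ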
trn_dir_list = [os.path.join(trn_base_dir, category) for category in ["core", "noncore"]]
all_triphone_sequences = []
for dir_idx, trn_dir in enumerate(trn_dir_list):
    trn_files = os.listdir(trn_dir)
    for file_idx, trn_filename in enumerate(trn_files):
        # printr: project-local carriage-return progress printer (defined elsewhere)
        printr("\r{}/{} ({}/{})".format(file_idx + 1, len(trn_files), dir_idx + 1, len(trn_dir_list)))
        trn_path = os.path.join(trn_dir, trn_filename)
        with codecs.open(trn_path, "r", "utf-8") as f:
            for data in f:
                components = data.split(":")
                assert len(components) == 3
                sentence = components[-1].strip()
                sentence = jaconv.hira2kata(sentence)  # force conversion to katakana
                unigram_tokens = convert_sentence_to_unigram_tokens(sentence)
                for token in unigram_tokens:
                    if token not in unigram_counts:
                        raise Exception(token)
                    unigram_counts[token] += 1
                if len(unigram_tokens) == 1:
                    continue
                for first, second in zip(unigram_tokens[:-1], unigram_tokens[1:]):
                    if first == u"ー":
                        continue
                    if second == u"ー":
                        continue
                    key = first + second
                    if key not in bigram_counts:
                        raise Exception(key)
                    bigram_counts[key] += 1
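
# A simplified, self-contained sketch of the counting loop above (not from the
# original snippet): collections.Counter stands in for the pre-initialized
# unigram_counts/bigram_counts dictionaries, and tokens are single characters.
import jaconv
from collections import Counter

sentence = jaconv.hira2kata("きょうはいいてんきです")  # -> "キョウハイイテンキデス"
tokens = list(sentence)
unigram_counts = Counter(tokens)
bigram_counts = Counter(first + second
                        for first, second in zip(tokens[:-1], tokens[1:])
                        if u"ー" not in (first, second))
print(unigram_counts.most_common(3))
print(bigram_counts.most_common(3))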
if k in song.meta:
    v = song.meta[k]
    search.add(normalize(v[request.lc]))
    search.add(normalize(v["k"]))
    search.add(normalize(v["l"]))
    search.add(normalize(jaconv.kana2alphabet(jaconv.kata2hira(v["k"]))).replace("ー", ""))
for k in ("genre",):
    if k in song.meta:
        v = song.meta[k]
        search.add(normalize(v[request.lc]))
d["search"] = list(search)
if request.latin:
    d["sort"] = song.meta["title"][(request.lc, "l")]
    if ord(d["sort"][0:1]) > 0x100:
        # try again with kana-to-romaji; it might help manufacture a sensible sort order
        d["sort"] = jaconv.kana2alphabet(jaconv.kata2hira(song.meta["title"][(request.lc, "l", "k")]))
else:
    d["sort"] = song.meta["title"][(request.lc, "k")]
return d
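
# A standalone sketch of the fallback sort key above (not from the original
# snippet): kata2hira then kana2alphabet manufactures a romaji string, and the
# prolonged sound mark is stripped afterwards. The title is made up.
import jaconv

title_kana = "ドラゴンボール"
sort_key = jaconv.kana2alphabet(jaconv.kata2hira(title_kana)).replace("ー", "")
print(sort_key)  # e.g. "doragonboru"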
with open(kanji_map_file_path, 'w') as f:
    # reserve indices for the special labels
    all_char_set.discard('N')
    all_char_set.discard('Z')
    all_char_set.discard('_')
    if model == 'attention':
        all_char_set.discard('<')
        all_char_set.discard('>')
    kanji_set = set([])
    for char in all_char_set:
        if (not is_hiragana(char)) and (not is_katakana(char)):
            kanji_set.add(char)
    for kana in kana_list:
        kanji_set.add(kana)
        kanji_set.add(jaconv.kata2hira(kana))
    if model == 'ctc':
        kanji_list = ['_', 'NZ'] + sorted(list(kanji_set))
    elif model == 'attention':
        kanji_list = ['_', '<', '>', 'NZ'] + sorted(list(kanji_set))
    for i, kanji in enumerate(kanji_list):
        f.write('%s %s\n' % (kanji, str(i)))
# kana
with open(kana_map_file_path, 'w') as f:
    if model == 'ctc':
        kana_list = ['_', 'NZ'] + kana_list
    elif model == 'attention':
        kana_list = ['_', '<', '>', 'NZ'] + kana_list
    for i, kana in enumerate(kana_list):
        f.write('%s %s\n' % (kana, str(i)))
# kanji
with open(kanji_map_file_path, 'w') as f:
    # labels whose indices are reserved
    all_char_set.discard('N')
    all_char_set.discard('Z')
    all_char_set.discard('_')
    all_char_set.discard('<')
    all_char_set.discard('>')
    kanji_set = set([])
    for char in all_char_set:
        if (not is_hiragana(char)) and (not is_katakana(char)):
            kanji_set.add(char)
    for kana in kana_list:
        kanji_set.add(kana)
        kanji_set.add(jaconv.kata2hira(kana))
    # NOTE: give the most frequent labels the smallest possible indices
    kanji_list = ['_', '<', '>', 'NZ'] + sorted(list(kanji_set))
    for index, kanji in enumerate(kanji_list):
        f.write('%s %s\n' % (kanji, str(index)))
# kana
with open(kana_map_file_path, 'w') as f:
    kana_list = ['_', '<', '>', 'NZ'] + kana_list
    for index, kana in enumerate(kana_list):
        f.write('%s %s\n' % (kana, str(index)))
# phone
with open(phone_map_file_path, 'w') as f:
    phone_list = ['_', '<', '>', 'NZ'] + sorted(list(phone_set))
    for index, phone in enumerate(phone_list):
        f.write('%s %s\n' % (phone, str(index)))
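
# A toy, self-contained version of the map-writing loops above (not from the
# original snippet): jaconv.kata2hira ensures each kana enters the label set
# in both scripts. The file name and kana inventory are made up.
import jaconv

kana_list = ['ア', 'イ', 'ウ']
label_set = set()
for kana in kana_list:
    label_set.add(kana)                    # katakana form
    label_set.add(jaconv.kata2hira(kana))  # hiragana form
labels = ['_', '<', '>', 'NZ'] + sorted(label_set)
with open('toy_label_map.txt', 'w') as f:
    for index, label in enumerate(labels):
        f.write('%s %s\n' % (label, index))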
import jaconv
from WaniKaniTools.api import APIv2

if __name__ == '__main__':
    api_v2 = APIv2()
    result = api_v2.GET('subjects', params={'types': 'vocabulary'})
    # print(json.dumps(result, indent=4))
    # NOTE: pagination over further result pages is not shown in this excerpt,
    # so the loop below does not terminate as written.
    while True:
        for data in result['data']:
            reading_array = set()
            for reading in data["data"]["readings"]:
                # if reading['primary']:
                reading_array.add(jaconv.kata2hira(reading['reading']))
            if len(reading_array) == 1:
                continue
            meaning_array = []
            for meaning in data["data"]["meanings"]:
                if meaning['primary']:
                    meaning_array += [meaning['meaning']]
            to_print = (
                data["data"]["characters"] if "characters" in data["data"] else data["data"]["character"],
                ', '.join(reading_array),
                ', '.join(meaning_array)
            )
            print('\t'.join(to_print))
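
# A minimal sketch of why readings are folded to hiragana above (not from the
# original snippet): jaconv.kata2hira collapses katakana and hiragana
# spellings of the same reading into a single set entry.
import jaconv

readings = ["ニンゲン", "にんげん"]  # the same reading in two scripts
unique = {jaconv.kata2hira(r) for r in readings}
print(unique)  # -> {'にんげん'}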
def katakana(string):
    return jaconv.hira2kata(string)

def normalize(s):
    val = jaconv.h2z(s)                 # half-width -> full-width
    val = jaconv.hira2kata(val)         # hiragana -> katakana
    val = val.translate(normalize_tbl)  # project-specific character table
    return val
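
# A self-contained sketch of the normalize() pipeline above (not from the
# original snippet; normalize_tbl is project-specific and omitted here):
# jaconv.h2z widens half-width katakana, then jaconv.hira2kata unifies
# everything to katakana.
import jaconv

s = "ｱｲｳｴｵ と かきくけこ"
val = jaconv.h2z(s)          # half-width kana -> full-width: アイウエオ ...
val = jaconv.hira2kata(val)  # hiragana -> katakana: ... ト カキクケコ
print(val)                   # -> "アイウエオ ト カキクケコ"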
def text_to_sequence(text, p=0.0):
    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】",
              "（", "）", "(", ")"]:
        text = text.replace(c, "")
    text = text.replace("!", "！")
    text = text.replace("?", "？")
    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)
    return [ord(c) for c in text] + [_eos]  # append the EOS symbol
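
# A short sketch of what jaconv.normalize contributes above (the first call is
# the example from jaconv's own README): NFKC plus Japanese-specific cleanup
# such as folding tilde-like marks into "ー", before hira2kata runs.
import jaconv

print(jaconv.normalize('ティロ･フィナ〜レ'))   # -> ティロ・フィナーレ
print(jaconv.hira2kata('てぃろ・ふぃなーれ'))  # -> ティロ・フィナーレ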
logmel, delta, delta_delta = fft.compute_deltas(logmel)
logmel = logmel.T
delta = delta.T
delta_delta = delta_delta.T
if logmel.shape[1] > max_feature_length:
    max_feature_length = logmel.shape[1]
if len(sentence) > max_sentence_length:
    max_sentence_length = len(sentence)
if logmel.shape[1] == 0:
    continue
audio_features.append((logmel, delta, delta_delta))
sentence = jaconv.hira2kata(sentence)  # force conversion to katakana
sentences.append(sentence)
assert max_feature_length > 0
return audio_features, sentences, max_feature_length, max_sentence_length
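
# A hedged guess at how the returned maxima are used (not shown in the
# snippet): zero-padding each feature matrix out to max_feature_length before
# batching. Shapes below are made up.
import numpy as np

logmel = np.random.randn(40, 73)  # (n_mels, frames)
max_feature_length = 100
padded = np.zeros((logmel.shape[0], max_feature_length), dtype=logmel.dtype)
padded[:, :logmel.shape[1]] = logmel
print(padded.shape)  # -> (40, 100)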