How to use jaconv - 10 common examples

To help you get started, we’ve selected a few jaconv examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github MikimotoH / furigana / furigana / furigana.py View on Github external
"""
    mecab = MeCab.Tagger("-Ochasen")
    mecab.parse('') # 空でパースする必要がある
    node = mecab.parseToNode(text)
    ret = []

    while node is not None:
        origin = node.surface # もとの単語を代入
        if not origin:
            node = node.next
            continue

        # originが空のとき、漢字以外の時はふりがなを振る必要がないのでそのまま出力する
        if origin != "" and any(is_kanji(_) for _ in origin):
            kana = node.feature.split(",")[7] # 読み仮名を代入
            hiragana = jaconv.kata2hira(kana)
            for pair in split_okurigana(origin, hiragana):
                ret += [pair]
        else:
            if origin:
                ret += [(origin,)]
        node = node.next
    return ret
github musyoku / chainer-speech-recognition / tools / preprocess / bigram.py View on Github external
trn_dir_list = [os.path.join(trn_base_dir, category) for category in ["core", "noncore"]]
	all_triphone_sequences = []

	for dir_idx, trn_dir in enumerate(trn_dir_list):
		trn_files = os.listdir(trn_dir)

		for file_idx, trn_filename in enumerate(trn_files):
			printr("\r{}/{} ({}/{})".format(file_idx + 1, len(trn_files), dir_idx + 1, len(trn_dir_list)))
			trn_path = os.path.join(trn_dir, trn_filename)

			with codecs.open(trn_path, "r", "utf-8") as f:
				for data in f:
					components = data.split(":")
					assert len(components) == 3
					sentence = components[-1].strip()
					sentence = jaconv.hira2kata(sentence) # 強制カタカナ変換
					unigram_tokens = convert_sentence_to_unigram_tokens(sentence)
					for token in unigram_tokens:
						if token not in unigram_counts:
							raise Exception(token)
						unigram_counts[token] += 1
					if len(unigram_tokens) == 1:
						continue
					for first, second in zip(unigram_tokens[:-1], unigram_tokens[1:]):
						if first == u"ー":
							continue
						if second == u"ー":
							continue
						key = first + second
						if key not in bigram_counts:
							raise Exception(key)
						bigram_counts[key] += 1
github marcan / blitzloop / blitzloop / web.py View on Github external
if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
            search.add(normalize(v["k"]))
            search.add(normalize(v["l"]))
            search.add(normalize(jaconv.kana2alphabet(jaconv.kata2hira(v["k"]))).replace("ー",""))
    for k in ("genre",):
        if k in song.meta:
            v = song.meta[k]
            search.add(normalize(v[request.lc]))
    d["search"] = list(search)
    if request.latin:
        d["sort"] = song.meta["title"][(request.lc, "l")]
        if ord(d["sort"][0:1]) > 0x100:
            # Try again with kana-to-romaji, might help manufacture some sensible sort order
            d["sort"] = jaconv.kana2alphabet(jaconv.kata2hira(song.meta["title"][(request.lc, "l", "k")]))
    else:
        d["sort"] = song.meta["title"][(request.lc, "k")]
    return d
github hirofumi0810 / asr_preprocessing / csj / labels / target.py View on Github external
with open(kanji_map_file_path, 'w') as f:
            # Reserve index
            all_char_set.discard('N')
            all_char_set.discard('Z')
            all_char_set.discard('_')
            if model == 'attention':
                all_char_set.discard('<')
                all_char_set.discard('>')

            kanji_set = set([])
            for char in all_char_set:
                if (not is_hiragana(char)) and (not is_katakana(char)):
                    kanji_set.add(char)
            for kana in kana_list:
                kanji_set.add(kana)
                kanji_set.add(jaconv.kata2hira(kana))

            if model == 'ctc':
                kanji_list = ['_', 'NZ'] + sorted(list(kanji_set))
            elif model == 'attention':
                kanji_list = ['_', '<', '>', 'NZ'] + sorted(list(kanji_set))
            for i, kanji in enumerate(kanji_list):
                f.write('%s  %s\n' % (kanji, str(i)))

        # kana
        with open(kana_map_file_path, 'w') as f:
            if model == 'ctc':
                kana_list = ['_', 'NZ'] + kana_list
            elif model == 'attention':
                kana_list = ['_', '<', '>', 'NZ'] + kana_list
            for i, kana in enumerate(kana_list):
                f.write('%s  %s\n' % (kana, str(i)))
github hirofumi0810 / asr_preprocessing / csj / labels / attention / character.py View on Github external
# kanji
        with open(kanji_map_file_path, 'w') as f:
            # インデックスを予約するラベル
            all_char_set.discard('N')
            all_char_set.discard('Z')
            all_char_set.discard('_')
            all_char_set.discard('<')
            all_char_set.discard('>')

            kanji_set = set([])
            for char in all_char_set:
                if (not is_hiragana(char)) and (not is_katakana(char)):
                    kanji_set.add(char)
            for kana in kana_list:
                kanji_set.add(kana)
                kanji_set.add(jaconv.kata2hira(kana))
            # NOTE: 頻出するラベルにはなるべく小さいインデックスを与える
            kanji_list = ['_', '<', '>', 'NZ'] + sorted(list(kanji_set))
            for index, kanji in enumerate(kanji_list):
                f.write('%s  %s\n' % (kanji, str(index)))

        # kana
        with open(kana_map_file_path, 'w') as f:
            kana_list = ['_', '<', '>', 'NZ'] + kana_list
            for index, kana in enumerate(kana_list):
                f.write('%s  %s\n' % (kana, str(index)))

        # phone
        with open(phone_map_file_path, 'w') as f:
            phone_list = ['_',  '<', '>', 'NZ'] + sorted(list(phone_set))
            for index, phone in enumerate(phone_list):
                f.write('%s  %s\n' % (phone, str(index)))
github patarapolw / WaniKaniTools / example / multiple_readings.py View on Github external
import jaconv
from WaniKaniTools.api import APIv2


if __name__ == '__main__':
    # Fetch vocabulary subjects from the WaniKani v2 API.
    api_v2 = APIv2()
    result = api_v2.GET('subjects', params={'types':'vocabulary'})

    # print(json.dumps(result, indent=4))
    # NOTE(review): `result` is never reassigned inside this loop, so the same
    # page of data is printed forever — presumably a next-page fetch is
    # missing here; verify against the APIv2 client's pagination behavior.
    while True:
        for data in result['data']:
            # Collect the distinct readings, folding katakana to hiragana so
            # that kana variants of the same reading compare equal.
            reading_array = set()
            for reading in data["data"]["readings"]:
                # if reading['primary']:
                    reading_array.add(jaconv.kata2hira(reading['reading']))

            # Only report vocabulary items with more than one distinct reading.
            if len(reading_array) == 1:
                continue

            # Gather the primary meanings only.
            meaning_array = []
            for meaning in data["data"]["meanings"]:
                if meaning['primary']:
                    meaning_array += [meaning['meaning']]

            # Tab-separated output: characters, readings, primary meanings.
            # Falls back to the "character" key when "characters" is absent.
            to_print = (
                data["data"]["characters"] if "characters" in data["data"] else data["data"]["character"],
                ', '.join(reading_array),
                ', '.join(meaning_array)
            )
            print('\t'.join(to_print))
github yoriyuki / nksnd / nksnd / utils / words.py View on Github external
def katakana(string):
    """Return *string* with all hiragana converted to katakana."""
    converted = jaconv.hira2kata(string)
    return converted
github marcan / blitzloop / blitzloop / web.py View on Github external
def normalize(s):
    """Normalize *s* for searching.

    Widens half-width characters to full-width, converts hiragana to
    katakana, then maps characters through the module-level
    ``normalize_tbl`` translation table.
    """
    widened = jaconv.h2z(s)
    as_katakana = jaconv.hira2kata(widened)
    return as_katakana.translate(normalize_tbl)
github r9y9 / deepvoice3_pytorch / deepvoice3_pytorch / frontend / jp / __init__.py View on Github external
def text_to_sequence(text, p=0.0):
    """Convert Japanese *text* into a codepoint sequence ending with ``_eos``.

    Strips bracket and space characters, normalizes punctuation and text
    via jaconv, forces katakana, and — with probability controlled by
    *p* — mixes in pronunciations before encoding each character as its
    Unicode codepoint.
    """
    # Characters removed outright (includes both ASCII and full-width forms).
    strip_chars = (" ", " ", "「", "」", "『", "』", "・", "【", "】",
                   "(", ")", "(", ")")
    for ch in strip_chars:
        text = text.replace(ch, "")
    # Fold full-width exclamation/question marks to ASCII.
    text = text.replace("!", "!").replace("?", "?")

    text = normalize_delimitor(text)
    text = jaconv.normalize(text)
    if p > 0:
        text = mix_pronunciation(text, p)
    text = jaconv.hira2kata(text)
    text = add_punctuation(text)

    # Codepoint sequence terminated by the EOS symbol.
    sequence = [ord(ch) for ch in text]
    sequence.append(_eos)
    return sequence
github musyoku / chainer-speech-recognition / asr / data / processing.py View on Github external
logmel, delta, delta_delta = fft.compute_deltas(logmel)

			logmel = logmel.T
			delta = delta.T
			delta_delta = delta_delta.T

			if logmel.shape[1] > max_feature_length:
				max_feature_length = logmel.shape[1]
			if len(sentence) > max_sentence_length:
				max_sentence_length = len(sentence)

			if logmel.shape[1] == 0:
				continue

			audio_features.append((logmel, delta, delta_delta))
			sentence = jaconv.hira2kata(sentence) # 強制カタカナ変換
			sentences.append(sentence)

		assert max_feature_length > 0
		return audio_features, sentences, max_feature_length, max_sentence_length