Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_custom_pinyin_dict_tone2():
    """Load a custom TONE2 reading for one character and check both lookups.

    The custom entry lists two readings; the assertions expect the first
    one to be returned by ``lazy_pinyin`` (TONE2 style) and by ``pinyin``
    (default style).
    """
    custom_readings = {ord('桔'): 'ce4,si4'}
    load_single_dict(custom_readings, style='tone2')

    tone2_result = lazy_pinyin('桔', style=TONE2)
    assert tone2_result == ['ce4']

    default_result = pinyin('桔')
    assert default_result == [['cè']]
def test_36():
    """Check that ``lazy_pinyin`` yields the expected syllable list for a sample sentence."""
    sentence = '两年前七斤喝醉了酒'
    expected_syllables = [
        'liang', 'nian', 'qian',
        'qi', 'jin', 'he',
        'zui', 'le', 'jiu',
    ]
    actual_syllables = lazy_pinyin(sentence)
    assert actual_syllables == expected_syllables
words = segment_utf8_pinyin2(text)
elif method == 'basic2':
words = kf.deform_dis_get_words(text)
elif method =='char_pinyin':
from pypinyin import lazy_pinyin as pinyin
#return pinyin(text)
return [x.strip() for x in pinyin(text)]
elif method =='char_pinyin2':
from pypinyin import lazy_pinyin as pinyin
return [''.join(pinyin(x)) for x in text]
elif method == 'char_then_pinyin':
# In [2]: pinyin('补肾微信xtx0329')
# Out[2]: ['bu', 'shen', 'wei', 'xin', 'xtx0329']
# so as 补,肾,微,信,x,t,x,0,3,2,9,<s>,bu,shen,wei,xin,
from pypinyin import lazy_pinyin as pinyin
return segment_char(text) + ['<s>'] + [x.strip() for x in pinyin(text)]
elif method == 'char_then_pinyin2':
from pypinyin import lazy_pinyin as pinyin
return segment_char(text) + ['<s>'] + [''.join(pinyin(x)).strip() for x in text if x.strip()]
elif method == 'word_char':
return [x for x in cut(text, cut_all=False)] + ['<s>'] + segment_char(text)
elif method == 'word_char_pinyin':
from pypinyin import lazy_pinyin as pinyin
return [x for x in cut(text, cut_all=False)] + ['<s>'] + segment_char(text) + ['<s>'] + [x.strip() for x in pinyin(text)]
elif method == 'word_char_pinyin2':
from pypinyin import lazy_pinyin as pinyin
return [x for x in cut(text, cut_all=False)] + ['<s>'] + segment_char(text) + ['<s>'] + [''.join(pinyin(x)).strip() for x in text if x.strip()]
elif method == 'tab':
words = text.strip().split('\t')
elif method == 'white_space':
words = text.strip().split()
else:
def synthesize(self, input_text='', output_wav_path=''):
"""
Synthesize .wav from text
input_text: the text to synthesize; it is converted to pinyin syllables
and one .wav file per syllable is looked up in self.syllables_dir
output_wav_path: the destination to save the synthesized file
(not referenced in the part of this method visible here)
"""
# position, in milliseconds, at which the next overlay is placed in `result`
delay = 0
increment = 355 # milliseconds
pause = 500 # pause for punctuation
# convert the text to a list of pinyin syllables in pypinyin's TONE3 style
syllables = lazy_pinyin(input_text, style=pypinyin.TONE3)
# initialize to be complete silence, each character takes up ~500ms
result = AudioSegment.silent(duration=500 * len(input_text))
for syllable in syllables:
# candidate sound file for this syllable: <syllables_dir>/<syllable>.wav
path = os.path.join(self.syllables_dir, syllable + ".wav")
sound_file = Path(path)
# overlay `pause` ms of silence for punctuation marks, then advance
# the write position by `increment` (not by `pause`)
if syllable in self.punctuation:
short_silence = AudioSegment.silent(duration=pause)
result = result.overlay(short_silence, position=delay)
delay += increment
continue
# skip sound file that doesn't exist (the write position is not advanced)
if not sound_file.is_file():
continue
# load the syllable's audio; NOTE(review): the rest of the loop body is
# not visible in this excerpt
segment = AudioSegment.from_wav(path)
def main():
    """Print the ``lazy_pinyin`` conversion of a few sample strings.

    The second sample is printed as a single joined string; the others are
    printed as the syllable lists returned by ``lazy_pinyin``.
    """
    china = lazy_pinyin('中国')
    print(china)

    watermelon = lazy_pinyin('大西瓜')
    print(''.join(watermelon))

    # Samples that mix Latin letters / digits with Chinese characters.
    for mixed_text in ('T恤', 'big西瓜373a'):
        print(lazy_pinyin(mixed_text))
def read_btxt2(infile, outfile):
    """Read an idiom file and write the matching pinyin dictionary as JSON.

    Version 2 of the reader. ``infile`` is parsed with ``json.load`` and
    iterated; every entry that is exactly four characters long after
    stripping surrounding whitespace is mapped to the syllable list that
    ``lazy_pinyin`` returns for it. The result looks like::

        {"钝口拙腮": ["dun", "kou", "zhuo", "sai"],
         "怜我怜卿": ["lian", "wo", "lian", "qing"]}

    The original author's notes record 44482 idioms for the earlier data
    set and 55222 for version 2.

    Args:
        infile: path of a UTF-8 JSON file whose iteration yields strings.
        outfile: path the mapping is written to as UTF-8 JSON, with
            non-ASCII characters kept unescaped.

    Side effects:
        Prints the number of idioms written.
    """
    with open(infile, 'r', encoding='utf8') as f:
        data = json.load(f)

    dict_data = {}
    for d in data:
        line = d.strip()
        if len(line) != 4:
            continue
        # Keep the first conversion for a duplicated idiom and avoid
        # recomputing the pinyin for every repeat.
        if line not in dict_data:
            dict_data[line] = lazy_pinyin(line)

    with open(outfile, 'w', encoding='utf8') as fo:
        json.dump(dict_data, fo, ensure_ascii=False)
    print('总共成语数:', len(dict_data))
_updatetime = _o["updatetime"]
_u = int(time.mktime(_updatetime.timetuple()))
if _u > _timestamp:
_rdata["type"] = YVOBJECT.AU
_rdata["uuid"] = _o["user_uuid"]
_rdata["email"] = _o["user_email"]
_rdata["desc"] = None
_rdata["fullname"] = _o["user_fullname"]
_fn = _o["user_fullname"]
if not isinstance(_fn, unicode):
_fn = _fn.encode("utf-8")
_rdata["pinyinname0"] = "".join(lazy_pinyin(_fn))
_rdata["pinyinname1"] = "".join(list(itertools.chain.from_iterable(pinyin(_fn, style=pypinyin.INITIALS))))
_rdata["icon"] = _o["user_icon"]
_rdata["timestamp"] = _u
_rdata["updated"] = True
else:
_rdata["updated"] = False
return
def to_pinyin(word):
    """Return the pinyin of ``word`` joined into a single string.

    Args:
        word: text to convert. A byte string is decoded as UTF-8 first.

    Returns:
        The syllables returned by ``lazy_pinyin(word)`` concatenated with
        no separator.
    """
    # ``bytes`` is an alias of ``str`` on Python 2 and a distinct type on
    # Python 3, so this check behaves the same on both. The ``unicode``
    # builtin used previously does not exist on Python 3 (NameError).
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    return ''.join(lazy_pinyin(word))
def candidates(word):
    """Bucket correction candidates for ``word`` by pinyin similarity.

    Candidates are produced by ``known(edit_distance_word(word, chars))``
    where ``chars`` is loaded from ``char_file_path`` via
    ``load_word_dict``. Each candidate's ``lazy_pinyin`` is compared with
    that of ``word``.

    Args:
        word: the (possibly misspelled) text to find candidates for.

    Returns:
        A tuple of three lists ``(first, second, third)``:
        ``first`` holds candidates whose whole pinyin equals that of
        ``word``; ``second`` holds candidates that only share the first
        syllable; ``third`` holds all remaining candidates.
    """
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []

    error_pinyin = lazy_pinyin(word)
    cn_char_set = load_word_dict(char_file_path)

    for candidate_word in known(edit_distance_word(word, cn_char_set)):
        candidate_pinyin = lazy_pinyin(candidate_word)
        # The first-syllable check compares one-element slices instead of
        # indexing with [0]: when either pinyin list is empty the slices
        # simply differ, so the candidate lands in the third bucket rather
        # than raising IndexError.
        if candidate_pinyin == error_pinyin:
            candidates_1_order.append(candidate_word)
        elif candidate_pinyin[:1] == error_pinyin[:1]:
            candidates_2_order.append(candidate_word)
        else:
            candidates_3_order.append(candidate_word)
    return candidates_1_order, candidates_2_order, candidates_3_order