print("to process ./recording/{key}.wav")
sys.exit(1)
key = sys.argv[1:][0]
syllables = data[key]
path = "./recording/" + key + ".wav"
file = Path(path)
if not file.is_file():
raise Exception(path + " doesn't exist")
sound_file = AudioSegment.from_wav(path)
audio_chunks = split_on_silence(sound_file,
# must be silent for at least 300ms
min_silence_len=300,
# consider it silent if quieter than -48 dBFS
silence_thresh=-48
)
flag = False
if len(syllables) * 5 != len(audio_chunks):
    flag = True
for i, chunk in enumerate(audio_chunks):
    syllable = syllables[i // 5]
    print(syllable)
    j = i % 5
    if j != 4:  # 1st, 2nd, 3rd, 4th tone
        out_file = "./pre/" + syllable + str(j + 1) + ".wav"
        chunk.export(out_file, format="wav")  # export call assumed; the snippet is cut off here
EXPORT_PATH = '/home/gswewf/data/五十音图'
time_start = "00:16"
time_end = "01:35"
song = AudioSegment.from_mp3(file)  # `file` (the source mp3 path) is defined above this excerpt
# Cut positions are in ms, so the "mm:ss" timestamps must be converted down to milliseconds.
start = (int(time_start.split(':')[0]) * 60 + int(time_start.split(':')[1])) * 1000
end = (int(time_end.split(':')[0]) * 60 + int(time_end.split(':')[1])) * 1000
# print(start, end)
word = song[start:end]
# silence_thresh here treats anything below -42 dBFS as silence, and the signal
# must stay below -42 dBFS for more than 700 ms; that is what cuts the clip into
# pieces. Getting these two values right is the key step. foobar2000's volume
# meter (View -> Visualizations -> peak meter) helps: it shows a clip's dBFS
# level. Normal speech sits at roughly -25 dBFS to -10 dBFS on a scale that runs
# from -96 dBFS to 0 dBFS, where closer to 0 means louder, so anything below
# -42 dBFS is treated as silence here. foobar can also be used to estimate the
# gap between words, roughly 900 ms (0.9 s); we split on a slightly smaller
# 700 ms (0.7 s) to be safe.
words = split_on_silence(word, min_silence_len=700, silence_thresh=-42)
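# A quick sanity check of the -42 dBFS threshold without foobar: pydub reports
# a clip's average loudness directly (an added aside, not original code).
print("clip loudness: {:.1f} dBFS".format(word.dBFS))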
# Next, generate a shuffled order, map the words onto it, and insert 1 s of blank silence between them.
silent = AudioSegment.silent(duration=1000)
print("共分割出{}个音".format(len(words)))
wushiyintu = ['あ', 'い', 'う', 'え', 'お',
'か', 'き', 'く', 'け', 'こ',
'さ', 'し', 'す', 'せ', 'そ',
'た', 'ち', 'つ', 'て', 'と',
'な', 'に', 'ぬ', 'ね', 'の',
'は', 'ひ', 'ふ', 'へ', 'ほ',
'ま', 'み', 'む', 'め', 'も',
'や', 'ゆ', 'よ',
'ら', 'り', 'る', 'れ', 'ろ',
'わ', 'を', 'ん']
x = AudioSegment.empty()
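# The shuffle-and-concatenate step described above is cut off in this excerpt.
# A minimal sketch of what it plausibly looks like; the pairing of `wushiyintu`
# with `words`, the use of random.shuffle, and the output filename are
# assumptions, not the original code:
import random

pairs = list(zip(wushiyintu, words))
random.shuffle(pairs)
for kana, clip in pairs:
    x = x + clip + silent  # 1 s of blank silence after every sound
x.export(EXPORT_PATH + '/shuffled.wav', format='wav')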
# (Fragment from inside a helper function: `x` is the input AudioSegment;
# `keep_silence`, `min_silence`, `n_window`, the two thresholds,
# `_is_transients` and `_export_segments` come from the enclosing scope.)
# Skip audio clips that are not longer than the padding.
# Padding refers to the silence that is kept for each segment.
padding = keep_silence * 2
if x.duration_seconds <= padding / 1000:
    return _export_segments([x])
# Determine the silence threshold based on whether the audio signal
# consists entirely of transients.
if _is_transients(x.get_array_of_samples(), x.frame_rate, n_window):
    threshold = transients_threshold
else:
    threshold = default_threshold
segments = silence.split_on_silence(
    audio_segment=x,
    min_silence_len=min_silence,
    silence_thresh=threshold,
    keep_silence=keep_silence,
)
# Export the original clip if no non-silent segments were found
if len(segments) == 0:
    return _export_segments([x])
# Discard segments that are too short. Note: `duration_seconds` is in seconds
# while the thresholds are in ms, so convert before comparing.
mean_time = np.mean([seg.duration_seconds * 1000 for seg in segments])
discard_threshold = 100 + padding
if mean_time > discard_threshold + 500:
    segments = [seg for seg in segments
                if seg.duration_seconds * 1000 > discard_threshold]
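# Worked example of the discard logic: with keep_silence=500 ms, padding is
# 1000 ms and discard_threshold is 1100 ms. The filter only kicks in when the
# mean segment length exceeds 1600 ms, and then drops every segment shorter
# than 1.1 s.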
def split_file(file_path: str,  # the `def` line is truncated in this excerpt; the name is assumed
               output_directory: str,
               min_silence_length: int,
               threshold: int,
               added_silence: int,
               file_index: int) -> None:
    """
    Splits an audio file into sub-segments based on silence detected by pydub.

    :param file_path: file path of the audio file to split
    :param output_directory: path to the directory in which to write output files
    :param min_silence_length: the minimum length (in ms) of silence that indicates a break
    :param threshold: the level below full scale (in dB) to consider silence; negated when passed to pydub
    :param added_silence: silence (in ms) to be added to the beginning and end of each split utterance
    :param file_index: the number of the file in the directory (recursive), used to mark each sub-utterance
    """
    audio = AudioSegment.from_file(file_path)  # AudioSegment(file_path) expects raw data, not a path
    segments = split_on_silence(audio_segment=audio,
                                min_silence_len=min_silence_length,
                                silence_thresh=-threshold)
    silence = AudioSegment.silent(duration=added_silence)
    for segment_index, segment in enumerate(segments):
        audio_segment = silence + segment + silence
        # match_target_amplitude is defined elsewhere in the same project
        normalised_segment = match_target_amplitude(audio_segment, -20)
        export_file_name = f"_file_{file_index}-part_{segment_index}.wav"
        print(f"Exporting {export_file_name}")
        normalised_segment.export(Path(output_directory, export_file_name), format="wav")
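# Example call (hedged: the function name above is assumed). Note the sign
# convention: `threshold` is passed as a positive number of dB and negated
# internally, so 32 here means "quieter than -32 dBFS counts as silence":
split_file("speech.wav", "out", min_silence_length=500,
           threshold=32, added_silence=300, file_index=0)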
def strip_silence(seg, silence_len=1000, silence_thresh=-16, padding=100):
    # Remove silent stretches from `seg`, keeping `padding` ms of silence
    # around each chunk and crossfading the chunks back together.
    if padding > silence_len:
        raise InvalidDuration("padding cannot be longer than silence_len")
    chunks = split_on_silence(seg, silence_len, silence_thresh, padding)
    crossfade = padding / 2
    if not len(chunks):
        # Nothing but silence: return an empty segment with matching metadata.
        return seg[0:0]
    seg = chunks[0]
    for chunk in chunks[1:]:
        seg = seg.append(chunk, crossfade=crossfade)
    return seg
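# This matches pydub's own strip_silence effect; pydub registers its effects
# as AudioSegment methods, so it can also be called directly on a segment
# (the filename and parameter values below are illustrative):
from pydub import AudioSegment

trimmed = AudioSegment.from_wav("speech.wav").strip_silence(
    silence_len=500, silence_thresh=-40, padding=100)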
# (The Flask route decorator is cut off in this excerpt.)
def transcribe(lang: str = 'en', model: str = 'tdnn'):
    """
    Transcribe audio
    """
    if request.method == "POST":
        try:
            f = request.files['file']
            filename = secure_filename(f.filename)
            wav_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            f.save(wav_filename)
            complete_audio = AudioSegment.from_file(wav_filename)
            chunks = split_on_silence(complete_audio, silence_thresh=-26,
                                      min_silence_len=500, keep_silence=500)
            # Fall back to the whole recording if nothing was split off
            chunks = chunks if len(chunks) > 0 else [complete_audio]
        except Exception:
            return jsonify(status='error', description="Unable to find 'file'")
        try:
            transcriptions = []
            for i, chunk in enumerate(chunks):
                # str.strip(".wav") strips characters, not the suffix, so use
                # splitext to drop the extension safely
                base_name = os.path.splitext(filename)[0]
                chunk_filename = os.path.join(app.config['UPLOAD_FOLDER'],
                                              base_name + "chunk" + str(i) + ".wav")
                chunk.export(chunk_filename, format="wav")
                config_obj = config.config[lang][model]
                config_obj["wav_filename"] = chunk_filename
                transcription = inference.inference(config_obj)
                transcriptions.append(transcription)
        except Exception:
            return jsonify(status='error', description="Wrong lang or model")