How to use the pydub.silence.split_on_silence function in pydub

To help you get started, we’ve selected a few pydub examples, based on popular ways it is used in public projects.
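Before the project examples, here is a minimal, self-contained sketch of the call itself (the file name and parameter values are illustrative, not taken from any of the projects below):

from pydub import AudioSegment
from pydub.silence import split_on_silence

# load any format ffmpeg can decode; from_wav / from_mp3 also work
audio = AudioSegment.from_file("speech.wav")

chunks = split_on_silence(
    audio,
    min_silence_len=500,   # a pause must last at least 500 ms to count as a break
    silence_thresh=-40,    # anything quieter than -40 dBFS is treated as silence
    keep_silence=200,      # keep 200 ms of the surrounding silence on each chunk
)

for i, chunk in enumerate(chunks):
    chunk.export(f"chunk_{i}.wav", format="wav")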


github shibing624 / parrots / parrots / custom_syllables.py View on Github external
print("to process ./recording/{key}.wav")
    sys.exit(1)

key = sys.argv[1:][0]

syllables = data[key]

path = "./recording/" + key + ".wav"

file = Path(path)
if not file.is_file():
    raise Exception(path + " doesn't exist")

sound_file = AudioSegment.from_wav(path)

audio_chunks = split_on_silence(sound_file,
                                # must be silent for at least 300ms
                                min_silence_len=300,
                                # consider it silent if quieter than -48 dBFS
                                silence_thresh=-48
                                )

flag = False
if len(syllables) * 5 != len(audio_chunks):
    flag = True

for i, chunk in enumerate(audio_chunks):
    syllable = syllables[i // 5]
    print(syllable)
    j = i % 5
    if j != 4:  # 1st, 2nd, 3rd, 4th tone
        out_file = "./pre/" + syllable + str(j + 1) + ".wav"
github gswyhq / hello-world / speech_synthesis_语音合成 / 日语五十音图的听写.py View on Github external
EXPORT_PATH = '/home/gswewf/data/五十音图'
time_start = "00:16"
time_end = "01:35"

song = AudioSegment.from_mp3(file)
start = (int(time_start.split(':')[0])*60 + int(time_start.split(':')[1]))*1000
end = (int(time_end.split(':')[0])*60 + int(time_end.split(':')[1]))*1000
# print(start, end)
# The cut times are in ms, so the time strings have to be converted down to milliseconds.
word = song[start:end]

# Here silence_thresh means anything below -42 dBFS counts as silence, and the level has to stay below -42 dBFS for more than 700 ms; that is what splits the audio into pieces.
# Picking these two values is the crucial part. foobar2000's View -> Visualizations -> Volume meter helps here:
# it shows the dBFS level of a clip. Normal speech sits at roughly -25 dBFS to -10 dBFS; the scale runs from -96 dBFS up to 0 dBFS, and the closer to 0, the louder the sound.
# Here anything below -42 dBFS is treated as silence. foobar can also be used to estimate the gap between words, about 900 ms (0.9 s); we split on a slightly smaller value, 0.7 s.
words = split_on_silence(word, min_silence_len=700, silence_thresh=-42)

# Next, generate a shuffled sequence, map the syllables onto it, and insert 1 s of blank silence in between.
silent = AudioSegment.silent(duration=1000)

print("共分割出{}个音".format(len(words)))
wushiyintu = ['あ', 'い', 'う', 'え', 'お',
              'か', 'き', 'く', 'け', 'こ',
              'さ', 'し', 'す', 'せ', 'そ',
              'た', 'ち', 'つ', 'て', 'と',
              'な', 'に', 'ぬ', 'ね', 'の',
              'は', 'ひ', 'ふ', 'へ', 'ほ',
              'ま', 'み', 'む', 'め', 'も',
              'や', 'ゆ', 'よ',
              'ら', 'り', 'る', 'れ', 'ろ',
              'わ', 'を', 'ん']
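The comments above describe reading the level off foobar2000's volume meter; pydub can report the same figure directly, which makes picking silence_thresh easier (file name illustrative):

from pydub import AudioSegment

song = AudioSegment.from_mp3("lesson.mp3")
print(song.dBFS)      # average level of the clip in dBFS
print(song.max_dBFS)  # peak level; both values are <= 0, where 0 dBFS is full scale
# a silence_thresh somewhat below the average (e.g. song.dBFS - 16) is a common starting point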
github tqbl / dcase2018_task2 / task2 / silence.py View on Github external
    x = AudioSegment.empty()

    # Skip audio clips that are not longer than the padding
    # Padding refers to the silence that is kept for each segment
    padding = keep_silence * 2
    if x.duration_seconds <= padding / 1000:
        return _export_segments([x])

    # Determine silence threshold based on whether the audio signal
    # consists entirely of transients.
    if _is_transients(x.get_array_of_samples(), x.frame_rate, n_window):
        threshold = transients_threshold
    else:
        threshold = default_threshold

    segments = silence.split_on_silence(
        audio_segment=x,
        min_silence_len=min_silence,
        silence_thresh=threshold,
        keep_silence=keep_silence,
    )

    # Export the original clip if no non-silent segments were found
    if len(segments) == 0:
        return _export_segments([x])

    # Discard segments that are too short
    mean_time = np.mean([seg.duration_seconds for seg in segments])
    discard_threshold = 100 + padding
    if mean_time > discard_threshold + 500:
        segments = [seg for seg in segments
                    if seg.duration_seconds > discard_threshold]
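When only the cut points are needed rather than new AudioSegment objects, pydub.silence also offers detect_nonsilent, which returns [start_ms, end_ms] pairs. A short sketch (input file and threshold values are illustrative):

from pydub import AudioSegment
from pydub.silence import detect_nonsilent

clip = AudioSegment.from_file("clip.wav")
ranges = detect_nonsilent(clip, min_silence_len=500, silence_thresh=-48)
for start_ms, end_ms in ranges:
    print(f"non-silent from {start_ms} ms to {end_ms} ms")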
github CoEDL / elpis / elpis / engines / common / input / split_on_silence.py View on Github external
                                output_directory: str,
                                min_silence_length: int,
                                threshold: int,
                                added_silence: int,
                                file_index: int) -> None:
    """
    Splits an AudioSegment into sub-segments based on silence detected by pydub.
    :param file_path: file path of the audio file to split
    :param output_directory: path to directory in which to write output files
    :param min_silence_length: the minimum length (in ms) of silence that indicates a break
    :param threshold: the level below the norm (in dBFS) to consider silence
    :param added_silence: silence to be added to the beginning and end of each split utterance
    :param file_index: the number of the file in the directory (recursive) to mark each sub-utterance with.
    """
    audio = AudioSegment(file_path)
    segments = split_on_silence(audio_segment=audio,
                                min_silence_len=min_silence_length,
                                silence_thresh=-threshold)
    silence = AudioSegment.silent(duration=added_silence)
    for segment_index, segment in enumerate(segments):
        audio_segment = silence + segment + silence
        normalised_segment = match_target_amplitude(audio_segment, -20)
        export_file_name = f"_file_{file_index}-part_{segment_index}.wav"
        print(f"Exporting {export_file_name}")
        normalised_segment.export(Path(output_directory, export_file_name))
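match_target_amplitude is not part of the excerpt; a common implementation (an assumption about this helper, not necessarily what elpis uses) applies whatever gain brings the segment's average level to the target dBFS:

def match_target_amplitude(segment, target_dbfs):
    """Return a copy of segment whose average loudness is target_dbfs."""
    return segment.apply_gain(target_dbfs - segment.dBFS)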
github jiaaro / pydub / pydub / effects.py View on Github external
def strip_silence(seg, silence_len=1000, silence_thresh=-16, padding=100):
    if padding > silence_len:
        raise InvalidDuration("padding cannot be longer than silence_len")

    chunks = split_on_silence(seg, silence_len, silence_thresh, padding)
    crossfade = padding / 2

    if not len(chunks):
        return seg[0:0]

    seg = chunks[0]
    for chunk in chunks[1:]:
        seg = seg.append(chunk, crossfade=crossfade)

    return seg
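A short usage example of the function above (input file name illustrative):

from pydub import AudioSegment
from pydub.effects import strip_silence

seg = AudioSegment.from_file("speech.wav")
# drop pauses longer than 800 ms, cross-fading the remaining chunks over padding / 2 ms
tight = strip_silence(seg, silence_len=800, silence_thresh=-35, padding=100)
tight.export("speech_tight.wav", format="wav")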
github Vernacular-ai / kaldi-serve / kaldi_serve / app / server.py View on Github external
def transcribe(lang: str='en', model: str='tdnn'):
    """
    Transcribe audio
    """
    if request.method == "POST":
        try:
            f = request.files['file']
            filename = secure_filename(f.filename)
            wav_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            f.save(wav_filename)
            complete_audio = AudioSegment.from_file(wav_filename)
            chunks = split_on_silence(complete_audio, silence_thresh=-26, min_silence_len=500, keep_silence=500)
            chunks = chunks if len(chunks)>0 else [complete_audio]
        except:
            return jsonify(status='error', description="Unable to find 'file'")

        try:
            transcriptions = []
            for i, chunk in enumerate(chunks):
                chunk_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename.strip(".wav")+"chunk"+str(i)+".wav")
                chunk.export(chunk_filename, format="wav")
                config_obj = config.config[lang][model]
                config_obj["wav_filename"] = chunk_filename
                transcription = inference.inference(config_obj)
                transcriptions.append(transcription)
        except:
            return jsonify(status='error', description="Wrong lang or model")
github CoEDL / kaldi_helpers / kaldi_helpers / input_scripts / split_on_silence.py View on Github external
                                output_directory: str,
                                min_silence_length: int,
                                threshold: int,
                                added_silence: int,
                                file_index: int) -> None:
    """
    Splits an AudioSegment into sub-segments based on silence detected by pydub.
    :param file_path: file path of the audio file to split
    :param output_directory: path to directory in which to write output files
    :param min_silence_length: the minimum length (in ms) of silence that indicates a break
    :param threshold: the level below the norm (in dBFS) to consider silence
    :param added_silence: silence to be added to the beginning and end of each split utterance
    :param file_index: the number of the file in the directory (recursive) to mark each sub-utterance with.
    """
    audio = AudioSegment(file_path)
    segments = split_on_silence(audio_segment=audio,
                                min_silence_len=min_silence_length,
                                silence_thresh=-threshold)
    silence = AudioSegment.silent(duration=added_silence)
    for segment_index, segment in enumerate(segments):
        audio_segment = silence + segment + silence
        normalised_segment = match_target_amplitude(audio_segment, -20)
        export_file_name = f"_file_{file_index}-part_{segment_index}.wav"
        print(f"Exporting {export_file_name}")
        normalised_segment.export(Path(output_directory, export_file_name))