How to use the librosa.effects.trim function in librosa

To help you get started, we’ve selected a few librosa.effects.trim examples, based on popular ways it is used in public projects.

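At its simplest, librosa.effects.trim takes a signal, strips leading and trailing frames that fall more than top_db decibels below the reference level (the signal's peak by default), and returns both the trimmed signal and the (start, end) sample interval it kept. A minimal sketch (the file path and threshold are placeholders):

import librosa

# Load a clip at its native sample rate; the path is a placeholder.
y, sr = librosa.load('speech.wav', sr=None)

# Frames quieter than 30 dB below the peak are treated as silence here.
y_trimmed, (start, end) = librosa.effects.trim(y, top_db=30)

print(f'kept samples {start}..{end}: {len(y_trimmed) / sr:.2f}s of {len(y) / sr:.2f}s')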

github r9y9 / wavenet_vocoder / tests / test_model.py
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0)

        # shape adjust
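Note that librosa.feature.rmse used in this snippet has since been renamed to librosa.feature.rms, and the old alias has been removed from newer librosa releases; on a current install the equivalent call would be a sketch like this (same frame parameters as above):

# Equivalent RMS energy on current librosa; the signal is passed as a keyword argument.
p = librosa.feature.rms(y=x, frame_length=256, hop_length=128)  # shape: (1, n_frames)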
github h2oai / driverlessai-recipes / transformers / speech / audio_MFCC_transformer.py
        hop_length = 347 * duration
        fmin = 20  # min freq
        fmax = sampling_rate // 2  # max freq
        n_mels = 128  # number of mels
        n_fft = n_mels * 20  # fft window size
        padmode = 'constant'
        samples = sampling_rate * duration  # number of samples
        n_mfcc = 13  # number of Mel FCC to use
        
        try:
            
            audio, sr = librosa.load(file_path, sr=sampling_rate)
            
            #Trim silence
            if len(audio)> 0: 
                audio, _ = librosa.effects.trim(audio) 
            
            #Trim if audio length > samples 
            if len(audio) > samples: 
                audio = audio[0:0+samples]
                
            #Else pad blanks if shorter 
            else: 
                padding = samples - len(audio)
                offset = padding // 2
                audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)
   
            # Get Mel spectrogram of audio
            spectrogram = librosa.feature.melspectrogram(audio,
                                                 sr=sampling_rate,
                                                 n_mels=n_mels,
                                                 hop_length=hop_length,
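The trim / crop / center-pad logic above is a common way to force clips to a fixed length before feature extraction; a self-contained sketch of the same pattern (function name and defaults here are illustrative, not part of the recipe) could look like:

import numpy as np
import librosa

def trim_and_fix_length(audio, target_len, top_db=60, pad_mode='constant'):
    # Strip leading/trailing silence first so padding surrounds real content.
    if len(audio) > 0:
        audio, _ = librosa.effects.trim(audio, top_db=top_db)
    if len(audio) >= target_len:
        # Crop from the start if the clip is longer than the target.
        return audio[:target_len]
    # Otherwise center-pad up to the target length.
    padding = target_len - len(audio)
    offset = padding // 2
    return np.pad(audio, (offset, padding - offset), mode=pad_mode)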
github yweweler / single-speaker-tts / datasets / cmu_slt.py
def load_audio(file_path):
        # Window length in audio samples.
        win_len = ms_to_samples(model_params.win_len, model_params.sampling_rate)
        # Window hop in audio samples.
        hop_len = ms_to_samples(model_params.win_hop, model_params.sampling_rate)

        # Load the actual audio file.
        wav, sr = load_wav(file_path.decode())

        # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
        # Remove silence at the beginning and end of the wav so the network does not have to learn
        # some random initial silence delay after which it is allowed to speak.
        wav, _ = librosa.effects.trim(wav)

        # Calculate the linear scale spectrogram.
        # Note the spectrogram shape is transposed to be (T_spec, 1 + n_fft // 2) so dense layers
        # for example are applied to each frame automatically.
        linear_spec = linear_scale_spectrogram(wav, model_params.n_fft, hop_len, win_len).T

        # Calculate the Mel. scale spectrogram.
        # Note the spectrogram shape is transposed to be (T_spec, n_mels) so dense layers for
        # example are applied to each frame automatically.
        mel_spec = mel_scale_spectrogram(wav, model_params.n_fft, sr, model_params.n_mels,
                                         model_params.mel_fmin, model_params.mel_fmax, hop_len,
                                         win_len, 1).T

        # Convert the linear spectrogram into decibel representation.
        linear_mag = np.abs(linear_spec)
        linear_mag_db = magnitude_to_decibel(linear_mag)
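The TODO above concerns the silence reference: librosa.effects.trim marks frames more than top_db decibels below the reference (by default the signal's peak) as silence, so one way to pick a threshold for a new corpus is to sweep top_db and check how much audio survives. A hypothetical sketch, reusing the wav loaded above:

# Hypothetical threshold sweep; lower top_db values trim more aggressively.
for top_db in (60, 40, 25):
    trimmed, (start, end) = librosa.effects.trim(wav, top_db=top_db)
    print(f'top_db={top_db}: kept samples {start}..{end} of {len(wav)}')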
github tsurumeso / vocal-remover / augment.py
    filelist = list(zip(X_list, y_list))
    for mix_path, inst_path in tqdm(filelist):
        basename_mix, _ = os.path.splitext(os.path.basename(mix_path))
        basename_inst, _ = os.path.splitext(os.path.basename(inst_path))
        outpath_mix = os.path.join(args.mixtures, basename_mix + suffix)
        outpath_inst = os.path.join(args.instruments, basename_inst + suffix)
        if os.path.exists(outpath_mix) and os.path.exists(outpath_inst):
            continue

        X, _ = librosa.load(
            mix_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
        y, _ = librosa.load(
            inst_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast')

        X, _ = librosa.effects.trim(X)
        y, _ = librosa.effects.trim(y)
        X, y = spec_utils.align_wave_head_and_tail(X, y, args.sr)

        v = X - y
        sf.write(input_i, y.T, args.sr)
        sf.write(input_v, v.T, args.sr)
        subprocess.call(cmd_i, stderr=subprocess.DEVNULL)
        subprocess.call(cmd_v, stderr=subprocess.DEVNULL)

        y, _ = librosa.load(
            output_i, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
        v, _ = librosa.load(
            output_v, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
        X = y + v

        spec = spec_utils.calc_spec(X, args.hop_length)
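One portability note: these librosa.load calls pass sr and mono positionally, which recent librosa releases no longer accept (arguments after the path have become keyword-only); an updated equivalent would be along these lines:

# Keyword-argument form of the same load call for newer librosa versions.
X, _ = librosa.load(
    mix_path, sr=args.sr, mono=False, dtype=np.float32, res_type='kaiser_fast')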
github andi611 / ZeroSpeech-TTS-without-T / convert.py
def spectrogram2wav(mag): # Generate wave file from spectrogram
	mag = mag.T # transpose
	mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db # de-normalize
	mag = np.power(10.0, mag * 0.05) # to amplitude
	wav = griffin_lim(mag) # wav reconstruction
	wav = signal.lfilter([1], [1, -hp.preemphasis], wav) # de-preemphasis
	wav, _ = librosa.effects.trim(wav) # trim
	return wav.astype(np.float32)
github thuhcsi / IJCAI2019-DRL4SER / emotion_inferring / dataset / audio.py
def trim_silence(wav, hparams):
  return librosa.effects.trim(wav,
                              top_db=hparams.trim_top_db,
                              frame_length=hparams.trim_fft_size,
                              hop_length=hparams.trim_hop_size)[0]
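Indexing the result with [0] keeps just the trimmed signal and discards the (start, end) sample interval that trim also returns; frame_length and hop_length here only set the resolution of the silence analysis. A small usage sketch with placeholder hyper-parameter values (not taken from the original project):

import librosa

# Placeholder hparams mirroring the fields this helper reads; values are illustrative.
class HParams:
    trim_top_db = 23
    trim_fft_size = 512
    trim_hop_size = 128

wav, sr = librosa.load('utterance.wav', sr=None)  # placeholder path
trimmed_wav = trim_silence(wav, HParams())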
github mindslab-ai / voicefilter / generator.py
def mix(hp, args, audio, num, s1_dvec, s1_target, s2, train):
    srate = hp.audio.sample_rate
    dir_ = os.path.join(args.out_dir, 'train' if train else 'test')

    d, _ = librosa.load(s1_dvec, sr=srate)
    w1, _ = librosa.load(s1_target, sr=srate)
    w2, _ = librosa.load(s2, sr=srate)
    assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
        'wav files must be mono, not stereo'

    d, _ = librosa.effects.trim(d, top_db=20)
    w1, _ = librosa.effects.trim(w1, top_db=20)
    w2, _ = librosa.effects.trim(w2, top_db=20)

    # if reference for d-vector is too short, discard it
    if d.shape[0] < 1.1 * hp.embedder.window * hp.audio.hop_length:
        return

    # The LibriSpeech dataset has many silent intervals, so let's vad-merge them.
    # The VoiceFilter paper didn't do this; to test SDR in the same way, don't vad-merge.
    if args.vad == 1:
        w1, w2 = vad_merge(w1), vad_merge(w2)

    # I think a random segment length would be better, but let's follow the paper first:
    # fit the audio to `hp.data.audio_len` seconds.
    # If the merged audio is shorter than `L`, discard it.
    L = int(srate * hp.data.audio_len)
    if w1.shape[0] < L or w2.shape[0] < L: