How to use librosa - 10 common examples

To help you get started, we’ve selected a few librosa examples based on popular ways the library is used in public projects.

github Hiroshiba / realtime-yukarin / test_scripts / test_voice_changer.py
start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.in_rate)
    wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
    start_time += audio_config.chunk / audio_config.in_rate
    print('cent', i, flush=True)

start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.out_rate)
    wave_out_list.append(wave_out)
    start_time += audio_config.chunk / audio_config.out_rate
    print('post', i, flush=True)

out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.out_rate)
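
Note that librosa.output.write_wav was deprecated in librosa 0.7 and removed in 0.8. On newer librosa versions, a common replacement is the soundfile package; a minimal sketch, assuming the same out_wave array and output path as above:

import soundfile as sf

# librosa >= 0.8 no longer ships librosa.output; soundfile.write is the
# usual substitute for writing a float32 waveform to a WAV file.
sf.write(str(test_output_path), out_wave, samplerate=audio_config.out_rate)
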
github r9y9 / wavenet_vocoder / tests / test_model.py
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0)

        # shape adjust
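
In librosa 0.7 the rmse feature was renamed to rms (rmse was removed in 0.8), and recent versions require the signal to be passed by keyword. A hedged rewrite of the power computation above, assuming librosa >= 0.7 and the same x array:

# Same (1 x N') frame-level RMS energy track, using the renamed API.
p = librosa.feature.rms(y=x, frame_length=256, hop_length=128)
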
github 4p0pt0Z / Audio_blind_source_separation / generate_audioset_segments.py
    mel_spectrograms = collections.deque()
    stft_magnitudes = collections.deque()
    stft_phases = collections.deque()
    segment_names = collections.deque()

    # Mel filterbank matrix for computing the mel spectrograms
    mel_filterbank = librosa.filters.mel(config["sampling_rate"],
                                         n_fft=STFT_frame_n_samples,
                                         n_mels=config["n_Mel_filters"],
                                         fmin=config["Mel_min_freq"],
                                         fmax=config["Mel_max_freq"])

    # Loop over all the 10-second-long audio files.
    for idx, (audio_file, label_file) in enumerate(zip(all_wavs_filenames, all_labels_filenames)):
        try:  # read file audio data and parse the label file. If this fails, continue to the next file
            audio, _ = librosa.core.load(audio_file, sr=config["sampling_rate"], mono=True)
            labels_segment = parse_label_file(label_file, classes)
            if len(labels_segment) != len(classes):
                raise ValueError(
                    'Length of labels_segment is ' + str(len(labels_segment)) + ' while there are only ' + str(
                        len(classes)) + ' classes.')
        except Exception as e:
            print(e)
            print(audio_file)
            continue

        # Split the audio into segments
        n_seg_in_audio = audio.shape[0] // segment_n_samples
        audio = audio[:n_seg_in_audio * segment_n_samples]
        segments = np.split(audio, n_seg_in_audio)

        # For all segments, add white noise if needed, compute audio features and store them in queues,
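
On librosa 0.10 and later, librosa.filters.mel takes its sample rate and FFT size as keyword-only arguments, so the filterbank construction above would need to be written roughly as follows (same config values as in the snippet):

mel_filterbank = librosa.filters.mel(sr=config["sampling_rate"],
                                     n_fft=STFT_frame_n_samples,
                                     n_mels=config["n_Mel_filters"],
                                     fmin=config["Mel_min_freq"],
                                     fmax=config["Mel_max_freq"])
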
github pkmital / time-domain-neural-audio-style-transfer / models / fourier.py
        output_fname,
        n_fft=4096,
        n_layers=1,
        n_filters=4096,
        hop_length=256,
        alpha=0.05,
        k_w=15,
        k_h=3,
        optimizer='bfgs',
        stride=1,
        iterations=300,
        sr=22050):

    frame_size = n_fft // 2

    audio, fs = librosa.load(content_fname, sr=sr)
    content = chop(audio, hop_size=hop_length, frame_size=frame_size)
    audio, fs = librosa.load(style_fname, sr=sr)
    style = chop(audio, hop_size=hop_length, frame_size=frame_size)

    n_frames = min(content.shape[0], style.shape[0])
    n_samples = min(content.shape[1], style.shape[1])
    content = content[:n_frames, :n_samples]
    style = style[:n_frames, :n_samples]

    content_features, style_gram, kernels, freqs = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_fft=n_fft,
        n_layers=n_layers,
        n_filters=n_filters,
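
The chop helper used here is project-local code, not part of librosa. If you only need the framing step, librosa.util.frame does something similar; a rough sketch (whether it matches chop's exact frame layout and padding is an assumption worth verifying):

# Frame the loaded audio into overlapping windows of frame_size samples,
# hop_length samples apart; transpose so each row is one frame.
frames = librosa.util.frame(audio, frame_length=frame_size,
                            hop_length=hop_length).T
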
github nanoleaf / aurora-sdk-linux / music_processor.py
        energy_output = energy.astype(np.uint16)
    else:
        energy_output = np.zeros(2).astype(np.uint16)

    # fft or mel
    if is_fft or is_mel:
        global sample_rate

        # down-sample by 4, with filtering, energy not scaled
        data_np = librosa.resample(data_np,
                                   sample_rate,
                                   sample_rate/4,
                                   res_type='kaiser_fast')

        # short time fft over n_fft samples
        fft_data = librosa.stft(data_np, n_fft,
                                hop_length=n_fft,
                                center=False)

        # calculate FFT or Mel
        if is_fft:
            fft_data_mag = np.abs(fft_data[0:n_fft // 2]) ** 2
            fft_data_mag *= 2**3
            fft_output = get_output_fft_bins(fft_data_mag, n_out_bins)
        else:
            fft_data_mag = np.abs(fft_data)**2
            fft_data_mag *= 2**2
            mel_data = librosa.feature.melspectrogram(S=fft_data_mag, sr=sample_rate / 4, n_mels=n_mel)
            fft_output = get_output_fft_bins(mel_data, n_out_bins)

        # output uint8_t
        fft_output = fft_output.astype(np.uint8)
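
As with the other examples, newer librosa releases (0.10+) take the resampling rates and FFT size as keyword arguments only; a hedged rewrite of the two calls above using the same variables:

# Down-sample by 4 and take a non-centred STFT with the current API.
data_np = librosa.resample(data_np, orig_sr=sample_rate,
                           target_sr=sample_rate / 4,
                           res_type='kaiser_fast')
fft_data = librosa.stft(data_np, n_fft=n_fft,
                        hop_length=n_fft, center=False)
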
github h2oai / driverlessai-recipes / transformers / speech / audio_MFCC_transformer.py
            #Trim silence
            if len(audio)> 0: 
                audio, _ = librosa.effects.trim(audio) 
            
            #Trim if audio length > samples 
            if len(audio) > samples: 
                audio = audio[0:0+samples]
                
            #Else pad blanks if shorter 
            else: 
                padding = samples - len(audio)
                offset = padding // 2
                audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)
   
            #Get Mel spectrogram of audio
            spectrogram = librosa.feature.melspectrogram(audio,
                                                 sr=sampling_rate,
                                                 n_mels=n_mels,
                                                 hop_length=hop_length,
                                                 n_fft=n_fft,
                                                 fmin=fmin,
                                                 fmax=fmax)
            #Convert to log scale (DB)
            spectrogram = librosa.power_to_db(spectrogram)
            
            #Get MFCC and second derivatives
            mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=n_mfcc)
            delta2_mfcc = librosa.feature.delta(mfcc, order=2)
            
            #Append MFCC to spectrogram and flatten
            features = np.concatenate((spectrogram,mfcc,delta2_mfcc),axis=0)
            X = features.ravel()
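
librosa 0.10+ also no longer accepts the audio signal positionally in librosa.feature.melspectrogram; the spectrogram call above becomes (same variables as in the snippet):

spectrogram = librosa.feature.melspectrogram(y=audio, sr=sampling_rate,
                                             n_mels=n_mels,
                                             hop_length=hop_length,
                                             n_fft=n_fft,
                                             fmin=fmin, fmax=fmax)

The power_to_db, mfcc and delta steps that follow are unchanged.
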
github thuhcsi / IJCAI2019-DRL4SER / emotion_inferring / dataset / audio.py
def _build_mel_basis(hparams):
  assert hparams.fmax <= hparams.sample_rate // 2
  return librosa.filters.mel(hparams.sample_rate,
                             hparams.n_fft,
                             n_mels=hparams.num_mels,
                             fmin=hparams.fmin,
                             fmax=hparams.fmax)
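
The returned basis has shape (num_mels, 1 + n_fft // 2), so it can be applied to an STFT power spectrogram with a matrix product. A minimal sketch, where wav is a placeholder audio array and hparams is assumed to be the same object as above:

mel_basis = _build_mel_basis(hparams)
power_spec = np.abs(librosa.stft(wav, n_fft=hparams.n_fft)) ** 2
mel_spec = np.dot(mel_basis, power_spec)  # (num_mels, n_frames)
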
github h2oai / driverlessai-recipes / transformers / speech / audio_MFCC_transformer.py
        hop_length = 347*duration
        fmin = 20 #min freq
        fmax = sampling_rate // 2 #max freq
        n_mels = 128  #number of mels
        n_fft = n_mels * 20 #fft window size
        padmode = 'constant'
        samples = sampling_rate * duration #number of samples
        n_mfcc = 13  #number of Mel FCC to use
        
        try:
            
            audio, sr = librosa.load(file_path, sr=sampling_rate)
            
            #Trim silence
            if len(audio)> 0: 
                audio, _ = librosa.effects.trim(audio) 
            
            #Trim if audio length > samples 
            if len(audio) > samples: 
                audio = audio[0:0+samples]
                
            #Else pad blanks if shorter 
            else: 
                padding = samples - len(audio)
                offset = padding // 2
                audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)
   
            #Get Mel spectrogram of audio
            spectrogram = librosa.feature.melspectrogram(audio,
                                                 sr=sampling_rate,
                                                 n_mels=n_mels,
                                                 hop_length=hop_length,
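
The manual crop/pad above centres the audio in a fixed-length buffer. If exact centring is not required, librosa.util.fix_length is a more compact option; a sketch assuming the same audio and samples variables (note it pads or truncates at the end rather than centring):

audio = librosa.util.fix_length(audio, size=samples)
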
github yweweler / single-speaker-tts / datasets / cmu_slt.py
def load_audio(file_path):
        # Window length in audio samples.
        win_len = ms_to_samples(model_params.win_len, model_params.sampling_rate)
        # Window hop in audio samples.
        hop_len = ms_to_samples(model_params.win_hop, model_params.sampling_rate)

        # Load the actual audio file.
        wav, sr = load_wav(file_path.decode())

        # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
        # Remove silence at the beginning and end of the wav so the network does not have to learn
        # some random initial silence delay after which it is allowed to speak.
        wav, _ = librosa.effects.trim(wav)

        # Calculate the linear scale spectrogram.
        # Note the spectrogram shape is transposed to be (T_spec, 1 + n_fft // 2) so dense layers
        # for example are applied to each frame automatically.
        linear_spec = linear_scale_spectrogram(wav, model_params.n_fft, hop_len, win_len).T

        # Calculate the Mel. scale spectrogram.
        # Note the spectrogram shape is transposed to be (T_spec, n_mels) so dense layers for
        # example are applied to each frame automatically.
        mel_spec = mel_scale_spectrogram(wav, model_params.n_fft, sr, model_params.n_mels,
                                         model_params.mel_fmin, model_params.mel_fmax, hop_len,
                                         win_len, 1).T

        # Convert the linear spectrogram into decibel representation.
        linear_mag = np.abs(linear_spec)
        linear_mag_db = magnitude_to_decibel(linear_mag)
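
linear_scale_spectrogram and magnitude_to_decibel here are project helpers rather than librosa functions. A rough librosa-only approximation of the linear-spectrogram branch, using the same window parameters (the helpers may differ in details such as the dB reference, so treat this as a sketch):

linear_spec = librosa.stft(wav, n_fft=model_params.n_fft,
                           hop_length=hop_len, win_length=win_len).T
linear_mag_db = librosa.amplitude_to_db(np.abs(linear_spec))
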
github ynop / audiomate / audiomate / processing / pipeline / rhythm.py
def compute(self, chunk, sampling_rate, corpus=None, utterance=None):
        # Cleanup rest if it's the first frame
        if chunk.offset == 0:
            self.rest = None

        # Compute mel-spectrogram
        power_spec = np.abs(spectral.stft_from_frames(chunk.data.T)) ** 2
        mel = np.abs(librosa.feature.melspectrogram(S=power_spec, n_mels=self.n_mels, sr=sampling_rate))
        mel_power = librosa.power_to_db(mel)

        # Compute onset strengths
        oenv = librosa.onset.onset_strength(S=mel_power, center=False)

        # Remove context, otherwise we have duplicate frames during online processing
        oenv = oenv[chunk.left_context:]

        if self.rest is not None:
            all_frames = np.concatenate([self.rest, oenv])
        else:
            # It's the first chunk --> pad to center tempogram windows at the beginning
            all_frames = np.pad(oenv, (self.win_length // 2, 0), mode='linear_ramp', end_values=0)

        if chunk.is_last:
            # It's the last chunk --> pad to center tempogram windows at the end
            all_frames = np.pad(all_frames, (0, self.win_length // 2), mode='linear_ramp', end_values=0)
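
The linear_ramp padding mirrors what librosa.feature.tempogram would otherwise do internally when center=True. A hedged sketch of how the padded onset envelope might then be consumed, assuming self.win_length matches the tempogram window used downstream:

tempogram = librosa.feature.tempogram(onset_envelope=all_frames,
                                      sr=sampling_rate,
                                      win_length=self.win_length,
                                      center=False)
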