How to use the librosa.feature.melspectrogram function in librosa

To help you get started, we've selected a few librosa.feature.melspectrogram examples based on popular ways the function is used in public projects.

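Before diving into the project snippets, here is a minimal, self-contained sketch of the basic call pattern. The audio path is a placeholder, and the parameter values are illustrative defaults rather than recommendations:

import librosa
import numpy as np

# Load a clip (path is a placeholder) at librosa's default 22050 Hz.
y, sr = librosa.load("example.wav", sr=22050)

# Mel-scaled power spectrogram, shape (n_mels, n_frames).
S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048,
                                   hop_length=512, n_mels=128)

# Convert power to decibels relative to the peak for a log-scaled view.
S_db = librosa.power_to_db(S, ref=np.max)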

github h2oai / driverlessai-recipes / transformers / speech / audio_MFCC_transformer.py
            # Trim leading/trailing silence
            if len(audio) > 0:
                audio, _ = librosa.effects.trim(audio)

            # Truncate if audio is longer than `samples`
            if len(audio) > samples:
                audio = audio[:samples]

            # Otherwise center-pad with blanks to reach `samples`
            else:
                padding = samples - len(audio)
                offset = padding // 2
                audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)

            # Get mel spectrogram of audio
            spectrogram = librosa.feature.melspectrogram(y=audio,
                                                         sr=sampling_rate,
                                                         n_mels=n_mels,
                                                         hop_length=hop_length,
                                                         n_fft=n_fft,
                                                         fmin=fmin,
                                                         fmax=fmax)
            # Convert to log scale (dB)
            spectrogram = librosa.power_to_db(spectrogram)

            # Get MFCCs and their second-order deltas
            mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=n_mfcc)
            delta2_mfcc = librosa.feature.delta(mfcc, order=2)

            # Stack spectrogram, MFCCs, and deltas, then flatten
            features = np.concatenate((spectrogram, mfcc, delta2_mfcc), axis=0)
            X = features.ravel()
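The pad-or-truncate logic above is a common way to force variable-length clips to a fixed sample count before batching. Distilled into a standalone helper (the default samples and padmode values here are assumptions; the original reads them from the transformer's configuration):

import numpy as np

def fix_length_centered(audio, samples=44100, padmode="constant"):
    # Truncate long clips; center-pad short ones to exactly `samples`.
    if len(audio) > samples:
        return audio[:samples]
    padding = samples - len(audio)
    offset = padding // 2
    return np.pad(audio, (offset, padding - offset), padmode)

librosa also provides librosa.util.fix_length for the simpler truncate-or-right-pad variant.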
github ganesh-srinivas / laughter / scripts / predict_convnet_laughterornot_10sec_model.py
def extract_features_from_waveforms(waveforms):
    """
    Extract log-scaled mel spectrograms from the audio
    waveforms (not the filenames).
    """
    log_specgrams = []
    for s in waveforms:
        sound_clip = shape_sound_clip(s)

        melspec = librosa.feature.melspectrogram(y=sound_clip, n_mels=120, n_fft=1024)

        # Log-scale relative to the peak, then flatten to a single row
        logspec = librosa.power_to_db(melspec, ref=np.max)
        logspec = logspec.T.flatten()[:, np.newaxis].T

        log_specgrams.append(logspec)
        del sound_clip
        del melspec
        del logspec

    # `bands` and `frames` are module-level constants in the original script
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
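The final reshape only works if every flattened spectrogram holds exactly bands * frames values, where bands and frames are globals defined elsewhere in the script. A hedged sketch of how they could relate to the melspectrogram call above (the 10 s clip length matches the script's name but is still an assumption, as is librosa's default hop of n_fft // 4):

bands = 120                               # one row per mel filter (n_mels above)
hop_length = 1024 // 4                    # librosa's default hop: n_fft // 4
clip_samples = 10 * 22050                 # assumed 10 s clips at 22050 Hz
frames = 1 + clip_samples // hop_length   # centered STFT frame count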
github BogiHsu / Voice-Conversion / preprocess / tacotron / utils.py
    # Trimming
    y, _ = librosa.effects.trim(y)

    # STFT. D: (1 + n_fft//2, T)
    D = librosa.stft(y=y,
                     n_fft=hp.n_fft,
                     hop_length=hp.hop_length,
                     win_length=hp.win_length)

    # Magnitude spectrogram: (1 + n_fft//2, T)
    magnitude = np.abs(D)

    # Power spectrogram: (1 + n_fft//2, T)
    power = magnitude**2

    # Mel spectrogram: (n_mels, T)
    S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)

    return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1 + n_fft//2)
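This snippet uses melspectrogram's second calling convention: instead of a waveform y=, it accepts a precomputed power spectrogram via S= and only applies the mel filterbank. Under matching parameters the two routes agree, as this small sketch with synthetic audio shows:

import numpy as np
import librosa

y = np.random.default_rng(0).standard_normal(22050).astype(np.float32)
sr = 22050

# Route 1: straight from the waveform.
mel_a = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)

# Route 2: from a precomputed power spectrogram.
power = np.abs(librosa.stft(y, n_fft=2048, hop_length=512)) ** 2
mel_b = librosa.feature.melspectrogram(S=power, sr=sr)

print(np.allclose(mel_a, mel_b))  # True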
github willfrey / audio / torchaudio / transforms.py
    def __call__(self, y):
        # Forward every attribute set in __init__ as a melspectrogram kwarg.
        return librosa.feature.melspectrogram(y=y, **self.__dict__)
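Only the __call__ is shown; the pattern relies on the constructor stashing melspectrogram keyword arguments on the instance. A hedged reconstruction of the idea (the class name and constructor are assumptions, not torchaudio's actual API):

import librosa

class MelSpectrogramTransform:
    """Callable transform wrapping librosa.feature.melspectrogram."""

    def __init__(self, **kwargs):
        # Store kwargs (sr, n_fft, hop_length, n_mels, ...) for forwarding.
        self.__dict__.update(kwargs)

    def __call__(self, y):
        return librosa.feature.melspectrogram(y=y, **self.__dict__)

# Usage: configure once, apply to many waveforms.
transform = MelSpectrogramTransform(sr=22050, n_mels=64, hop_length=256)
# mel = transform(waveform)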
github remyhuang / pop-music-highlighter / lib.py
def audio_read(f):
    y, sr = librosa.core.load(f, sr=22050)
    d = librosa.core.get_duration(y=y, sr=sr)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
    # Log-compress, transpose to (T, n_mels), and add a batch axis
    S = np.transpose(np.log(1 + 10000 * S))
    S = np.expand_dims(S, axis=0)
    return y, S, int(d)
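np.log(1 + 10000 * S) is a hand-rolled alternative to librosa.power_to_db: it log-compresses the dynamic range while staying non-negative and finite when S is zero. A side-by-side sketch (the 10000 scale factor is the original authors' choice):

import numpy as np
import librosa

y = np.random.default_rng(0).standard_normal(22050).astype(np.float32)
S = librosa.feature.melspectrogram(y=y, sr=22050)

compressed = np.log(1 + 10000 * S)       # always >= 0, no epsilon needed
db = librosa.power_to_db(S, ref=np.max)  # <= 0, floored 80 dB below the peak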
github barronalex / Tacotron / audio.py
    wave, _ = librosa.effects.trim(wave)

    # First pad the audio to the maximum length;
    # we ensure it is a multiple of 4r so it works with max frames
    assert math.ceil(maximum_audio_length / hop_length) % (4 * r) == 0
    if wave.shape[0] <= maximum_audio_length:
        wave = np.pad(wave,
                (0, maximum_audio_length - wave.shape[0]), 'constant', constant_values=0)
    else:
        return None, None

    # Pre-emphasis: boost high frequencies with y[t] - 0.97 * y[t-1]
    pre_emphasis = 0.97
    wave = np.append(wave[0], wave[1:] - pre_emphasis * wave[:-1])

    stft = librosa.stft(wave, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    # Apply the mel filterbank to the magnitude spectrogram
    mel = librosa.feature.melspectrogram(S=np.abs(stft), n_mels=80)

    stft = np.log(np.abs(stft) + 1e-8)
    mel = np.log(np.abs(mel) + 1e-8)

    stft = reshape_frames(stft)
    mel = reshape_frames(mel)

    return mel, stft
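The np.append line implements first-order pre-emphasis, a standard speech-processing step that boosts high frequencies before spectral analysis. An equivalent formulation via scipy's lfilter (coefficient 0.97 as above):

import numpy as np
from scipy.signal import lfilter

def pre_emphasize(wave, coef=0.97):
    # y[0] = x[0]; y[t] = x[t] - coef * x[t-1], matching the np.append idiom.
    return lfilter([1.0, -coef], [1.0], wave)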
github ciaua / clip2frame / Scripts / extract_feats.py
def extract_melspec(in_fp, sr, win_size, hop_size, n_mels):
    sig, sr = librosa.core.load(in_fp, sr=sr)
    feat = librosa.feature.melspectrogram(y=sig, sr=sr,
                                          n_fft=win_size,
                                          hop_length=hop_size,
                                          n_mels=n_mels).T
    # Log compression; the transpose above gives (frames, n_mels)
    feat = np.log(1 + 10000 * feat)
    return feat
github hansroh / aquests / aquests / lib / dnn / multimedia / audiofeatures.py
def generate(y, sample_rate=SAMPLE_RATE, use_mel=True, use_stft=False):
    feature_stack = []
    numfeat = 0

    # STFT ----------------------------------------------------
    stft = librosa.stft(y, n_fft=2048, win_length=1200, hop_length=256)
    if use_mel:
        # Mel spectrogram -----------------------------------------
        mel = librosa.feature.melspectrogram(S=np.abs(stft), n_mels=80)
        mel = np.log(np.abs(mel) + 1e-8)
        feature_stack.extend(_featuring(mel))
        numfeat += 80 * 9

    if use_stft:
        stft = np.log(np.abs(stft) + 1e-8)
        feature_stack.extend(_featuring(stft))
        numfeat += 1025 * 9

    # MFCC -----------------------------------
    # Computed directly from the waveform; librosa's S= argument
    # expects a log-power mel spectrogram, not a raw STFT.
    vec = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=20, n_fft=512, hop_length=256)
    feature_stack.extend(_featuring(vec))

    # chroma_cqt -----------------------------------
    cqt = librosa.feature.chroma_cqt(y=y, sr=sample_rate, n_chroma=12, hop_length=256)
    cqt = np.log(np.abs(cqt) + 1e-8)  # log-compress
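A caveat on the MFCC step: librosa's mfcc expects its S= argument to be a log-scaled mel spectrogram, not a raw STFT, which is why the version above computes MFCCs from the waveform instead. The conventional precomputed chain looks like this (parameters are illustrative):

import numpy as np
import librosa

y = np.random.default_rng(0).standard_normal(22050).astype(np.float32)

mel = librosa.feature.melspectrogram(y=y, sr=22050, n_mels=80)
log_mel = librosa.power_to_db(mel)                 # dB-scaled mel spectrogram
mfcc = librosa.feature.mfcc(S=log_mel, n_mfcc=20)  # DCT over the dB mel bands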
github igormq / asr-study / preprocess_timit.py
def preprocess_audio(audio_path):
    '''
    Returns features (time_steps, nb_features) and sequence length (scalar)
    '''
    y, sr = librosa.load(audio_path)
    # 25 ms analysis windows with a 10 ms hop
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=int(1e-2*sr), n_fft=int(25e-3*sr), n_mels=40)
    d = librosa.feature.delta(S)
    dd = librosa.feature.delta(S, order=2)
    # Log-energy of each stream (feature.rmse was renamed rms in librosa 0.7)
    S_e = np.log(librosa.feature.rms(S=S))
    d_e = np.log(librosa.feature.rms(S=d))
    dd_e = np.log(librosa.feature.rms(S=dd))
    return np.vstack((S, d, dd, S_e, d_e, dd_e)).T, S.shape[1]
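The hop_length and n_fft expressions encode the classic speech framing convention: a 25 ms analysis window advancing in 10 ms steps. With librosa.load's default 22050 Hz target rate the arithmetic works out as follows:

sr = 22050                   # librosa.load's default resampling rate
hop_length = int(1e-2 * sr)  # 220 samples -> 10 ms frame shift
n_fft = int(25e-3 * sr)      # 551 samples -> 25 ms analysis window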