How to use the librosa.feature.mfcc function in librosa

To help you get started, we’ve selected a few librosa.feature.mfcc examples based on popular ways the function is used in public projects.

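Before the project excerpts below, here is a minimal sketch of the call itself. The file name audio.wav and the choice of n_mfcc=20 are only illustrative; librosa.feature.mfcc takes an audio time series y and its sampling rate sr, and returns an array of shape (n_mfcc, n_frames).

import librosa
import numpy as np

# load the file as a mono time series; sr=None keeps the native sampling rate
y, sr = librosa.load('audio.wav', sr=None, mono=True)

# 20 MFCCs per frame; the result has shape (n_mfcc, n_frames)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)

# a common fixed-length summary: the mean of each coefficient over time
mfcc_vector = np.mean(mfcc, axis=1)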

github buriburisuri / speech-to-text-wavenet / preprocess.py
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists( target_filename ):
            continue
        # print info
        print("TEDLIUM corpus preprocessing (%d / %d) - '%s-%.2f]" % (i, len(wave_files), wave_file, offset))
        # load wave file
        if not os.path.exists( wave_file ):
            sph_file = wave_file.rsplit('.',1)[0]
            if os.path.exists( sph_file ):
                convert_sph( sph_file, wave_file )
            else:
                raise RuntimeError("Missing sph file from TedLium corpus at %s"%(sph_file))
        wave, sr = librosa.load(wave_file, mono=True, sr=None, offset=offset, duration=dur)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # filename

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
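
Note that this excerpt passes the waveform positionally. That works on the older librosa releases this project targets; in librosa 0.10 and later the audio argument is keyword-only, so the same call would be written as:

mfcc = librosa.feature.mfcc(y=wave, sr=16000)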
github anhnguyen9a7 / vietnamese-speech-to-text-wavenet / recognize_module.py
	decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

	# to dense tensor
	y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

	#
	# recognize wave file
	#

	# command line argument for input wave file path
	tf.sg_arg_def(file=(filepath, 'speech wave file to recognize.'))

	# load wave file
	wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)
	# get mfcc feature
	mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0), [0, 2, 1])

	# run network
	with tf.Session() as sess:

		# init variables
		tf.sg_init(sess)

		# restore parameters
		saver = tf.train.Saver()
		saver.restore(sess, tf.train.latest_checkpoint('asset/train'))
		# run session
		label = sess.run(y, feed_dict={x: mfcc})

		# print label
		# data.print_index(label)
github 1eedaegon / KYLius-method / x_ksh / keep / sound_analysis3.py
def data2array(file):
    dic = {}
    i=0
    for filename in file:
        y, sr = sf.read(path+filename, dtype='float32')
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        max_mfcc=np.max(mfcc, axis=1)
        mins, maxs=np.min(max_mfcc), np.max(max_mfcc)
        scaled_mfcc=(max_mfcc-mins)/(maxs-mins)
        dic[i] = scaled_mfcc
        i+=1
    array=np.array(list(dic.values()))
    return(array)
github micah5 / pyAudioClassification / pyaudioclassification / feat_extract.py
def extract_feature(file_name):
    """Generates feature input (mfccs, chroma, mel, contrast, tonnetz).
    -*- author: mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        X = X[:,0]
    X = X.T
    X = np.asfortranarray(X)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
github adzialocha / tomomibot / tomomibot / audio.py
def mfcc_features(y, sr, n_mels=128, n_mfcc=13):
    """Extract MFCCs (Mel-Frequency Cepstral Coefficients)"""
    # Analyze only first second
    y = y[0:sr]

    # Calculate MFCCs (Mel-Frequency Cepstral Coefficients)
    mel_spectrum = librosa.feature.melspectrogram(y,
                                                  sr=sr,
                                                  n_mels=n_mels)
    log_spectrum = librosa.amplitude_to_db(mel_spectrum,
                                           ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_spectrum,
                                sr=sr,
                                n_mfcc=n_mfcc)

    if mfcc.shape[-1] < DELTA_WIDTH:
        raise RuntimeError('MFCC vector does not contain enough time steps')

    if not mfcc.any():
        return np.zeros(n_mfcc * 3)

    # Standardize feature for equal variance
    delta_mfcc = librosa.feature.delta(mfcc, width=DELTA_WIDTH)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2, width=DELTA_WIDTH)
    feature_vector = np.concatenate((
        np.mean(mfcc, 1),
        np.mean(delta_mfcc, 1),
        np.mean(delta2_mfcc, 1)))
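
Unlike the other examples on this page, this one builds the log-mel spectrogram itself and passes it to librosa.feature.mfcc through the S= parameter, keeping explicit control over n_mels and the dB scaling. A rough one-line equivalent with the same parameters (not identical, since librosa's internal path uses power_to_db rather than amplitude_to_db with ref=np.max) would be:

mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_mels=n_mels)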
github amitchone / ASR / proof.py
def get_mfcc_vector(sample, framelength=0.025, frameoverlap=0.0125, n_mfccs=13):
    fs, data = read(sample)
    samplevector = list()

    for frame, samples in get_frames(framelength, frameoverlap, fs, data).items():
        samplevector.append(librosa.feature.mfcc(samples, sr=fs, n_mfcc=n_mfccs))

    return numpy.array(samplevector)
github jim-schwoebel / voicebook / chapter_4_modeling / train_audiotextclassify.py
def featurize(wavfile):
    #initialize features 
    hop_length = 512
    n_fft=2048
    #load file 
    y, sr = librosa.load(wavfile)
    #extract mfcc coefficients 
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc) 
    #extract mean, standard deviation, min, and max value in mfcc frame, do this across all mfccs
    # feature vector: [mean, std, min, max] of each of the 13 MFCCs, then of each of the 13 deltas
    mfcc_features = np.array([stat(coef)
                              for coefs in (mfcc, mfcc_delta)
                              for coef in coefs
                              for stat in (np.mean, np.std, np.amin, np.amax)])
github jordipons / elmarc / src / old_mfcc_GTZAN.py
pass it through the tensor-flow model to extract the *features_list*

    :param audio: String pointing where the audio is located
    :param sampling_rate: Sampling rate used when loading the audio (change it for down-sampling)

    :return features: Extracted features per *audio* song
    """
    if feature_type == 'MFCC':

        src_zeros = np.zeros(1024) # min length to have 3-frame mfcc's
        src, sr = librosa.load(audio, sr=sampling_rate, duration=29.) # max len: 29s, can be shorter.
        if len(src) < 1024:
            src_zeros[:len(src)] = src
            src = src_zeros
    
        mfcc = librosa.feature.mfcc(src, sampling_rate, n_mfcc=20)
        dmfcc = mfcc[:, 1:] - mfcc[:, :-1]
        ddmfcc = dmfcc[:, 1:] - dmfcc[:, :-1]
        return np.concatenate((np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
                               np.mean(dmfcc, axis=1), np.std(dmfcc, axis=1),
                               np.mean(ddmfcc, axis=1), np.std(ddmfcc, axis=1)), 
                               axis=0)
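
Here the deltas are plain frame-to-frame differences (mfcc[:, 1:] - mfcc[:, :-1]). The tomomibot and voicebook examples above instead use librosa.feature.delta, which returns a smoothed local estimate of the derivative (a Savitzky-Golay filter in current librosa):

dmfcc = librosa.feature.delta(mfcc)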
github 1eedaegon / KYLius-method / x_ksh / keep / sound_analysis5.py
@author: kimseunghyuck
"""

import librosa
import soundfile as sf
from matplotlib import pyplot as plt
import numpy as np
import os
path = '/Users/kimseunghyuck/desktop/audio_train/'
files=os.listdir(path)

#show one sample file
filename = files[0]
y, sr = sf.read(path+filename, dtype='float32')
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
#stft=librosa.core.stft(y=y)
#stft.shape  #1025, 161
mfcc.shape   #20, 161

#show second sample file
filename = files[1]
y, sr = sf.read(path+filename, dtype='float32')
#mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
stft=librosa.core.stft(y=y)
stft.shape  #1025, 109
#1025 is the number of frequency bins, 109 is the number of time frames
#maybe pad everything to the max length and then run a CNN.

#show graph
plt.figure(figsize=(15, 5))
plt.plot(mfcc)
github dhavalthakkar93 / speech_emotion_recognition / ensemble_emotion_simulation.py
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                              sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
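
As in the pyAudioClassification example above, the .T followed by axis=0 is simply a way of averaging each coefficient over time: with an MFCC matrix of shape (n_mfcc, n_frames), the two lines below produce the same 40-dimensional vector.

mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40), axis=1)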