Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Per-chunk processing step: power spectrogram -> mel -> dB -> onset-strength
# envelope, then padding so that tempogram windows are centred at the stream
# edges.
# NOTE(review): this excerpt has lost its indentation and several tokens, and
# the function continues past the last visible line - confirm every detail
# against the original source before relying on it.
def compute(self, chunk, sampling_rate, corpus=None, utterance=None):
# Cleanup rest if it's the first frame
# NOTE(review): the assignment target in front of "= None" is missing here;
# the comment above suggests a stored "rest" buffer - confirm.
if chunk.offset == 0: = None
# Compute mel-spectrogram
# NOTE(review): the argument of stft_from_frames and the closing parentheses
# are missing on the next line.
power_spec = np.abs(spectral.stft_from_frames( ** 2
# Project the power spectrogram onto self.n_mels mel bands, then convert to dB.
mel = np.abs(librosa.feature.melspectrogram(S=power_spec, n_mels=self.n_mels, sr=sampling_rate))
mel_power = librosa.power_to_db(mel)
# Compute onset strengths
oenv = librosa.onset.onset_strength(S=mel_power, center=False)
# Remove context, otherwise we have duplicate frames while online processing
# (drops the first chunk.left_context entries of the envelope).
oenv = oenv[chunk.left_context:]
# NOTE(review): the name tested on the next line and concatenated with oenv
# below is missing; presumably the frames carried over from the previous
# chunk - confirm.
if is not None:
all_frames = np.concatenate([, oenv])
# NOTE(review): an "else:" presumably preceded the following branch; as shown,
# the padding assignment would always replace the concatenation above.
# Its the first chunk --> pad to center tempogram windows at the beginning
all_frames = np.pad(oenv, (self.win_length // 2, 0), mode='linear_ramp', end_values=0)
if chunk.is_last:
# Its the last chunk --> pad to center tempogram windows at end
all_frames = np.pad(all_frames, (0, self.win_length // 2), mode='linear_ramp', end_values=0)
# Fragment: loads one audio file, converts it to mono / resamples it when
# needed, computes a log-scaled mel-spectrogram and builds the output file
# names. The enclosing definition and the original indentation are not visible;
# infl, infl_new, sampling_freq, window_length, hop_length, num_mels and
# output_root come from outside this excerpt.
# Check that the (user-expanded) input path refers to an existing file.
if not os.path.isfile(os.path.expanduser(infl_new.strip())):
# NOTE(review): the message uses `infl` while the check uses `infl_new`;
# whatever followed the print (skip/continue?) is not visible.
print (infl + ' Not Found')
#Fetch the audio samples at the sampling rate.
# sr=None asks librosa for the file's native rate; it is compared with
# sampling_freq below.
# NOTE(review): load() is given the unexpanded path although the check above
# used expanduser - confirm this is intended.
y,sr = librosa.load(infl_new.strip(),sr=None)
# Collapse multi-dimensional (multi-channel) audio to a single channel.
if len(y.shape) > 1:
print ('Mono Conversion')
y = librosa.to_mono(y)
if sr != sampling_freq:
print ('Resampling {}'.format(sr))
# NOTE(review): positional sample-rate arguments; newer librosa releases
# require orig_sr= / target_sr= keywords - confirm the pinned version.
y = librosa.resample(y,sr,sampling_freq)
# NOTE(review): `y` is passed positionally; newer librosa releases require y=.
spec = librosa.feature.melspectrogram(y, sr=sampling_freq, n_fft=window_length, hop_length=hop_length, n_mels=num_mels)
#Log scaling
spec = librosa.power_to_db(spec,ref=1.0)
# Split the path on "/" to obtain the file name and its parent directory name.
infl_list = infl_new.strip().split("/")
file_name = infl_list[-1].strip()
out_dir = output_root + "/" + infl_list[-2]
# NOTE(review): the body of this check is missing (presumably it creates
# out_dir) - confirm.
if not os.path.exists(out_dir):
specfile = str(out_dir) + '/' + str(file_name) + '.orig.spec.npy'
# NOTE(review): the next line is garbled; it looks like the remains of a print
# of specfile followed by a save call taking (specfile, spec,
# allow_pickle=False) - confirm.
print (specfile), spec, allow_pickle=False)
# Build the relative output name, with or without the parent directory prefix.
# NOTE(review): an "else:" presumably separated the two assignments below.
if infl_list[-2].strip() == "":
output_mel_file = str(file_name) + '.orig.spec.npy'
output_mel_file = infl_list[-2].strip() + '/' + str(file_name) + '.orig.spec.npy'
print (output_mel_file)
# Builds log-scaled mel-spectrogram features from in-memory waveforms.
# NOTE(review): this excerpt has lost its indentation and some lines, and it is
# cut off inside the final loop.
def extract_features_from_waveforms(waveforms):
# NOTE(review): the next two lines are docstring text whose triple quotes were
# lost in extraction.
Extract log-scaled mel-spectrograms and their corresponding
deltas from the audio waveform (not the filename)
log_specgrams = []
for s in waveforms:
# shape_sound_clip is defined outside this excerpt.
sound_clip = shape_sound_clip(s)
# 120 mel bands from a 1024-point FFT.
# NOTE(review): the signal is passed positionally; newer librosa releases
# require y=.
melspec = librosa.feature.melspectrogram(sound_clip, n_mels = 120, n_fft=1024)
#print melspec.shape
# dB scale relative to the spectrogram's own maximum.
logspec = librosa.power_to_db(melspec, ref = np.max)
#print logspec.shape
# Transpose, flatten to 1-D, then reshape into a single row of shape (1, n).
logspec = logspec.T.flatten()[:, np.newaxis].T
#print logspec.shape
#print "Produce of two elements in melspec: ", melspec.shape[0]*melspec.shape[1]
# NOTE(review): no statement storing logspec in log_specgrams is visible before
# the temporaries are deleted - presumably lost in extraction.
del sound_clip
del melspec
del logspec
# Append an all-zero array of the same shape along axis 3.
# NOTE(review): axis=3 needs a 4-D log_specgrams; the step that would reshape
# it is not visible here.
features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
# NOTE(review): the body of this loop lies outside the excerpt.
for i in range(len(features)):
def preprocessing_imi(imi_path):
    """Load an imitation recording and return its normalized log-mel spectrogram.

    The audio is loaded at 16 kHz and forced to exactly four seconds: clips
    that are too short are zero-padded at the end, longer clips are truncated.
    The fixed-length signal is converted to a 39-band mel power spectrogram
    (0-5000 Hz, ``n_fft == hop_length == 133``), cropped to at most 482
    frames, expressed in dB relative to its own peak and finally passed
    through ``normalize_spectrogram``.

    Args:
        imi_path: Path of the audio file to load.

    Returns:
        Whatever ``normalize_spectrogram`` returns for a float32 array with a
        leading singleton axis, i.e. shape ``(1, 39, n_frames)``.
    """
    y, sr = librosa.load(imi_path, sr=16000)
    target_len = int(4 * sr)

    if y.shape[0] < target_len:
        # zero-padding
        pad = np.zeros(target_len - y.shape[0])
        y_fix = np.append(y, pad)
    else:
        # Without this branch the truncation would run unconditionally and
        # throw the padded signal away, leaving short clips short.
        y_fix = y[0:target_len]

    S = librosa.feature.melspectrogram(y=y_fix, sr=sr, n_fft=133,
                                       hop_length=133, power=2, n_mels=39,
                                       fmin=0.0, fmax=5000)
    S = S[:, :482]
    S_db = librosa.power_to_db(S, ref=np.max)

    # Wrap in a list to add the leading singleton axis before normalization.
    imi_spectrogram = np.array([S_db]).astype('float32')
    imi_spectrogram_norm = normalize_spectrogram(imi_spectrogram)

    return imi_spectrogram_norm
# Fragment of a plotting routine: computes mel / MFCC features and draws
# linear- and log-frequency spectrograms. y, sr, n_fft, hop_length, win_length,
# f_min, f_max, n_mels, n_mfcc and info_str_tex are defined outside this
# excerpt, as is the figure the first lines draw into.
plt.title('Harmonic and Percussive')
# Add file information.
plt.subplot(3, 1, 3)
plt.text(0.0, 1.0, info_str_tex, color='black', verticalalignment='top')
# Calculating MEL spectrogram and MFCC.
# Despite its name, db_pow holds linear power (squared STFT magnitude), not dB.
db_pow = np.abs(
librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)) ** 2
s_mel = librosa.feature.melspectrogram(S=db_pow, sr=sr, hop_length=hop_length,
fmax=f_max, fmin=f_min, n_mels=n_mels)
# Convert mel power to dB relative to its maximum; the MFCCs are derived from
# this log-mel spectrogram.
s_mel = librosa.power_to_db(s_mel, ref=np.max)
s_mfcc = librosa.feature.mfcc(S=s_mel, sr=sr, n_mfcc=n_mfcc)
# STFT (Short-time Fourier Transform)
plt.figure(figsize=(12, 10))
# Magnitude of an STFT computed with librosa's default parameters, in dB.
# NOTE(review): this STFT does not receive n_fft / hop_length, yet specshow
# below is told hop_length=hop_length - the time axis may be mis-scaled if
# hop_length differs from the library default; confirm.
db = librosa.amplitude_to_db(librosa.magphase(librosa.stft(y))[0], ref=np.max)
plt.subplot(3, 2, 1)
display.specshow(db, sr=sr, x_axis='time', y_axis='linear', hop_length=hop_length)
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency power spectrogram')
# Same data again, drawn with a logarithmic frequency axis.
plt.subplot(3, 2, 2)
display.specshow(db, sr=sr, x_axis='time', y_axis='log', hop_length=hop_length)
plt.colorbar(format='%+2.0f dB')
plt.title('Log-frequency power spectrogram')
# Fragment: cuts a reference signal into overlapping segments and converts each
# one into a normalized log-mel spectrogram. y_copy, sr, ref_file_names and
# normalize_spectrogram come from outside this excerpt; the original
# indentation is lost.
i = 0
segments = []
# Slide a window of 4*sr samples over y_copy in steps of 2*sr samples.
# NOTE(review): the statement that stores each slice in `segments` is not
# visible - presumably lost in extraction.
while ((i + (4*sr)) <= y_copy.shape[0]):
i = i + (2*sr)
ref_spectrograms_segments = []
for seg in segments:
# Mel power spectrogram with n_fft == hop_length == 1024; keep the first 128
# frames and convert to dB relative to the segment's maximum.
S = librosa.feature.melspectrogram(y=seg, sr=sr, n_fft=1024, hop_length=1024, power=2)
S = S[:, 0:128]
S_db = librosa.power_to_db(S, ref=np.max)
# Wrap in a list to add a leading singleton axis, then cast to float32.
ref_spec = [S_db]
ref_spec = np.array(ref_spec).astype('float32')
# print ref_spec.shape
ref_spec = normalize_spectrogram(ref_spec)
# NOTE(review): the list is reset here and no append of ref_spec is visible -
# either lines are missing or the results are discarded; confirm.
ref_spectrograms_segments = []
# print ref_file_names
# NOTE(review): `ref_sepctrograms` (sic) is not assigned anywhere in this
# excerpt; possibly a misspelling of another name - confirm.
return np.array(ref_file_names), ref_sepctrograms
# Fragment of a log-mel feature extraction method; the `def` line and the
# docstring delimiters are not visible.
# NOTE(review): the next three lines are docstring text (argument and return
# descriptions) whose surrounding quotes and section headers were lost.
x (np.ndarray): Input time-series signal.
sample_rate (number): Sampling rate of signal.
np.ndarray: The logmel feature vector.
# Resample to target sampling rate
# NOTE(review): positional sample-rate arguments; newer librosa releases
# require orig_sr= / target_sr= keywords - confirm the pinned version.
x = librosa.resample(x, sample_rate, self.sample_rate)
# Compute short-time Fourier transform
D = librosa.stft(x, n_fft=self.n_window, hop_length=self.hop_length)
# Transform to Mel frequency scale
# NOTE(review): the callee and first operand are missing on the next line;
# what remains combines something with the power spectrogram |D|**2 and
# transposes the result - confirm against the original.
S =, np.abs(D) ** 2).T
# Apply log nonlinearity and return as float32
# dB relative to the maximum, with top_db=None.
# NOTE(review): no explicit float32 cast is visible despite the comment above.
return librosa.power_to_db(S, ref=np.max, top_db=None)
# Fragment: min-max normalizes a log-power spectrogram, saves it as text and
# optionally plots it. log_power_spectra, sample_names, idx, specs_generated,
# output_dir, save_plots and plot_specgrams come from outside this excerpt.
_min = np.amin(log_power_spectra)
_max = np.amax(log_power_spectra)
# Rescale to the range [0, 1].
# NOTE(review): no guard for _max == _min is visible here; a constant
# spectrogram would divide by zero.
normalized_log_power_spectra = (log_power_spectra - _min) / (_max - _min)
# File stem combines the sample name with a 1-based running counter.
filename = f"ir_{sample_names[idx]}_{specs_generated+1}"
np.savetxt(os.path.join(output_dir, filename + ".txt"), normalized_log_power_spectra)
specs_generated += 1
if save_plots:
# NOTE(review): the body of this check is missing (presumably it creates the
# "spect_plots" directory) - confirm.
if not os.path.isdir("spect_plots"):
# 16000 is hard-coded; presumably the sampling rate expected by plot_specgrams.
plot_specgrams(log_power_spectra, normalized_log_power_spectra,
16000, filename + ".png", "spect_plots")
# Fragment: computes, normalizes, saves and optionally plots the spectrogram of
# one impulse response, then updates a progress line. audio, n_fft, n_hop,
# sample_names, idx, specs_generated, n_specs, output_dir and save_plots come
# from outside this excerpt.
S = librosa.stft(audio, n_fft=n_fft, hop_length=n_hop, center=True)
# Squared magnitude -> power, then dB.
power_spectra = np.abs(S)**2
log_power_spectra = librosa.power_to_db(power_spectra)
_min = np.amin(log_power_spectra)
_max = np.amax(log_power_spectra)
# NOTE(review): the guard only prints; as far as this excerpt shows, the
# division below still runs. `filename` is also read here before its
# assignment further down, so it would hold a stale value or be undefined -
# confirm.
if _min == _max:
print(f"divide by zero in {filename}")
# Rescale to the range [0, 1].
normalized_log_power_spectra = (log_power_spectra - _min) / (_max - _min)
filename = f"ir_{sample_names[idx]}_{specs_generated+1}"
np.savetxt(os.path.join(output_dir, filename + ".txt"), normalized_log_power_spectra)
specs_generated += 1
if save_plots:
# NOTE(review): the body of this check is missing (presumably it creates the
# "spect_plots" directory) - confirm.
if not os.path.isdir("spect_plots"):
# NOTE(review): only the normalized spectrogram is passed here; confirm the
# argument list against plot_specgrams' signature.
plot_specgrams(normalized_log_power_spectra, 16000, filename + ".png", "spect_plots")
# "\r" returns to the start of the line so the next write overwrites it.
sys.stdout.write(f"* Computed {specs_generated}/{n_specs} RIR spectrograms\r")
# Fragment: iterates over impulse responses (IRs), optionally augments each one,
# and writes a normalized log-power spectrogram per signal. IRs, augment_data,
# augment_audio, n_fft, n_hop, sample_names, output_dir, save_plots and
# plot_specgrams come from outside this excerpt, which is cut off at the end.
specs_generated = 0
n_specs = len(IRs)
for idx in range(len(IRs)):
# Reshape to 1-D of length shape[0]; this only works when all trailing
# dimensions have size 1.
audio = np.reshape(IRs[idx], (IRs[idx].shape[0],))
if augment_data:
augmented_audio = augment_audio(audio, 16000,
stretch_factors=[0.80, 0.90, 1.10, 1.20],
shift_factors=[-2, -1, 1, 2])
# NOTE(review): stretch_factors / shift_factors appear above only as keyword
# arguments; unless they are also defined as variables elsewhere, this line
# raises NameError - confirm.
n_specs = len(IRs) * (len(stretch_factors) + len(shift_factors))
for augment in augmented_audio:
S = librosa.stft(augment, n_fft=n_fft, hop_length=n_hop, center=True)
# Squared magnitude -> power, then dB.
power_spectra = np.abs(S)**2
log_power_spectra = librosa.power_to_db(power_spectra)
_min = np.amin(log_power_spectra)
_max = np.amax(log_power_spectra)
# Rescale to the range [0, 1].
# NOTE(review): no guard for _max == _min is visible in this branch.
normalized_log_power_spectra = (log_power_spectra - _min) / (_max - _min)
filename = f"ir_{sample_names[idx]}_{specs_generated+1}"
np.savetxt(os.path.join(output_dir, filename + ".txt"), normalized_log_power_spectra)
specs_generated += 1
if save_plots:
# NOTE(review): the body of this check is missing (presumably it creates the
# "spect_plots" directory) - confirm.
if not os.path.isdir("spect_plots"):
plot_specgrams(log_power_spectra, normalized_log_power_spectra,
16000, filename + ".png", "spect_plots")
# Spectrogram of the un-augmented signal; the rest of this path lies outside
# the excerpt.
S = librosa.stft(audio, n_fft=n_fft, hop_length=n_hop, center=True)
power_spectra = np.abs(S)**2
log_power_spectra = librosa.power_to_db(power_spectra)