def _test_istft_of_sine(self, amplitude, L, n):
    # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L
    x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype())
    sound = amplitude * torch.sin(2 * math.pi / L * x * n)
    # stft = torch.stft(sound, L, hop_length=L, win_length=L,
    #                   window=torch.ones(L), center=False, normalized=False)
    stft = torch.zeros((L // 2 + 1, 2, 2))
    stft_largest_val = (amplitude * L) / 2.0
    if n < stft.size(0):
        stft[n, :, 1] = -stft_largest_val
    if 0 <= L - n < stft.size(0):
        # symmetric about L // 2
        stft[L - n, :, 1] = stft_largest_val
    estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L,
                                           window=torch.ones(L), center=False, normalized=False)
    # use a looser tolerance here: the absolute error scales with the amplitude
    self._compare_estimate(sound, estimate, atol=1e-3)
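
# Why the hand-built `stft` above looks the way it does: the DFT of
# amplitude*sin(2*pi*n*x/L) over one period of length L is purely imaginary,
# with value -amplitude*L/2 at bin n. A quick standalone check with
# illustrative values (this sketch uses the modern torch.fft API rather than
# the real-valued torch.stft layout the test targets):
import math
import torch

amplitude, L, n = 1.0, 16, 3
x = torch.arange(L, dtype=torch.get_default_dtype())
frame = amplitude * torch.sin(2 * math.pi / L * x * n)
spectrum = torch.fft.rfft(frame)
print(spectrum[n])  # approximately -8j, i.e. -(amplitude * L / 2)j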
def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
    waveform = torch.arange(num_samples).float()
    output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

    # from NumFrames in feature-window.cc
    n = window_size
    if snip_edges:
        m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
    else:
        m = (num_samples + (window_shift // 2)) // window_shift

    self.assertTrue(output.dim() == 2)
    self.assertTrue(output.shape[0] == m and output.shape[1] == n)

    window = torch.empty((m, window_size))
    for r in range(m):
        extract_window(window, waveform, r, window_size, window_shift, snip_edges)
    self.assertTrue(torch.allclose(window, output))
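
# Worked check of the two frame-count formulas above (values are illustrative,
# not taken from the original tests): with 10 samples, a window of 4 and a
# shift of 2, snip_edges keeps only the frames that fit entirely inside the
# signal, while the non-snipped variant rounds to the nearest frame boundary.
num_samples, window_size, window_shift = 10, 4, 2
m_snip = 1 + (num_samples - window_size) // window_shift      # 4 frames
m_full = (num_samples + (window_shift // 2)) // window_shift  # 5 frames
print(m_snip, m_full)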
def test_resample_size(self):
    input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
    waveform, sample_rate = torchaudio.load(input_path)

    upsample_rate = sample_rate * 2
    downsample_rate = sample_rate // 2

    invalid_resample = torchaudio.transforms.Resample(sample_rate, upsample_rate, resampling_method='foo')
    self.assertRaises(ValueError, invalid_resample, waveform)

    upsample_resample = torchaudio.transforms.Resample(
        sample_rate, upsample_rate, resampling_method='sinc_interpolation')
    up_sampled = upsample_resample(waveform)
    # we expect the upsampled signal to have twice as many samples
    self.assertTrue(up_sampled.size(-1) == waveform.size(-1) * 2)

    downsample_resample = torchaudio.transforms.Resample(
        sample_rate, downsample_rate, resampling_method='sinc_interpolation')
    down_sampled = downsample_resample(waveform)
    # we expect the downsampled signal to have half as many samples
    self.assertTrue(down_sampled.size(-1) == waveform.size(-1) // 2)
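
# Standalone sketch of the size contract checked above, using a synthetic
# 1-second mono tone instead of the test asset so it runs without files
# ('sinc_interpolation' matches the torchaudio API version these tests
# target; later releases renamed the method):
import math
import torch
import torchaudio

sample_rate = 16000
t = torch.arange(sample_rate, dtype=torch.get_default_dtype()) / sample_rate
waveform = torch.sin(2 * math.pi * 440 * t).unsqueeze(0)  # (1, 16000)
resample = torchaudio.transforms.Resample(sample_rate, sample_rate * 2,
                                          resampling_method='sinc_interpolation')
print(resample(waveform).shape)  # torch.Size([1, 32000]): twice the samples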
def test_resample_waveform_multi_channel(self):
    num_channels = 3
    sound, sample_rate = torchaudio.load_wav(self.test_8000_filepath)  # (1, 8000)
    multi_sound = sound.repeat(num_channels, 1)  # (num_channels, 8000)
    for i in range(num_channels):
        multi_sound[i, :] *= (i + 1) * 1.5
    multi_sound_sampled = kaldi.resample_waveform(multi_sound, sample_rate, sample_rate // 2)

    # resampling a (channels, time) tensor must match resampling each channel separately
    for i in range(num_channels):
        single_channel = sound * (i + 1) * 1.5
        single_channel_sampled = kaldi.resample_waveform(single_channel, sample_rate, sample_rate // 2)
        self.assertTrue(torch.allclose(multi_sound_sampled[i, :], single_channel_sampled, rtol=1e-4))
def test_resample_waveform_downsample_size(self):
    sound, sample_rate = torchaudio.load_wav(self.test_8000_filepath)
    downsample_sound = kaldi.resample_waveform(sound, sample_rate, sample_rate // 2)
    self.assertTrue(downsample_sound.size(-1) == sound.size(-1) // 2)
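
# Sketch of the per-channel equivalence the multi-channel test verifies,
# with random data instead of the test asset (assumes the usual import
# `import torchaudio.compliance.kaldi as kaldi` used by these tests):
import torch
import torchaudio.compliance.kaldi as kaldi

stereo = torch.randn(2, 8000)
both = kaldi.resample_waveform(stereo, 8000, 4000)
left = kaldi.resample_waveform(stereo[0:1], 8000, 4000)
assert torch.allclose(both[0:1], left, rtol=1e-4)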
# Data-preparation excerpt: this block runs once per training file. The
# surrounding loop and the earlier definitions of `inputs`, `targets`, `x`,
# `f`, `seq_M`, `channels`, and `features_size` are not part of this excerpt.
inputs = inputs[:x[-1].astype(int)]
targets = targets[:x[-1].astype(int)]

# truncate to a whole number of sequences of length seq_M
inputs = inputs[:len(inputs) // seq_M * seq_M]
targets = targets[:len(targets) // seq_M * seq_M]
h = f(np.arange(1, len(inputs) + 1))
train_wav.append(inputs)
train_features.append(h)
train_targets.append(targets)

train_wav = np.concatenate(train_wav)
train_features = np.vstack(train_features)
train_targets = np.concatenate(train_targets)

enc = transforms.MuLawEncoding(channels)
dec = transforms.MuLawExpanding(channels)  # later torchaudio versions call this MuLawDecoding
train_wav = enc(train_wav)
train_targets = enc(train_targets)

scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

# reshape into (batch, seq_M) for the waveform/targets and
# (batch, features_size, seq_M) for the conditioning features
train_wav = train_wav.reshape(-1, seq_M)
train_features = np.rollaxis(train_features.reshape(-1, seq_M, features_size), 2, 1)
train_targets = train_targets.reshape(-1, seq_M)

train_wav = torch.from_numpy(train_wav).long()
train_features = torch.from_numpy(train_features).float()
train_targets = torch.from_numpy(train_targets).long()
print(train_features.shape, train_wav.shape, train_targets.shape)
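
# Round-trip sketch of the mu-law pair used above, with 256 quantization
# channels assumed for illustration (MuLawExpanding is the older name for
# what later torchaudio releases call MuLawDecoding):
import torch
import torchaudio.transforms as transforms

enc = transforms.MuLawEncoding(256)
dec = transforms.MuLawExpanding(256)
signal = torch.linspace(-1, 1, steps=16)
restored = dec(enc(signal))
print((signal - restored).abs().max())  # small quantization error only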
# transformer: convert int to one-hot vector
class Int2OneHot(object):
    def __init__(self, num_labels):
        self.num_labels = num_labels

    def __call__(self, targets):
        one_hots = list()
        for t in targets:
            one_hot = torch.LongTensor(self.num_labels).zero_()
            one_hot[t] = 1
            one_hots.append(one_hot)
        return one_hots
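
# Example usage of Int2OneHot (illustrative labels):
to_one_hot = Int2OneHot(4)
print(to_one_hot([0, 2]))  # [tensor([1, 0, 0, 0]), tensor([0, 0, 1, 0])]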
class BatchTransformer(torchaudio.transforms.Compose):
    def __init__(self,
                 resample=True, sample_rate=params.SAMPLE_RATE,
                 tempo=True, tempo_range=params.TEMPO_RANGE,
                 pitch=True, pitch_range=params.PITCH_RANGE,
                 noise=True, noise_range=params.NOISE_RANGE,
                 offset=True, offset_range=None,
                 padding=True, num_padding=None,
                 window_shift=params.WINDOW_SHIFT, window_size=params.WINDOW_SIZE, nfft=params.NFFT,
                 unit_frames=params.WIDTH, stride=2, split=False):
        if offset and offset_range is None:
            offset_range = (0, stride * WIN_SAMP_SHIFT)
        if padding and num_padding is None:
            pad = int(((params.WIDTH * stride) // 2 - 1) * WIN_SAMP_SHIFT)
            num_padding = (pad, pad)
        super().__init__([
            # ... (the list of composed transforms is truncated in the source excerpt)
def forward(self, waveform):
    r"""
    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time)

    Returns:
        torch.Tensor: Dimension (channel, freq, time), where channel
        is unchanged, freq is ``n_fft // 2 + 1`` where ``n_fft`` is the number of
        Fourier bins, and time is the number of window hops (n_frame).
    """
    return F.spectrogram(waveform, self.pad, self.window, self.n_fft, self.hop_length,
                         self.win_length, self.power, self.normalized)
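
# Minimal sketch of the shape contract in the docstring above, with
# illustrative sizes: n_fft=400 gives 400 // 2 + 1 = 201 frequency bins,
# and hop_length=200 over a 1 x 16000 waveform gives 81 window hops
# (with the default center padding):
import torch
import torchaudio

spec = torchaudio.transforms.Spectrogram(n_fft=400, hop_length=200)
out = spec(torch.randn(1, 16000))
print(out.shape)  # torch.Size([1, 201, 81])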
def torchaudio_info(path):
    import torchaudio
    # get length of file in samples (si.length counts samples across all channels)
    info = {}
    si, _ = torchaudio.info(str(path))
    info['samplerate'] = si.rate
    info['samples'] = si.length // si.channels
    info['duration'] = info['samples'] / si.rate
    return info
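
# Example call (hypothetical path):
meta = torchaudio_info('assets/sinewave.wav')
print(meta['samplerate'], meta['samples'], meta['duration'])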