How to use torchaudio - 9 common examples

To help you get started, we've selected a few torchaudio examples based on popular ways it is used in public projects.

github pytorch/audio · test/test_functional.py (view on GitHub)
def _test_istft_of_sine(self, amplitude, L, n):
        # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L
        x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype())
        sound = amplitude * torch.sin(2 * math.pi / L * x * n)
        # stft = torch.stft(sound, L, hop_length=L, win_length=L,
        #                   window=torch.ones(L), center=False, normalized=False)
        # hand-built onesided stft: (L // 2 + 1 freq bins, 2 frames, real/imag)
        stft = torch.zeros((L // 2 + 1, 2, 2))
        stft_largest_val = (amplitude * L) / 2.0
        if n < stft.size(0):
            stft[n, :, 1] = -stft_largest_val

        if 0 <= L - n < stft.size(0):
            # symmetric about L // 2
            stft[L - n, :, 1] = stft_largest_val

        estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L,
                                               window=torch.ones(L), center=False, normalized=False)
        # the amplitude scaling introduces a larger numerical error, hence the looser tolerance
        self._compare_estimate(sound, estimate, atol=1e-3)
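
A note on this example: torchaudio.functional.istft was later upstreamed into PyTorch itself as torch.istft. A minimal round-trip sketch of the same idea against a modern PyTorch (the values here are illustrative):

import math
import torch

L = 16
x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype())
sound = torch.sin(2 * math.pi / L * x * 3)  # 3 cycles per L-sample window

# forward transform; modern torch.stft requires return_complex=True
spec = torch.stft(sound, n_fft=L, hop_length=L, win_length=L,
                  window=torch.ones(L), center=False,
                  normalized=False, return_complex=True)

# inverse transform reconstructs the analyzed portion of the signal
estimate = torch.istft(spec, n_fft=L, hop_length=L, win_length=L,
                       window=torch.ones(L), center=False, normalized=False)

assert torch.allclose(sound[:estimate.numel()], estimate, atol=1e-4)
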
github pytorch/audio · test/test_compliance_kaldi.py (view on GitHub)
def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
        waveform = torch.arange(num_samples).float()
        output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

        # from NumFrames in feature-window.cc
        n = window_size
        if snip_edges:
            m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
        else:
            m = (num_samples + (window_shift // 2)) // window_shift

        self.assertTrue(output.dim() == 2)
        self.assertTrue(output.shape[0] == m and output.shape[1] == n)

        window = torch.empty((m, window_size))

        for r in range(m):
            extract_window(window, waveform, r, window_size, window_shift, snip_edges)
        self.assertTrue(torch.allclose(window, output))
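
kaldi._get_strided is a private helper, but the snip_edges=True framing it implements can be approximated with Tensor.unfold. A rough sketch, not the library's actual implementation:

import torch

num_samples, window_size, window_shift = 100, 25, 10
waveform = torch.arange(num_samples).float()

# keep only windows that fit entirely inside the signal (snip_edges=True)
frames = waveform.unfold(0, window_size, window_shift)  # (m, window_size)

m = 1 + (num_samples - window_size) // window_shift
assert frames.shape == (m, window_size)
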
github pytorch/audio · test/test_transforms.py (view on GitHub)
def test_resample_size(self):
        input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
        waveform, sample_rate = torchaudio.load(input_path)

        upsample_rate = sample_rate * 2
        downsample_rate = sample_rate // 2
        invalid_resample = torchaudio.transforms.Resample(sample_rate, upsample_rate, resampling_method='foo')

        self.assertRaises(ValueError, invalid_resample, waveform)

        upsample_resample = torchaudio.transforms.Resample(
            sample_rate, upsample_rate, resampling_method='sinc_interpolation')
        up_sampled = upsample_resample(waveform)

        # we expect the upsampled signal to have twice as many samples
        self.assertTrue(up_sampled.size(-1) == waveform.size(-1) * 2)

        downsample_resample = torchaudio.transforms.Resample(
            sample_rate, downsample_rate, resampling_method='sinc_interpolation')
        down_sampled = downsample_resample(waveform)

        # we expect the downsampled signal to have half as many samples
        self.assertTrue(down_sampled.size(-1) == waveform.size(-1) // 2)
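
The same transform works on synthetic data, without the bundled sinewave.wav asset. A minimal sketch (note that recent torchaudio releases renamed 'sinc_interpolation' to 'sinc_interp_hann'):

import math
import torch
import torchaudio

sample_rate = 16000
t = torch.arange(sample_rate, dtype=torch.get_default_dtype()) / sample_rate
waveform = torch.sin(2 * math.pi * 440 * t).unsqueeze(0)  # (1, 16000), a 440 Hz tone

downsample = torchaudio.transforms.Resample(sample_rate, sample_rate // 2)
down_sampled = downsample(waveform)

# half the rate -> half as many samples
assert down_sampled.size(-1) == waveform.size(-1) // 2
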
github pytorch/audio · test/test_compliance_kaldi.py (view on GitHub)
def test_resample_waveform_multi_channel(self):
        num_channels = 3

        sound, sample_rate = torchaudio.load_wav(self.test_8000_filepath)  # (1, 8000)
        multi_sound = sound.repeat(num_channels, 1)  # (num_channels, 8000)

        for i in range(num_channels):
            multi_sound[i, :] *= (i + 1) * 1.5

        multi_sound_sampled = kaldi.resample_waveform(multi_sound, sample_rate, sample_rate // 2)

        # check that resampling is identical whether channels are processed separately or together as a (c, n) tensor
        for i in range(num_channels):
            single_channel = sound * (i + 1) * 1.5
            single_channel_sampled = kaldi.resample_waveform(single_channel, sample_rate, sample_rate // 2)
            self.assertTrue(torch.allclose(multi_sound_sampled[i, :], single_channel_sampled, rtol=1e-4))
github pytorch/audio · test/test_compliance_kaldi.py (view on GitHub)
def test_resample_waveform_downsample_size(self):
        sound, sample_rate = torchaudio.load_wav(self.test_8000_filepath)
        downsample_sound = kaldi.resample_waveform(sound, sample_rate, sample_rate // 2)
        self.assertTrue(downsample_sound.size(-1) == sound.size(-1) // 2)
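
In newer torchaudio releases, kaldi.resample_waveform has been superseded by torchaudio.functional.resample; an equivalent downsample-by-two looks roughly like this (a sketch, assuming torchaudio >= 0.9):

import math
import torch
import torchaudio.functional as F

sample_rate = 8000
t = torch.arange(sample_rate, dtype=torch.get_default_dtype()) / sample_rate
sound = torch.sin(2 * math.pi * 200 * t).unsqueeze(0)  # (1, 8000)

downsampled = F.resample(sound, orig_freq=sample_rate, new_freq=sample_rate // 2)
assert downsampled.size(-1) == sound.size(-1) // 2
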
github yoyololicon/pytorch_FFTNet · FFTNet_vocoder.py (view on GitHub)
        inputs = inputs[:x[-1].astype(int)]
        targets = targets[:x[-1].astype(int)]
        inputs = inputs[:len(inputs) // seq_M * seq_M]
        targets = targets[:len(targets) // seq_M * seq_M]

        h = f(np.arange(1, len(inputs) + 1))

        train_wav.append(inputs)
        train_features.append(h)
        train_targets.append(targets)

    train_wav = np.concatenate(train_wav)
    train_features = np.vstack(train_features)
    train_targets = np.concatenate(train_targets)

    enc = transforms.MuLawEncoding(channels)
    dec = transforms.MuLawExpanding(channels)

    train_wav = enc(train_wav)
    train_targets = enc(train_targets)

    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)

    train_wav = train_wav.reshape(-1, seq_M)
    train_features = np.rollaxis(train_features.reshape(-1, seq_M, features_size), 2, 1)
    train_targets = train_targets.reshape(-1, seq_M)

    train_wav = torch.from_numpy(train_wav).long()
    train_features = torch.from_numpy(train_features).float()
    train_targets = torch.from_numpy(train_targets).long()
    print(train_features.shape, train_wav.shape, train_targets.shape)
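
MuLawExpanding here is the inverse of MuLawEncoding; it was later renamed MuLawDecoding. A self-contained round trip on a float tensor in [-1, 1], sketched against current torchaudio:

import torch
import torchaudio

channels = 256
enc = torchaudio.transforms.MuLawEncoding(quantization_channels=channels)
dec = torchaudio.transforms.MuLawDecoding(quantization_channels=channels)

waveform = torch.linspace(-1.0, 1.0, steps=1000)
quantized = enc(waveform)   # integer classes in [0, channels - 1]
restored = dec(quantized)   # floats back in [-1, 1]

# 8-bit mu-law quantization error is small but nonzero
assert (waveform - restored).abs().max() < 0.05
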
github jinserk/pytorch-asr · asr/utils/dataset.py (view on GitHub)
# transformer: convert int to one-hot vector
class Int2OneHot(object):

    def __init__(self, num_labels):
        self.num_labels = num_labels

    def __call__(self, targets):
        one_hots = list()
        for t in targets:
            one_hot = torch.LongTensor(self.num_labels).zero_()
            one_hot[t] = 1
            one_hots.append(one_hot)
        return one_hots


class BatchTransformer(torchaudio.transforms.Compose):

    def __init__(self,
                 resample=True, sample_rate=params.SAMPLE_RATE,
                 tempo=True, tempo_range=params.TEMPO_RANGE,
                 pitch=True, pitch_range=params.PITCH_RANGE,
                 noise=True, noise_range=params.NOISE_RANGE,
                 offset=True, offset_range=None,
                 padding=True, num_padding=None,
                 window_shift=params.WINDOW_SHIFT, window_size=params.WINDOW_SIZE, nfft=params.NFFT,
                 unit_frames=params.WIDTH, stride=2, split=False):
        if offset and offset_range is None:
            offset_range = (0, stride * WIN_SAMP_SHIFT)
        if padding and num_padding is None:
            pad = int(((params.WIDTH * stride) // 2 - 1) * WIN_SAMP_SHIFT)
            num_padding = (pad, pad)
        super().__init__([
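
Int2OneHot predates torch.nn.functional.one_hot, which now performs the same conversion in a single call; a quick equivalence sketch:

import torch
import torch.nn.functional as F

num_labels = 5
targets = torch.tensor([0, 3, 1])

one_hots = F.one_hot(targets, num_classes=num_labels)  # (3, 5) LongTensor
assert one_hots[1, 3] == 1 and one_hots[1].sum() == 1
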
github pytorch/audio · torchaudio/transforms.py (view on GitHub)
def forward(self, waveform):
        r"""
        Args:
            waveform (torch.Tensor): Tensor of audio of dimension (channel, time)

        Returns:
            torch.Tensor: Dimension (channel, freq, time), where channel
            is unchanged, freq is ``n_fft // 2 + 1`` where ``n_fft`` is the number of
            Fourier bins, and time is the number of window hops (n_frame).
        """
        return F.spectrogram(waveform, self.pad, self.window, self.n_fft, self.hop_length,
                             self.win_length, self.power, self.normalized)
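
Calling the transform is straightforward; a minimal usage sketch with illustrative parameter values:

import torch
import torchaudio

n_fft = 400
spectrogram = torchaudio.transforms.Spectrogram(n_fft=n_fft, hop_length=200)

waveform = torch.randn(1, 16000)  # (channel, time)
spec = spectrogram(waveform)      # (channel, freq, time)

assert spec.size(1) == n_fft // 2 + 1  # freq bins
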
github sigsep/open-unmix-pytorch · utils.py (view on GitHub)
def torchaudio_info(path):
    import torchaudio
    # get length of file in samples
    info = {}
    si, _ = torchaudio.info(str(path))
    info['samplerate'] = si.rate
    info['samples'] = si.length // si.channels
    info['duration'] = info['samples'] / si.rate
    return info
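
This uses the old torchaudio.info signature, which returned a (signal_info, encoding_info) pair. Current releases return a single AudioMetaData object with per-channel frame counts, so the modern equivalent is roughly (a sketch, assuming torchaudio >= 0.8):

import torchaudio

def torchaudio_info(path):
    md = torchaudio.info(str(path))  # AudioMetaData
    return {
        'samplerate': md.sample_rate,
        'samples': md.num_frames,               # already per channel
        'duration': md.num_frames / md.sample_rate,
    }
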