How to use the torchaudio.load function in torchaudio

To help you get started, we’ve selected a few torchaudio.load examples based on popular ways it is used in public projects.
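Before the project examples below, here is a minimal, self-contained sketch of the call itself (the file path is a placeholder): torchaudio.load reads an audio file and returns the waveform as a tensor together with its sample rate.

import torchaudio

# "path/to/audio.wav" is a placeholder; use any audio file your backend supports.
waveform, sample_rate = torchaudio.load("path/to/audio.wav")
print(waveform.shape, sample_rate)  # waveform is a 2-D float tensor, sample_rate is an int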

github pytorch / audio / test / test_sox_effects.py
def test_reverse(self):
        x_orig, _ = torchaudio.load(self.test_filepath)
        E = torchaudio.sox_effects.SoxEffectsChain()
        E.set_input_file(self.test_filepath)
        E.append_effect_to_chain("reverse", "")
        x_rev, _ = E.sox_build_flow_effects()
        # check if effect worked
        rev_idx = torch.LongTensor(range(x_orig.size(1))[::-1])
        self.assertTrue(x_orig.allclose(x_rev[:, rev_idx], rtol=1e-5, atol=2e-5))
github pytorch / audio / test / test_functional.py
def test_batch_pitch(self):
        waveform, sample_rate = torchaudio.load(self.test_filepath)

        # Single then transform then batch
        expected = F.detect_pitch_frequency(waveform, sample_rate)
        expected = expected.unsqueeze(0).repeat(3, 1, 1)

        # Batch then transform
        waveform = waveform.unsqueeze(0).repeat(3, 1, 1)
        computed = F.detect_pitch_frequency(waveform, sample_rate)

        self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
        self.assertTrue(torch.allclose(computed, expected))
        _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate)
github pytorch / audio / torchaudio / datasets / yesno.py
def load_yesno_item(fileid, path, ext_audio):
    # Read label
    labels = [int(c) for c in fileid.split("_")]

    # Read wav
    file_audio = os.path.join(path, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)

    return waveform, sample_rate, labels
github pytorch / audio / torchaudio / datasets / vctk.py
def load_vctk_item(
    fileid, path, ext_audio, ext_txt, folder_audio, folder_txt, downsample=False
):
    speaker_id, utterance_id = fileid.split("_")

    # Read text
    file_txt = os.path.join(path, folder_txt, speaker_id, fileid + ext_txt)
    with open(file_txt) as file_text:
        utterance = file_text.readlines()[0]

    # Read wav
    file_audio = os.path.join(path, folder_audio, speaker_id, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)
    if downsample:
        # TODO Remove this parameter after deprecation
        F = torchaudio.functional
        T = torchaudio.transforms
        # rate
        sample = T.Resample(sample_rate, 16000, resampling_method='sinc_interpolation')
        waveform = sample(waveform)
        # dither
        waveform = F.dither(waveform, noise_shaping=True)

    return waveform, sample_rate, utterance, speaker_id, utterance_id
github mlperf / inference / others / cloud / speech_recognition / pytorch / dataset / data_loader.py
def load_audio(path, frame_start=0, frame_end=-1):
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels, average
    if frame_end > 0 or frame_start > 0:
        assert frame_start < frame_end, "slicing does not yet support inverting audio"
        if frame_end > sound.shape[0]:
            repeats = ceil((frame_end - sound.shape[0])/float(sound.shape[0]))
            appendage = sound
            for _ in range(int(repeats)):
                sound = np.concatenate((sound, appendage))
        sound = sound[frame_start:frame_end]
    return sound
github jinserk / pytorch-asr / asr / utils / dataloader.py
        transformer = Augment(resample=True, sample_rate=params.SAMPLE_RATE)
        wav_file = Path("/home/jbaik/src/enf/stt/test/conan1-8k.wav")
        audio = transformer(wav_file)

    # test Spectrogram
    if True:
        import matplotlib
        matplotlib.use('TkAgg')
        matplotlib.interactive(True)
        import matplotlib.pyplot as plt

        nperseg = int(params.SAMPLE_RATE * params.WINDOW_SIZE)
        noverlap = int(params.SAMPLE_RATE * (params.WINDOW_SIZE - params.WINDOW_SHIFT))

        wav_file = Path("../data/aspire/000/fe_03_00047-A-025005-025135.wav")
        audio, _ = torchaudio.load(wav_file)

        # pyplot specgram
        audio = torch.squeeze(audio)
        fig = plt.figure(0)
        plt.specgram(audio, Fs=params.SAMPLE_RATE, NFFT=params.NFFT, noverlap=noverlap, cmap='plasma')

        # implemented transformer - scipy stft
        transformer = Spectrogram(sample_rate=params.SAMPLE_RATE, window_stride=params.WINDOW_SHIFT,
                                  window_size=params.WINDOW_SIZE, nfft=params.NFFT)
        data, f, t = transformer(audio)
        print(data.shape)
        mag = data[0]
        fig = plt.figure(1)
        plt.pcolormesh(t, f, np.log10(np.expm1(data[0])), cmap='plasma')
        fig = plt.figure(2)
        plt.pcolormesh(t, f, data[1], cmap='plasma')
github faroit / python_audio_loading_benchmark / loaders.py
def load_torchaudio(fp):
    sig, rate = torchaudio.load(fp)
    return sig
github JusperLee / Conv-TasNet / Conv_TasNet_Pytorch / AudioReader.py
def read_wav(fname, return_rate=False):
    '''
        Read a wav file using torchaudio.
        input:
            fname: wav file path
            return_rate: whether to also return the sampling rate
        output:
            src: output tensor of size C x L, where
                 C is the number of channels and
                 L is the number of audio frames
            sr: sample rate (only returned when return_rate is True)
    '''
    src, sr = torchaudio.load(fname, channels_first=True)
    if return_rate:
        return src.squeeze(), sr
    else:
        return src.squeeze()
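A hypothetical call to the read_wav helper defined above might look like this (the file name is a placeholder):

# Hypothetical usage; "mixture.wav" is a placeholder file name.
src = read_wav("mixture.wav")                         # waveform tensor only
src, sr = read_wav("mixture.wav", return_rate=True)  # waveform tensor and sample rate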
github jinserk / pytorch-asr / asr / utils / dataset.py
offset=False, offset_range=(0, 40),
                              padding=False, num_padding=0)
        wav_file = "/d1/jbaik/ics-asr/temp/conan1-8k.wav"
        audio = transformer(wav_file)
    # test Spectrogram
    elif test == 2:
        import matplotlib
        matplotlib.use('TkAgg')
        matplotlib.interactive(True)
        import matplotlib.pyplot as plt

        nperseg = int(params.SAMPLE_RATE * params.WINDOW_SIZE)
        noverlap = int(params.SAMPLE_RATE * (params.WINDOW_SIZE - params.WINDOW_SHIFT))

        wav_file = Path("../data/aspire/000/fe_03_00047-A-025005-025135.wav")
        audio, _ = torchaudio.load(wav_file)

        # pyplot specgram
        audio = torch.squeeze(audio)
        fig = plt.figure(0)
        plt.specgram(audio, Fs=params.SAMPLE_RATE, NFFT=params.NFFT, noverlap=noverlap, cmap='plasma')

        # implemented transformer - scipy stft
        transformer = Spectrogram(sample_rate=params.SAMPLE_RATE, window_stride=params.WINDOW_SHIFT,
                                  window_size=params.WINDOW_SIZE, nfft=params.NFFT)
        data, f, t = transformer(audio)
        mag = data[0]
        fig = plt.figure(1)
        plt.pcolormesh(t, f, np.log10(np.expm1(data[0])), cmap='plasma')
        fig = plt.figure(2)
        plt.pcolormesh(t, f, data[1], cmap='plasma')
        #print(max(data[0].view(257*601)), min(data[0].view(257*601)))
github vsimkus / voice-conversion / datasets / vcc_world_preprocessor.py
def read_audio(fp, trim_silence=False):
    if trim_silence:
        E = torchaudio.sox_effects.SoxEffectsChain()
        E.set_input_file(fp)

        E.append_effect_to_chain("silence", [1, 100, 1])
        E.append_effect_to_chain("reverse")
        E.append_effect_to_chain("silence", [1, 100, 1])
        E.append_effect_to_chain("reverse")
    
        sig, sample_rate = E.sox_build_flow_effects()
    else:
        sig, sample_rate = torchaudio.load(fp)
    sig = sig.contiguous()
    return sig, sample_rate