How to use the torchaudio.compliance.kaldi.fbank function in torchaudio

To help you get started, we’ve selected a few torchaudio.compliance.kaldi.fbank examples based on popular ways it is used in public projects.
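Before the project examples, here is a minimal usage sketch. The file name "speech.wav" and the parameter values are placeholders, not taken from any of the projects below. torchaudio.compliance.kaldi.fbank expects a waveform tensor of shape (channels, time); to match Kaldi's own output, the samples are usually kept on the 16-bit integer scale, which is why several of the examples load audio with torchaudio.load_wav.

import torchaudio

# Minimal sketch: compute Kaldi-compatible log-mel filterbank features.
# "speech.wav" is a placeholder path; assume a 16 kHz mono file.
waveform, sample_rate = torchaudio.load("speech.wav")  # (channels, time), floats in [-1, 1]
waveform = waveform * (1 << 15)  # rescale to Kaldi's int16-style sample range

fbank_feats = torchaudio.compliance.kaldi.fbank(
    waveform,
    num_mel_bins=40,        # default is 23
    frame_length=25.0,      # milliseconds
    frame_shift=10.0,       # milliseconds
    sample_frequency=sample_rate,
)
print(fbank_feats.shape)    # (num_frames, num_mel_bins)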

github pytorch / audio / test / compliance / generate_fbank_data.py View on Github
        'round_to_power_of_two', 'snip_edges', 'subtract_mean', 'use_energy', 'use_log_fbank',
        'use_power', 'vtln_high', 'vtln_low', 'vtln_warp', 'window_type']
    fn_split = fn.split('-')
    assert len(fn_split) == len(arr), ('Len mismatch: %d and %d' % (len(fn_split), len(arr)))
    inputs = {arr[i]: utils.parse(fn_split[i]) for i in range(len(arr))}

    # print flags for C++
    s = ' '.join(['--' + arr[i].replace('_', '-') + '=' + fn_split[i] for i in range(len(arr))])
    logging.info(exe_path + ' --dither=0.0 --debug-mel=true ' + s + ' ' + scp_path + ' ' + out_fn)
    logging.info('')  # blank line between the C++ command and the python args
    # print args for python
    inputs['dither'] = 0.0
    logging.info(inputs)
    sound, sample_rate = torchaudio.load_wav(sound_path)
    kaldi_output_dict = {k: v for k, v in torchaudio.kaldi_io.read_mat_ark(out_fn)}
    res = torchaudio.compliance.kaldi.fbank(sound, **inputs)
    torch.set_printoptions(precision=10, sci_mode=False)
    logging.info(res)
    logging.info(kaldi_output_dict['my_id'])
github pytorch / audio / test / test_compliance_kaldi.py View on Github
        def get_output_fn(sound, args):
            output = kaldi.fbank(
                sound,
                blackman_coeff=args[1],
                dither=0.0,
                energy_floor=args[2],
                frame_length=args[3],
                frame_shift=args[4],
                high_freq=args[5],
                htk_compat=args[6],
                low_freq=args[7],
                num_mel_bins=args[8],
                preemphasis_coefficient=args[9],
                raw_energy=args[10],
                remove_dc_offset=args[11],
                round_to_power_of_two=args[12],
                snip_edges=args[13],
                subtract_mean=args[14],
github Alexander-H-Liu / End-to-end-ASR-Pytorch / src / audio.py View on Github
    def __init__(self, mode="fbank", num_mel_bins=40, **kwargs):
        super(ExtractAudioFeature, self).__init__()
        self.mode = mode
        self.extract_fn = torchaudio.compliance.kaldi.fbank if mode == "fbank" else torchaudio.compliance.kaldi.mfcc
        self.num_mel_bins = num_mel_bins
        self.kwargs = kwargs
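The constructor above only selects the extraction function; the forward pass is not shown. A hypothetical sketch of the same fbank/mfcc switch used outside the class (the file name and parameter values are assumptions, not the repository's code):

import torchaudio

# Hypothetical illustration of the mode switch shown above.
mode = "fbank"
extract_fn = (torchaudio.compliance.kaldi.fbank
              if mode == "fbank"
              else torchaudio.compliance.kaldi.mfcc)

waveform, sample_rate = torchaudio.load("utterance.wav")  # placeholder path, assumed mono
waveform = waveform * (1 << 15)                           # Kaldi-style sample scale

features = extract_fn(waveform, num_mel_bins=40, sample_frequency=sample_rate)
print(features.shape)  # (num_frames, 40) when mode == "fbank"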
github freewym / espresso / examples / speech_recognition / data / asr_dataset.py View on Github
    def __getitem__(self, index):
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi
        tgt_item = self.tgt[index] if self.tgt is not None else None

        path = self.aud_paths[index]
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))
        sound, sample_rate = torchaudio.load_wav(path)
        output = kaldi.fbank(
            sound,
            num_mel_bins=self.num_mel_bins,
            frame_length=self.frame_length,
            frame_shift=self.frame_shift
        )
        output_cmvn = data_utils.apply_mv_norm(output)

        return {"id": index, "data": [output_cmvn.detach(), tgt_item]}
github DigitalPhonetics / adviser / adviser / services / hci / speech / SpeechInputFeatureExtractor.py View on Github
    def speech_to_features(self, speech_in: Tuple[numpy.array, int]):
        """
        Turns a numpy array containing the utterance into features.

        Args:
            speech_in (Tuple[np.array, int]): The utterance as a numpy array and its sampling rate

        Returns:
            np.array: The extracted features of the utterance
        """
        sample_frequency = speech_in[1]
        speech_in = torch.from_numpy(speech_in[0]).unsqueeze(0)

        filter_bank = torchaudio.compliance.kaldi.fbank(speech_in, num_mel_bins=80, sample_frequency=sample_frequency)
        # The default ASR model uses 16 kHz; other models are possible, in which case only the recorder's sampling rate needs to be changed
        pitch = torch.zeros(filter_bank.shape[0], 3)  # TODO: check if torchaudio pitch function is better
        speech_in_features = torch.cat([filter_bank, pitch], 1).numpy()

        return {'speech_features': speech_in_features}
github DigitalPhonetics / adviser / adviser / services / hci / speech / SpeechInputFeatureExtractor.py View on Github
    def speech_to_fbank(self, speech_in):
        """
        Extracts 23 filterbanks from the input utterance.

        Args:
            speech_in (Tuple[np.array, int]): The utterance as a numpy array and its sampling rate

        Returns:
            np.array: The extracted features of the utterance
        """
        speech = torch.from_numpy(speech_in[0]).unsqueeze(0)
        fbank = torchaudio.compliance.kaldi.fbank(
            speech,
            sample_frequency=speech_in[1]
        )
        return {'fbank': fbank}
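As the docstring notes, calling fbank without num_mel_bins falls back to torchaudio's default of 23 mel bins. A quick check with synthetic audio (the random waveform below is only for illustration):

import torch
import torchaudio

# Quick check of the default: fbank yields 23 mel bins per frame.
dummy = torch.randn(1, 16000) * (1 << 15)  # one second of synthetic 16 kHz audio
feats = torchaudio.compliance.kaldi.fbank(dummy, sample_frequency=16000.0)
print(feats.shape)  # (num_frames, 23)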