How to use the torchaudio.compliance.kaldi module in torchaudio

To help you get started, we’ve selected a few examples of torchaudio.compliance.kaldi, based on popular ways it is used in public projects.
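Before turning to the project examples, here is a minimal sketch of the most common entry point: computing Kaldi-compatible filterbank features from a loaded file. The file path is a placeholder, not part of any example below.

import torchaudio
import torchaudio.compliance.kaldi as kaldi

# "audio.wav" is a placeholder path; torchaudio.load returns a (channel, time)
# tensor and the sampling rate.
waveform, sample_rate = torchaudio.load("audio.wav")

# Kaldi-compatible log-mel filterbank features, shaped (num_frames, num_mel_bins).
fbank = kaldi.fbank(waveform, sample_frequency=sample_rate, num_mel_bins=23)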


From pytorch/audio: test/test_compliance_kaldi.py
# From the torchaudio test suite. Assumes torch, torchaudio.compliance.kaldi
# (imported as kaldi), and the test-local helper extract_window are in scope.
def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
    waveform = torch.arange(num_samples).float()
    output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

    # Expected frame count, mirroring NumFrames in Kaldi's feature-window.cc
    n = window_size
    if snip_edges:
        # Count only windows that fit entirely within the signal.
        m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
    else:
        # Frames are centered at multiples of window_shift; edges are padded.
        m = (num_samples + (window_shift // 2)) // window_shift

    self.assertTrue(output.dim() == 2)
    self.assertTrue(output.shape[0] == m and output.shape[1] == n)

    # Rebuild each window independently and check it matches the strided output.
    window = torch.empty((m, window_size))
    for r in range(m):
        extract_window(window, waveform, r, window_size, window_shift, snip_edges)
    self.assertTrue(torch.allclose(window, output))
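To make the frame-count arithmetic concrete, here is a small sketch evaluating the same helper directly. Note that _get_strided is a private function, so this may change across torchaudio versions; the numbers are purely illustrative.

import torch
import torchaudio.compliance.kaldi as kaldi

waveform = torch.arange(10).float()

# Arguments: waveform, window_size=4, window_shift=2, snip_edges=True.
# snip_edges=True keeps only fully populated windows:
# m = 1 + (10 - 4) // 2 = 4 frames, each of length 4.
strided = kaldi._get_strided(waveform, 4, 2, True)
print(strided.shape)  # torch.Size([4, 4])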
From pytorch/audio: test/compliance/utils.py
def generate_rand_window_type():
    # Pick a random window type from the ones torchaudio's Kaldi layer supports
    # (requires the random and torchaudio modules to be imported).
    return random.choice(torchaudio.compliance.kaldi.WINDOWS)
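The selected window name is typically passed to the feature functions through the window_type parameter. A short sketch, with the waveform and sample rate made up for illustration:

import torch
import torchaudio.compliance.kaldi as kaldi

waveform = torch.randn(1, 16000)  # placeholder: one second of 16 kHz audio

# kaldi.WINDOWS lists the supported names (kaldi.HAMMING, kaldi.HANNING, ...);
# any of them can be passed as window_type.
fbank = kaldi.fbank(waveform, sample_frequency=16000.0, window_type=kaldi.HAMMING)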
From ZhengkunTian/OpenTransformer: otrans/data.py
import os
import torch
import random
import kaldiio as kio
import numpy as np
import torchaudio as ta
from torch.utils.data import Dataset, DataLoader
from prefetch_generator import BackgroundGenerator

PAD = 0
EOS = 1
BOS = 1
UNK = 2
MASK = 2
unk = '<unk>'
compute_fbank = ta.compliance.kaldi.fbank


def load_vocab(vocab_file):
    # unit2idx = {'<s>': 0, '</s>': 1, '<unk>': 2}
    unit2idx = {}
    with open(vocab_file, 'r', encoding='utf-8') as v:
        for line in v:
            unit, idx = line.strip().split()
            unit2idx[unit] = int(idx)
    return unit2idx


def normalization(feature):
    # Per-dimension mean/variance normalization over the time (frame) axis.
    std, mean = torch.std_mean(feature, dim=0)
    return (feature - mean) / std
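A short usage sketch tying these helpers together. The vocab file contents, the waveform, and the num_mel_bins value are illustrative assumptions, not values taken from the OpenTransformer source:

import torch

# Hypothetical vocab.txt, one "unit index" pair per line:
#   <s> 0
#   </s> 1
#   <unk> 2
unit2idx = load_vocab('vocab.txt')

# Placeholder one-second 16 kHz waveform with shape (channel, time).
waveform = torch.randn(1, 16000)

# Kaldi-compatible filterbanks, then per-dimension normalization.
feature = compute_fbank(waveform, num_mel_bins=80, sample_frequency=16000.0)
feature = normalization(feature)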
From DigitalPhonetics/adviser: adviser/services/hci/speech/SpeechInputFeatureExtractor.py
def speech_to_mfcc(self, speech_in):
    """
    Extracts 13 Mel Frequency Cepstral Coefficients (MFCCs) from the input utterance.

    Args:
        speech_in (tuple(np.array, int)): The utterance, represented as an array, and its sampling rate

    Returns:
        dict: The extracted MFCC features of the utterance under the key 'mfcc'
    """
    speech = torch.from_numpy(speech_in[0]).unsqueeze(0)
    mfcc = torchaudio.compliance.kaldi.mfcc(
        speech,
        sample_frequency=speech_in[1]
    )
    return {'mfcc': mfcc}
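A hedged usage sketch of the same call outside the service class. The random utterance is a stand-in for real audio; kaldi.mfcc returns 13 coefficients per frame by default (num_ceps=13):

import numpy as np
import torch
import torchaudio

# Stand-in for a real recording: (samples, sampling rate), as the method expects.
speech_in = (np.random.randn(16000).astype(np.float32), 16000)

speech = torch.from_numpy(speech_in[0]).unsqueeze(0)
mfcc = torchaudio.compliance.kaldi.mfcc(speech, sample_frequency=speech_in[1])
print(mfcc.shape)  # (num_frames, 13)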