How to use the nnmnkwii.io.hts.load function in nnmnkwii

To help you get started, we’ve selected a few nnmnkwii examples based on popular ways nnmnkwii.io.hts.load is used in public projects.

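The excerpts below are shortened and rely on imports along these lines (a sketch, since exact aliases vary by project). hts.load returns an indexable label sequence; each entry is (start_time, end_time, context), with times given as integers in 100 ns HTS units.

import numpy as np
from os.path import join, exists

import librosa
from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe
from nnmnkwii import preprocessing as P

# Minimal usage sketch (the .lab path is a placeholder)
labels = hts.load("arctic_a0001.lab")
start_100ns, end_100ns, context = labels[0]   # times are in 100 ns units
print(start_100ns * 1e-7, end_100ns * 1e-7)   # converted to seconds
print(context)                                # full-context label string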

github r9y9 / nnmnkwii / tests / test_frontend.py
def test_linguistic_features_for_acoustic_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Linguistic features
    # To train acoustic model paired with linguistic features,
    # we need frame-level linguistic feature representation.
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=True,
                               subphone_features="full"
                               )
    y = np.fromfile(join(DATA_DIR, "binary_label_425",
                         "arctic_a0001.lab"), dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)
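
The same state-aligned labels can also feed a duration model. A minimal sketch, assuming the question set loaded above, with frame-level expansion turned off so one feature vector is produced per state rather than per frame:

# Phone/state-level linguistic features (no frame-level expansion)
x_dur = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
# Duration targets extracted from the same alignment
durations = fe.duration_features(labels)
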
github r9y9 / nnmnkwii / tests / test_io.py
def test_hts_labels_contains_multiple_whitespaces():
    lab_path = join(DATA_DIR, "p225_001.lab")
    labels = hts.load(lab_path)
    print(labels)
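
A small self-contained sketch of the same behavior (the label content below is made up and stands in for full-context labels): write a label file whose fields are separated by runs of whitespace, then load it back.

import tempfile
from nnmnkwii.io import hts

content = "0        1250000     sil\n1250000    3450000  a\n"
with tempfile.NamedTemporaryFile("w", suffix=".lab", delete=False) as f:
    f.write(content)
    path = f.name
labels = hts.load(path)
print(labels[0])   # expected: (0, 1250000, 'sil')
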
github r9y9 / wavenet_vocoder / jsut.py
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Trim silence from hts labels if available
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert "sil" in labels[0][-1]
        assert "sil" in labels[-1][-1]
        b = int(labels[0][1] * 1e-7 * sr)   # end of leading silence, in samples
        e = int(labels[-1][0] * 1e-7 * sr)  # start of trailing silence, in samples
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
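
The 1e-7 factor converts HTS label times, which are integers in units of 100 ns, into seconds before scaling by the sample rate. A small arithmetic sketch with an assumed 16 kHz rate:

t_100ns = 3_500_000                      # 0.35 s expressed in 100 ns units
sr = 16000                               # assumed sample rate
sample_index = int(t_100ns * 1e-7 * sr)  # -> 5600
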
github mertcokluk / GlotNET / cmu_arctic.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO: label-based trimming is currently disabled via the `and False` guard
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    out = wav
    constant_values = 0.0
    out_dtype = np.float32

    p_vt = 5
    p_gl = 5
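
start_at and end_at are project-local helpers that are not part of nnmnkwii and are not shown in the excerpt. A hedged sketch of what such helpers typically compute: the end of the leading silence and the start of the trailing silence, both in 100 ns units.

def start_at(labels):
    # Assumed behavior: if the utterance begins with silence, return where
    # that silence ends; otherwise return the very first start time.
    if "sil" in labels[0][-1]:
        return labels[0][1]
    return labels[0][0]

def end_at(labels):
    # Assumed behavior: if the utterance ends with silence, return where
    # that silence begins; otherwise return the very last end time.
    if "sil" in labels[-1][-1]:
        return labels[-1][0]
    return labels[-1][1]
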
github azraelkuan / tensorflow_wavenet_vocoder / datasets / cmu_arctic.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text, silence_threshold, fft_size):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO: label-based trimming is currently disabled via the `and False` guard
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
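
P here is nnmnkwii.preprocessing: mulaw_quantize maps a float waveform in [-1, 1] to quantized integer IDs, and inv_mulaw_quantize approximately inverts the mapping. A minimal round-trip sketch:

import numpy as np
from nnmnkwii import preprocessing as P

wav = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
quantized = P.mulaw_quantize(wav, 256)            # quantized integer IDs, one per sample
recovered = P.inv_mulaw_quantize(quantized, 256)  # approximate float reconstruction in [-1, 1]
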
github Sharad24 / Neural-Voice-Cloning-with-Few-Samples / dv3 / vctk.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = dv3.audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
github azraelkuan / tensorflow_wavenet_vocoder / datasets / feature.py
    # uv = (lf0 != 0).astype(np.float32)

    # continuous lf0 or not
    lf0 = interp1d(lf0, kind='slinear')

    # order 59
    # mgc dim 60*3
    # mgc = apply_delta_windows(mgc, hparams.windows)
    # # lf0 dim 1*3
    # lf0 = apply_delta_windows(lf0, hparams.windows)

    features = np.hstack((mgc, lf0))

    # cut silence frames by hts alignment
    if label_path is not None:
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
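
The same alignment can drop silence frames from frame-level linguistic inputs as well, so network inputs and acoustic targets stay aligned after trimming. A minimal sketch reusing the question set and merlin frontend from the first example (file names are placeholders):

labels = hts.load("arctic_a0001.lab")
linguistic = fe.linguistic_features(labels,
                                    binary_dict,
                                    continuous_dict,
                                    add_frame_features=True,
                                    subphone_features="full")
indices = labels.silence_frame_indices()
linguistic = np.delete(linguistic, indices, axis=0)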