How to use the nnmnkwii.io.hts module in nnmnkwii

To help you get started, we've selected a few examples showing how nnmnkwii.io.hts is used in popular public projects.

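Before the examples, here is a minimal sketch of the API they rely on. It assumes you have an HTS question-set file and an HTS-style full-context label file on disk; DATA_DIR, jp.hed and BASIC5000_0619.lab below are placeholders borrowed from the test data in the first example, so adapt them to your own files.

from os.path import join

from nnmnkwii.io import hts
from nnmnkwii.frontend import merlin as fe

DATA_DIR = "data"  # placeholder; point this at your own data directory

# Parse an HTS question set into binary and continuous feature dictionaries.
binary_dict, continuous_dict = hts.load_question_set(join(DATA_DIR, "jp.hed"))

# Load an HTS-style full-context label file into an HTSLabelFile object.
labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))

# Each entry is a (start_time, end_time, context) tuple; times are in
# 100 ns (HTK) units, so multiplying by 1e-7 gives seconds.
start_time, end_time, context = labels[0]
print(start_time * 1e-7, end_time * 1e-7, context)
print("number of frames:", labels.num_frames())

# Convert the labels into frame-level linguistic features, one row per frame.
linguistic_features = fe.linguistic_features(
    labels, binary_dict, continuous_dict, add_frame_features=True)
print(linguistic_features.shape)

The examples below show the same calls in context: loading label files, using the silence boundaries to build Kaldi segments or trim audio, and converting labels into linguistic features.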

github r9y9 / nnmnkwii / tests / test_io.py
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]

github r9y9 / nnmnkwii / tests / test_io.py
def test_succeeding_times():
    l = hts.HTSLabelFile()
    l.append((0, 1000000, "OK"))
    l.append((1000000, 2000000, "OK"))

github espnet / espnet / egs / jsut / tts2 / local / prep_segments.py
import argparse
import os
import sys

from nnmnkwii.io import hts


def get_parser():
    parser = argparse.ArgumentParser(
        description='Prepare segments from HTS-style alignment files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('wav_scp', type=str, help='wav scp file')
    return parser


if __name__ == "__main__":
    args = get_parser().parse_args(sys.argv[1:])

    with open(args.wav_scp) as f:
        for l in f:
            recording_id, path = l.split()
            lab_path = path.replace("wav/", "lab/").replace(".wav", ".lab")
            assert os.path.exists(lab_path)

            labels = hts.load(lab_path)
            assert "sil" in labels[0][-1]
            assert "sil" in labels[-1][-1]
            segment_begin = "{:.3f}".format(labels[0][1] * 1e-7)
            segment_end = "{:.3f}".format(labels[-1][0] * 1e-7)

            # recording_id = "{}_{}_{}".format(utt_id, segment_begin, segment_end)
            # As we assume that there's only a single utterance per recording,
            # utt_id is same as recording_id.
            # https://kaldi-asr.org/doc/data_prep.html
            utt_id = recording_id
            sys.stdout.write("{} {} {} {}\n".format(utt_id, recording_id, segment_begin, segment_end))

github espnet / espnet / egs / jsut / tts1 / local / prep_segments.py

The tts1 recipe's prep_segments.py is essentially identical to the tts2 example above.

github r9y9 / wavenet_vocoder / cmu_arctic.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
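    # NOTE: "and False" deliberately disables label-based trimming here,
    # so the librosa-based trimming in the else branch is always used.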
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences

github Sharad24 / Neural-Voice-Cloning-with-Few-Samples / deepvoice3_pytorch / vctk.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index

github Sharad24 / Neural-Voice-Cloning-with-Few-Samples / dv3 / jsut.py
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = dv3.audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
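        # Convert the label boundaries (100 ns units) into sample indices and
        # keep only the audio between the leading and trailing silences.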
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:

github r9y9 / deepvoice3_pytorch / vctk.py
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

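    # Optionally peak-normalize the waveform to hparams.rescaling_max.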
    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

github r9y9 / nnmnkwii / nnmnkwii / frontend / merlin.py
def load_labels_with_phone_alignment(hts_labels,
                                     binary_dict,
                                     continuous_dict,
                                     subphone_features=None,
                                     add_frame_features=False,
                                     frame_shift_in_micro_sec=50000):
    dict_size = len(binary_dict) + len(continuous_dict)
    frame_feature_size = get_frame_feature_size(subphone_features)
    dimension = frame_feature_size + dict_size

    assert isinstance(hts_labels, hts.HTSLabelFile)
    if add_frame_features:
        label_feature_matrix = np.empty((hts_labels.num_frames(), dimension))
    else:
        label_feature_matrix = np.empty((hts_labels.num_phones(), dimension))

    label_feature_index = 0

    if subphone_features == "coarse_coding":
        cc_features = compute_coarse_coding_features()

    for idx, (start_time, end_time, full_label) in enumerate(hts_labels):
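        # start_time/end_time and frame_shift_in_micro_sec share the same
        # HTK-style time unit, so this is the number of frames covered by
        # the current label.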
        frame_number = int(end_time / frame_shift_in_micro_sec) - int(start_time / frame_shift_in_micro_sec)

        label_binary_vector = pattern_matching_binary(
            binary_dict, full_label)

github Sharad24 / Neural-Voice-Cloning-with-Few-Samples / deepvoice3_pytorch / jsut.py
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk: