How to use asrtoolkit - 10 common examples

To help you get started, we've selected ten asrtoolkit examples based on popular ways it is used in public projects; all of the snippets below are drawn from the finos/greenkey-asrtoolkit test suite.
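
Most of the snippets revolve around the time_aligned_text class, which reads a transcript file and writes it back out in whatever format the target file extension implies. The sketch below shows the basic pattern; the import path is inferred from how the test files appear to use the class and may differ between asrtoolkit versions, and the output file name is a placeholder.

# A minimal conversion sketch; the import path and output file name are assumptions.
from asrtoolkit.data_structures.time_aligned_text import time_aligned_text

transcript = time_aligned_text("samples/BillGatesTEDTalk.stm")  # parse the STM transcript
transcript.write("BillGatesTEDTalk.txt")  # the output format follows the file extension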

From finos/greenkey-asrtoolkit: tests/test_conversion.py
def test_stm_to_txt_conversion():
    " execute stm to txt test "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_txt_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
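
Each conversion test compares SHA-1 digests of the generated file against a reference file. That repeated hashing boilerplate can be captured in a small helper; this is a plain standard-library sketch, not part of asrtoolkit.

import hashlib

def sha1_of(path):
    """Return the SHA-1 hex digest of a text file, mirroring the comparison above."""
    with open(path, "r", encoding="utf8") as handle:
        return hashlib.sha1(handle.read().encode()).hexdigest()

# e.g.: assert sha1_of("tests/stm_to_txt_test.txt") == sha1_of("samples/BillGatesTEDTalk.txt")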

From finos/greenkey-asrtoolkit: tests/test_conversion.py
def test_stm_to_vtt_conversion():
    " execute stm to vtt test "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_vtt_test.vtt")

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.vtt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_vtt_test.vtt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

From finos/greenkey-asrtoolkit: tests/test_wer.py
def test_conversion_wer():
    " execute single test "

    reference_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    transcript_file = time_aligned_text(
        "samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")

    # test fixed precision output of wer calculation
    assert ("{:5.3f}".format(
        wer(reference_file.text(), transcript_file.text(), True)) == "3.332")
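
The test passes reference and hypothesis strings (produced by the .text() method of time_aligned_text) plus a boolean flag, and checks the score at fixed precision. Below is a minimal sketch of calling wer() directly on two strings; the import path is an assumption based on the package layout, and the boolean flag used in the test is simply left at its default here.

# Direct WER computation between two strings; the import path is an assumption.
from asrtoolkit.wer import wer

reference = "this is a test of the word error rate"
hypothesis = "this is a test of word error rates"
print(wer(reference, hypothesis))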

From finos/greenkey-asrtoolkit: tests/test_conversion.py
def test_json_to_stm_conversion():
    " execute json to stm tests "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.json")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_1.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_1.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

    input_file = time_aligned_text("samples/simple_test.json")
    reference_sha = hashlib.sha1(
        open("samples/simple_test.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_2.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_2.stm", "r",

From finos/greenkey-asrtoolkit: tests/test_conversion.py
def test_stm_to_srt_conversion():
    " execute stm to srt test "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_srt_test.srt")

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.srt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_srt_test.srt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
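
Taken together, the stm-to-txt, stm-to-vtt and stm-to-srt tests show that the output format is chosen purely from the extension passed to write(). A hedged sketch of converting one STM file into all three formats, with the same import-path caveat as above and placeholder output names:

# Convert one STM transcript into text, SubRip and WebVTT outputs.
# The import path and output file names are assumptions for illustration.
from asrtoolkit.data_structures.time_aligned_text import time_aligned_text

source = time_aligned_text("samples/BillGatesTEDTalk.stm")
for extension in ("txt", "srt", "vtt"):
    source.write("BillGatesTEDTalk." + extension)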

From finos/greenkey-asrtoolkit: tests/test_initialization.py
def test_txt_initialization():
    " execute single test "

    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text = time_aligned_text(input_dict)
    text.file_extension = "txt"

    text_object = time_aligned_text(text.__str__())

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.txt")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
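
The initialization tests show three ways to build a time_aligned_text object: from a file path, from an already-decoded JSON dict, and from the string form of another object. A short sketch of all three, again assuming the import path used in the sketches above:

import json

# The import path is an assumption, as in the sketches above.
from asrtoolkit.data_structures.time_aligned_text import time_aligned_text

from_path = time_aligned_text("samples/BillGatesTEDTalk.stm")  # from a transcript file
from_dict = time_aligned_text(json.load(open("samples/BillGatesTEDTalk.json")))  # from a decoded JSON payload
from_text = time_aligned_text(str(from_path))  # from another object's string form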

From finos/greenkey-asrtoolkit: tests/test_initialization.py
def test_json_initialization():
    " execute single test "

    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text_object = time_aligned_text(input_dict)

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.stm")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

From finos/greenkey-asrtoolkit: tests/test_split_corpus.py
    dev_dir = pjoin(split_dir, "dev")

    setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars)
    orig_corpus = corpus({'location': orig_dir})
    split_corpus(
        orig_dir,
        split_dir=split_dir,
        split_name="dev",
        split_words=19,
        min_split_segs=1,
        leftover_data_split_name="train",
        rand_seed=1337,
    )

    # Make sure we didn't destroy input data
    final_corpus = corpus({'location': orig_dir})
    assert orig_corpus.validate() == 1
    assert final_corpus.validate() == 1
    orig_hashes = [_.hash() for _ in orig_corpus.exemplars]
    final_hashes = [_.hash() for _ in final_corpus.exemplars]
    assert all(h in final_hashes for h in orig_hashes)

    # Make sure correct number of words present in data split
    dev_corpus = corpus({'location': dev_dir})
    assert sum(e.count_words() for e in dev_corpus.exemplars) == 20
    assert dev_corpus.validate()
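
The corpus-splitting tests call split_corpus() on a directory of exemplars, carving off a "dev" split of roughly split_words words and leaving the remainder in "train". Below is a hedged sketch of the same call outside a test; the import paths are assumptions based on how the test appears to use these names, and the directory names are placeholders.

# Split a corpus directory into dev/train portions; directory names are placeholders
# and the import paths are assumptions.
from asrtoolkit.data_structures.corpus import corpus
from asrtoolkit.split_corpus import split_corpus

original = corpus({'location': "my-corpus/orig"})
assert original.validate()  # the test above expects validate() to return 1 for a well-formed corpus

split_corpus(
    "my-corpus/orig",
    split_dir="my-corpus/splits",
    split_name="dev",
    split_words=1000,
    min_split_segs=1,
    leftover_data_split_name="train",
    rand_seed=1337,
)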

From finos/greenkey-asrtoolkit: tests/test_split_corpus.py
def test_split_corpus():
    """ Test corpus splitter """
    n_exemplars = 10
    corpus_dir = "tests/split-corpus"

    orig_dir = pjoin(corpus_dir, "orig")
    split_dir = pjoin(corpus_dir, "splits")
    trn_dir = pjoin(split_dir, "train")
    dev_dir = pjoin(split_dir, "dev")

    setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars)
    orig_corpus = corpus({'location': orig_dir})
    split_corpus(
        orig_dir,
        split_dir=split_dir,
        split_name="dev",
        split_words=19,
        min_split_segs=1,
        leftover_data_split_name="train",
        rand_seed=1337,
    )

    # Make sure we didn't destroy input data
    final_corpus = corpus({'location': orig_dir})
    assert orig_corpus.validate() == 1
    assert final_corpus.validate() == 1
    orig_hashes = [_.hash() for _ in orig_corpus.exemplars]
    final_hashes = [_.hash() for _ in final_corpus.exemplars]
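
The corpus object used in these tests exposes the exemplar-level helpers seen above (hash() and count_words()), which makes it easy to sanity-check a split. A small sketch, with the usual caveat that the import path and directory name are assumptions:

# Inspect a split using the corpus helpers exercised above.
from asrtoolkit.data_structures.corpus import corpus

dev_corpus = corpus({'location': "my-corpus/splits/dev"})
print("valid:", dev_corpus.validate())
print("exemplars:", len(dev_corpus.exemplars))
print("total words:", sum(e.count_words() for e in dev_corpus.exemplars))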

From finos/greenkey-asrtoolkit: tests/test_clean_up.py
def test_clean_up():  # function name assumed; this excerpt begins mid-list
    tests = [
        # earlier (input, expected) pairs are truncated in this excerpt
        (
            "net booking which grew 6% to $380 million.",
            "net booking which grew six percent to three hundred and eighty million dollars",
        ),
        (
            "to $25 dollars or $0.21 per share price.",
            "to twenty five dollars dollars or zero dollars and twenty one cents per share price",
        ),
        ("year-over-year", "year over year"),
        ("HTC VIVE", "h t c v i v e"),
    ]

    for test in tests:
        input_string = test[0]
        result = clean_up(input_string)
        assert result == test[1]
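
clean_up() normalizes formatted text into the plain, spoken-style form shown in the expected outputs above: digits and symbols are spelled out, punctuation is dropped, and everything is lower-cased. A minimal sketch of calling it directly; the import path is an assumption.

# Normalize formatted text into its spoken form; the import path is an assumption.
from asrtoolkit.clean_formatting import clean_up

print(clean_up("net booking which grew 6% to $380 million."))
# per the expected output above:
# net booking which grew six percent to three hundred and eighty million dollars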