How to use the asrtoolkit.data_structures.time_aligned_text.time_aligned_text class in asrtoolkit

To help you get started, we’ve selected a few asrtoolkit examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github finos / greenkey-asrtoolkit / tests / test_conversion.py View on Github external
def test_stm_to_txt_conversion():
    """
    Execute the stm to txt test.

    Loads samples/BillGatesTEDTalk.stm, writes it to
    tests/stm_to_txt_test.txt, and asserts that the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.txt.
    """

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")

    # Context managers close the file handles deterministically instead of
    # leaving them to the garbage collector.
    with open("samples/BillGatesTEDTalk.txt", "r",
              encoding="utf8") as reference_handle:
        reference_sha = hashlib.sha1(
            reference_handle.read().encode()).hexdigest()

    with open("tests/stm_to_txt_test.txt", "r",
              encoding="utf8") as new_handle:
        new_sha = hashlib.sha1(new_handle.read().encode()).hexdigest()

    assert reference_sha == new_sha
github finos / greenkey-asrtoolkit / tests / test_conversion.py View on Github external
def test_stm_to_vtt_conversion():
    """
    Execute the stm to vtt test.

    Loads samples/BillGatesTEDTalk.stm, writes it to
    tests/stm_to_vtt_test.vtt, and asserts that the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.vtt.
    """

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_vtt_test.vtt")

    # Context managers close the file handles deterministically instead of
    # leaving them to the garbage collector.
    with open("samples/BillGatesTEDTalk.vtt", "r",
              encoding="utf8") as reference_handle:
        reference_sha = hashlib.sha1(
            reference_handle.read().encode()).hexdigest()

    with open("tests/stm_to_vtt_test.vtt", "r",
              encoding="utf8") as new_handle:
        new_sha = hashlib.sha1(new_handle.read().encode()).hexdigest()

    assert reference_sha == new_sha
github finos / greenkey-asrtoolkit / tests / test_wer.py View on Github external
def test_conversion_wer():
    """
    Execute a single WER test.

    Scores the intentionally poor sample transcription against the sample
    STM reference and checks the result at fixed precision.
    """

    reference = time_aligned_text("samples/BillGatesTEDTalk.stm")
    hypothesis = time_aligned_text(
        "samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")

    # the third positional argument is passed through as True
    error_rate = wer(reference.text(), hypothesis.text(), True)

    # compare the fixed-precision rendering rather than the raw value
    assert format(error_rate, "5.3f") == "3.332"
github finos / greenkey-asrtoolkit / tests / test_conversion.py View on Github external
def test_json_to_stm_conversion():
    " execute json to stm tests "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.json")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_1.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_1.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

    input_file = time_aligned_text("samples/simple_test.json")
    reference_sha = hashlib.sha1(
        open("samples/simple_test.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_2.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_2.stm", "r",
github finos / greenkey-asrtoolkit / tests / test_conversion.py View on Github external
def test_stm_to_srt_conversion():
    """
    Execute the stm to srt test.

    Loads samples/BillGatesTEDTalk.stm, writes it to
    tests/stm_to_srt_test.srt, and asserts that the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.srt.
    """

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_srt_test.srt")

    # Context managers close the file handles deterministically instead of
    # leaving them to the garbage collector.
    with open("samples/BillGatesTEDTalk.srt", "r",
              encoding="utf8") as reference_handle:
        reference_sha = hashlib.sha1(
            reference_handle.read().encode()).hexdigest()

    with open("tests/stm_to_srt_test.srt", "r",
              encoding="utf8") as new_handle:
        new_sha = hashlib.sha1(new_handle.read().encode()).hexdigest()

    assert reference_sha == new_sha
github finos / greenkey-asrtoolkit / tests / test_initialization.py View on Github external
def test_txt_initialization():
    """
    Execute the txt initialization test.

    Builds a time_aligned_text from the parsed sample JSON, sets its file
    extension to "txt", re-initializes a second object from the first one's
    string form, writes that to tests/file_conversion_test.txt, and asserts
    that its SHA-1 digest equals that of
    samples/BillGatesTEDTalk_transcribed.txt.
    """

    # Read the JSON with an explicit encoding (matching the other reads in
    # this test) and close the handle as soon as parsing is done.
    with open("samples/BillGatesTEDTalk.json", "r",
              encoding="utf8") as json_handle:
        input_dict = json.load(json_handle)

    text = time_aligned_text(input_dict)
    text.file_extension = "txt"

    text_object = time_aligned_text(str(text))

    with open("samples/BillGatesTEDTalk_transcribed.txt", "r",
              encoding="utf8") as reference_handle:
        reference_sha = hashlib.sha1(
            reference_handle.read().encode()).hexdigest()

    text_object.write("tests/file_conversion_test.txt")

    with open("tests/file_conversion_test.txt", "r",
              encoding="utf8") as new_handle:
        new_sha = hashlib.sha1(new_handle.read().encode()).hexdigest()

    assert reference_sha == new_sha
github finos / greenkey-asrtoolkit / tests / test_initialization.py View on Github external
def test_json_initialization():
    """
    Execute the json initialization test.

    Builds a time_aligned_text from the parsed sample JSON, writes it to
    tests/file_conversion_test.stm, and asserts that its SHA-1 digest equals
    that of samples/BillGatesTEDTalk_transcribed.stm.
    """

    # Read the JSON with an explicit encoding (matching the other reads in
    # this test) and close the handle as soon as parsing is done.
    with open("samples/BillGatesTEDTalk.json", "r",
              encoding="utf8") as json_handle:
        input_dict = json.load(json_handle)

    text_object = time_aligned_text(input_dict)

    with open("samples/BillGatesTEDTalk_transcribed.stm", "r",
              encoding="utf8") as reference_handle:
        reference_sha = hashlib.sha1(
            reference_handle.read().encode()).hexdigest()

    text_object.write("tests/file_conversion_test.stm")

    with open("tests/file_conversion_test.stm", "r",
              encoding="utf8") as new_handle:
        new_sha = hashlib.sha1(new_handle.read().encode()).hexdigest()

    assert reference_sha == new_sha
github finos / greenkey-asrtoolkit / asrtoolkit / data_structures / time_aligned_text.py View on Github external
def split(self, target_dir):
    """
    Write each segment of this transcript to its own file in target_dir.

    The directory is created if it does not exist. Every output object
    carries this object's file extension and exactly one segment; its
    location comes from generate_segmented_file_name, called with the
    target directory, this object's location, and the segment index.
    """
    os.makedirs(target_dir, exist_ok=True)

    for index, segment in enumerate(self.segments):
        piece = time_aligned_text()
        piece.file_extension = self.file_extension
        piece.location = generate_segmented_file_name(
            target_dir,
            self.location,
            index,
        )
        # each output holds a single segment
        piece.segments = [segment]
        piece.write(piece.location)
github finos / greenkey-asrtoolkit / asrtoolkit / combine_audio_files.py View on Github external
def check_transcript(transcript):
    """
    Return a time_aligned_text built from `transcript` if it is valid input.

    When valid_input_file rejects the transcript, an error is logged and the
    process exits with status 1.
    """
    if not valid_input_file(transcript):
        message = "Invalid transcript file {}".format(transcript)
        LOGGER.error(message)
        sys.exit(1)

    return time_aligned_text(input_data=transcript)
github finos / greenkey-asrtoolkit / asrtoolkit / wer.py View on Github external
"""
    Given an input transcript or time_aligned_text object,
    remove non-speech events
    [optionally] remove non-silence noises

    >>> standardize_transcript("this is a test")
    'this is a test'
    >>> standardize_transcript("this is um a test")
    'this is um a test'
    >>> standardize_transcript("this is um a test", remove_nsns=True)
    'this is a test'
    """

    # accept time_aligned_text objects but use their output text
    input_transcript = (input_transcript.text() if isinstance(
        input_transcript, time_aligned_text) else input_transcript)

    # remove tagged noises and other non-speech events
    input_transcript = re.sub(re_tagged_nonspeech, " ", input_transcript)

    if remove_nsns:
        input_transcript = remove_nonsilence_noises(input_transcript)

    # clean punctuation, etc.
    input_transcript = clean_up(input_transcript)

    return input_transcript