""" execute tests for file conversion, wer scoring, corpus splitting, and text clean-up """
import hashlib
import json
from os.path import join as pjoin

# NOTE: the exact import paths for the project helpers are assumed here and
# may differ in the actual package layout.
from asrtoolkit import clean_up, corpus, split_corpus, time_aligned_text, wer
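# The conversion tests below each write a sample transcript out in a target
# format and compare the SHA-1 digest of the result against a reference file.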
def test_stm_to_txt_conversion():
    """ execute stm to txt test """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_txt_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

def test_stm_to_vtt_conversion():
    """ execute stm to vtt test """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_vtt_test.vtt")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.vtt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_vtt_test.vtt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

def test_conversion_wer():
    """ execute wer comparison test """
    reference_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    transcript_file = time_aligned_text(
        "samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")
    # test fixed precision output of wer calculation
    assert ("{:5.3f}".format(
        wer(reference_file.text(), transcript_file.text(), True)) == "3.332")

def test_json_to_stm_conversion():
    """ execute json to stm tests """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.json")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_1.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_1.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

    input_file = time_aligned_text("samples/simple_test.json")
    reference_sha = hashlib.sha1(
        open("samples/simple_test.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_2.stm")
    new_sha = hashlib.sha1(
        open("tests/json_to_stm_test_2.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

def test_stm_to_srt_conversion():
    """ execute stm to srt test """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_srt_test.srt")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.srt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_srt_test.srt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

def test_txt_initialization():
    """ execute txt initialization test """
    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text = time_aligned_text(input_dict)
    text.file_extension = "txt"
    text_object = time_aligned_text(str(text))
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.txt")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha

def test_json_initialization():
    """ execute json initialization test """
    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text_object = time_aligned_text(input_dict)
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.stm")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.stm", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
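
# The corpus-splitting test below calls a setup_test_corpus() helper that is
# not defined in this file. The following is a minimal sketch, assuming the
# helper fabricates n_exemplars dummy audio/STM pairs for the splitter to
# consume; the STM contents and the two-words-per-exemplar layout are
# assumptions, not the project's actual fixture.
def setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars):
    """ sketch: populate orig_dir with dummy exemplars and reset split dirs """
    import os
    import shutil
    for directory in (orig_dir, trn_dir, dev_dir):
        shutil.rmtree(directory, ignore_errors=True)
        os.makedirs(directory)
    for i in range(n_exemplars):
        base = pjoin(orig_dir, "exemplar_{}".format(i))
        # placeholder audio file paired with a two-segment, two-word STM
        open(base + ".wav", "w").close()
        with open(base + ".stm", "w", encoding="utf8") as stm:
            stm.write(
                "exemplar_{0} 1 speaker 0.00 1.00 <o,f0,unknown> hello\n"
                "exemplar_{0} 1 speaker 1.00 2.00 <o,f0,unknown> world\n"
                .format(i))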

def test_split_corpus():
    """ Test corpus splitter """
    n_exemplars = 10
    corpus_dir = "tests/split-corpus"
    orig_dir = pjoin(corpus_dir, "orig")
    split_dir = pjoin(corpus_dir, "splits")
    trn_dir = pjoin(split_dir, "train")
    dev_dir = pjoin(split_dir, "dev")
    setup_test_corpus(orig_dir, trn_dir, dev_dir, n_exemplars)
    orig_corpus = corpus({'location': orig_dir})
    split_corpus(
        orig_dir,
        split_dir=split_dir,
        split_name="dev",
        split_words=19,
        min_split_segs=1,
        leftover_data_split_name="train",
        rand_seed=1337,
    )
    # Make sure we didn't destroy input data
    final_corpus = corpus({'location': orig_dir})
    assert orig_corpus.validate() == 1
    assert final_corpus.validate() == 1
    orig_hashes = [_.hash() for _ in orig_corpus.exemplars]
    final_hashes = [_.hash() for _ in final_corpus.exemplars]
    assert all(h in final_hashes for h in orig_hashes)
    # Make sure correct number of words present in data split
    dev_corpus = corpus({'location': dev_dir})
    assert sum(e.count_words() for e in dev_corpus.exemplars) == 20
    assert dev_corpus.validate()
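
# clean_up() normalization tests: formatted text (numbers, currency,
# punctuation) should be rewritten the way a speech recognizer would emit it.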
def test_clean_up():
    """ execute clean_up tests """
    tests = [
        (
            "net booking which grew 6% to $380 million.",
            "net booking which grew six percent to three hundred and eighty million dollars",
        ),
        (
            "to $25 dollars or $0.21 per share price.",
            "to twenty five dollars dollars or zero dollars and twenty one cents per share price",
        ),
        ("year-over-year", "year over year"),
        ("HTC VIVE", "h t c v i v e"),
    ]
    for test in tests:
        input_string = test[0]
        result = clean_up(input_string)
        assert result == test[1]
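
# These tests follow pytest conventions; the relative samples/ and tests/
# paths above assume they are run from the repository root, e.g. with a
# plain `pytest` invocation.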