Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_stm_to_txt_conversion():
    """
    Execute the stm to txt test.

    Loads samples/BillGatesTEDTalk.stm, writes it out as
    tests/stm_to_txt_test.txt and passes when the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.txt.
    """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")

    # context managers close both handles deterministically instead of
    # leaving the cleanup to the garbage collector
    with open("samples/BillGatesTEDTalk.txt", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    with open("tests/stm_to_txt_test.txt", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()

    assert reference_sha == new_sha
def test_stm_to_vtt_conversion():
    """
    Execute the stm to vtt test.

    Loads samples/BillGatesTEDTalk.stm, writes it out as
    tests/stm_to_vtt_test.vtt and passes when the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.vtt.
    """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_vtt_test.vtt")

    # read through context managers so neither file handle is leaked
    with open("samples/BillGatesTEDTalk.vtt", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    with open("tests/stm_to_vtt_test.vtt", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()

    assert reference_sha == new_sha
def test_conversion_wer():
    """
    Check the WER between the sample STM transcript and a deliberately
    poor transcription of it against a known value.
    """
    reference = time_aligned_text("samples/BillGatesTEDTalk.stm")
    hypothesis = time_aligned_text(
        "samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")

    error_rate = wer(reference.text(), hypothesis.text(), True)

    # compare at fixed precision so the check is on the formatted output
    assert format(error_rate, "5.3f") == "3.332"
def test_json_to_stm_conversion():
    """
    Execute the json to stm tests.

    Two JSON samples are loaded and written out as STM files under tests/;
    each passes when the SHA-1 digest of the written text equals that of
    the corresponding reference STM file in samples/.
    """
    # first case: the Bill Gates TED talk transcription
    input_file = time_aligned_text("samples/BillGatesTEDTalk.json")
    with open("samples/BillGatesTEDTalk_transcribed.stm", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_1.stm")
    with open("tests/json_to_stm_test_1.stm", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()
    assert reference_sha == new_sha

    # second case: the simple test sample
    input_file = time_aligned_text("samples/simple_test.json")
    with open("samples/simple_test.stm", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    input_file.write("tests/json_to_stm_test_2.stm")
    # NOTE(review): the original statement was cut off after this open()
    # call; the remainder mirrors the first case above - confirm against
    # the upstream test file
    with open("tests/json_to_stm_test_2.stm", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()
    assert reference_sha == new_sha
def test_stm_to_srt_conversion():
    """
    Execute the stm to srt test.

    Loads samples/BillGatesTEDTalk.stm, writes it out as
    tests/stm_to_srt_test.srt and passes when the SHA-1 digest of the
    written text equals that of samples/BillGatesTEDTalk.srt.
    """
    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_srt_test.srt")

    # `with` guarantees the handles are closed once the text has been read
    with open("samples/BillGatesTEDTalk.srt", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    with open("tests/stm_to_srt_test.srt", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()

    assert reference_sha == new_sha
def test_txt_initialization():
    """
    Check that a time_aligned_text can be initialized from text output.

    A time_aligned_text is built from the parsed sample JSON, switched to
    the "txt" file extension, converted to a string, and that string is
    used to build a second object. The second object is written to
    tests/file_conversion_test.txt and the test passes when the SHA-1
    digest of the written text equals that of
    samples/BillGatesTEDTalk_transcribed.txt.
    """
    # close the JSON file as soon as it has been parsed
    with open("samples/BillGatesTEDTalk.json") as json_file:
        input_dict = json.load(json_file)

    text = time_aligned_text(input_dict)
    text.file_extension = "txt"
    text_object = time_aligned_text(str(text))

    with open("samples/BillGatesTEDTalk_transcribed.txt", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.txt")
    with open("tests/file_conversion_test.txt", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()

    assert reference_sha == new_sha
def test_json_initialization():
    """
    Check that a time_aligned_text can be initialized from parsed JSON.

    The sample JSON is parsed into a dict, used to build a
    time_aligned_text and written to tests/file_conversion_test.stm. The
    test passes when the SHA-1 digest of the written text equals that of
    samples/BillGatesTEDTalk_transcribed.stm.
    """
    # close the JSON file as soon as it has been parsed
    with open("samples/BillGatesTEDTalk.json") as json_file:
        input_dict = json.load(json_file)

    text_object = time_aligned_text(input_dict)

    with open("samples/BillGatesTEDTalk_transcribed.stm", "r",
              encoding="utf8") as reference:
        reference_sha = hashlib.sha1(reference.read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.stm")
    with open("tests/file_conversion_test.stm", "r",
              encoding="utf8") as converted:
        new_sha = hashlib.sha1(converted.read().encode()).hexdigest()

    assert reference_sha == new_sha
def split(self, target_dir):
    """
    Write every segment of this transcript to its own file in target_dir.

    The directory is created when it does not exist yet. Each output file
    keeps this object's file extension; its name is produced by
    generate_segmented_file_name from the target directory, this object's
    location and the segment index.
    """
    os.makedirs(target_dir, exist_ok=True)

    for index, segment in enumerate(self.segments):
        # a fresh single-segment transcript per piece
        piece = time_aligned_text()
        piece.file_extension = self.file_extension
        piece.location = generate_segmented_file_name(
            target_dir,
            self.location,
            index,
        )
        piece.segments = [segment]
        piece.write(piece.location)
def check_transcript(transcript):
    """
    Return a time_aligned_text built from `transcript`.

    When valid_input_file rejects the input, an error is logged and the
    process exits with status 1.
    """
    if not valid_input_file(transcript):
        LOGGER.error("Invalid transcript file {}".format(transcript))
        sys.exit(1)

    return time_aligned_text(input_data=transcript)
"""
Given an input transcript or time_aligned_text object,
remove non-speech events
[optionally] remove non-silence noises
>>> standardize_transcript("this is a test")
'this is a test'
>>> standardize_transcript("this is um a test")
'this is um a test'
>>> standardize_transcript("this is um a test", remove_nsns=True)
'this is a test'
"""
# accept time_aligned_text objects but use their output text
input_transcript = (input_transcript.text() if isinstance(
input_transcript, time_aligned_text) else input_transcript)
# remove tagged noises and other non-speech events
input_transcript = re.sub(re_tagged_nonspeech, " ", input_transcript)
if remove_nsns:
input_transcript = remove_nonsilence_noises(input_transcript)
# clean punctuation, etc.
input_transcript = clean_up(input_transcript)
return input_transcript