Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
),
(
"net booking which grew 6% to $380 million.",
"net booking which grew six percent to three hundred and eighty million dollars",
),
(
"to $25 dollars or $0.21 per share price.",
"to twenty five dollars dollars or zero dollars and twenty one cents per share price",
),
("year-over-year", "year over year"),
("HTC VIVE", "h t c v i v e"),
]
for test in tests:
input_string = test[0]
result = clean_up(input_string)
assert result == test[1]
def count_words(self, clean_func=clean_up):
    """
    Count the words in this exemplar's transcript after cleaning it.

    :param clean_func: callable applied to the transcript text before counting
    :return int: number of whitespace-separated tokens in the cleaned text,
      or 0 when ``self.validate()`` is falsy
    """
    # an exemplar that fails validation contributes no words
    if not self.validate():
        return 0

    cleaned_text = clean_func(self.transcript_file.text())
    return len(cleaned_text.split())
def format_segment(seg):
    """
    :param seg: segment object
    :return str: text for a particular STM line (see segment __str__ method)

    Builds the line from a segment's filename, channel, speaker, start,
    stop and label attributes (each converted with ``str``), followed by
    the segment text, all separated by single spaces.
    """
    header_fields = ("filename", "channel", "speaker", "start", "stop", "label")
    pieces = [str(getattr(seg, field)) for field in header_fields]

    # the text is passed through clean_up to unformat stm file text
    pieces.append(clean_up(seg.text))

    return " ".join(pieces)
def clean_line(line):
    """
    Clean up a line of values, treating null entries as empty.

    Each value in ``line`` is converted with ``str`` unless ``pd.isnull``
    reports it as null, in which case it becomes an empty string; the
    results are joined with single spaces and passed through ``clean_up``.
    """
    fields = ["" if pd.isnull(value) else str(value) for value in line]
    return clean_up(" ".join(fields))
>>> standardize_transcript("this is um a test", remove_nsns=True)
'this is a test'
"""
# accept time_aligned_text objects but use their output text
input_transcript = (input_transcript.text() if isinstance(
input_transcript, time_aligned_text) else input_transcript)
# remove tagged noises and other non-speech events
input_transcript = re.sub(re_tagged_nonspeech, " ", input_transcript)
if remove_nsns:
input_transcript = remove_nonsilence_noises(input_transcript)
# clean punctuation, etc.
input_transcript = clean_up(input_transcript)
return input_transcript