How to use the asrtoolkit.file_utils.name_cleaners.strip_extension function in asrtoolkit

To help you get started, we’ve selected a few asrtoolkit examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github finos / greenkey-asrtoolkit / asrtoolkit / data_structures / audio_file.py View on Github external
def prepare_for_training(self, file_name, sample_rate=16000):
        """
        Converts to single channel (from channel 1) audio file
        in SPH file format
        Returns audio_file object on success, else None
        """
        if file_name.split(".")[-1] != "sph":
            LOGGER.warning(
                "Forcing training data to use SPH file format for %s",
                file_name)
            file_name = strip_extension(file_name) + ".sph"

        file_name = sanitize_hyphens(file_name)

        # return None if error code given, otherwise return audio_file object
        output_file = (audio_file(file_name) if not subprocess.call(
            "sox -V1 {} {} rate {} remix -".format(self.location, file_name,
                                                   sample_rate),
            shell=True,
        ) else None)

        return output_file
github finos / greenkey-asrtoolkit / asrtoolkit / align_json.py View on Github external
ref_txt, str - reference text file containing ground truth
      json_file, str - hypothesis gk JSON file
      filename, str - output STM filename
    """

    ref_tokens = preprocess_txt.parse_transcript(ref_txt)
    gk_json = preprocess_gk_json.preprocess_transcript(json_file)
    segments = align(gk_json, ref_tokens)

    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # fix segment filename and speaker
    for seg in segments:
        seg.filename = strip_extension(filename)
        seg.speaker = strip_extension(filename) + "UnknownSpeaker"

    output = time_aligned_text()
    output.segments = segments
    output.write(filename)
github finos / greenkey-asrtoolkit / asrtoolkit / combine_audio_files.py View on Github external
def combine_transcripts(transcripts, output_file_name):
    # Get one list of segments
    out_transcript = reduce(operator.add, transcripts)
    out_transcript.location = os.path.join(
        strip_extension(output_file_name) + "." +
        out_transcript.file_extension)
    out_transcript.write(out_transcript.location)
github finos / greenkey-asrtoolkit / asrtoolkit / align_json.py View on Github external
Input
      ref_txt, str - reference text file containing ground truth
      json_file, str - hypothesis gk JSON file
      filename, str - output STM filename
    """

    ref_tokens = preprocess_txt.parse_transcript(ref_txt)
    gk_json = preprocess_gk_json.preprocess_transcript(json_file)
    segments = align(gk_json, ref_tokens)

    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # fix segment filename and speaker
    for seg in segments:
        seg.filename = strip_extension(filename)
        seg.speaker = strip_extension(filename) + "UnknownSpeaker"

    output = time_aligned_text()
    output.segments = segments
    output.write(filename)
github finos / greenkey-asrtoolkit / asrtoolkit / data_structures / corpus.py View on Github external
setattr(self, key, kwargs[key])

        # only if not defined above should we search for exemplars
        # based on location
        if not self.exemplars:
            # instantiate exemplars for this object to override
            # static class variable
            self.exemplars = []

            audio_extensions_to_try = ["sph", "wav", "mp3"][::-1]
            self.exemplars += [
                exemplar({
                    "audio_file":
                    audio_file(fl),
                    "transcript_file":
                    time_aligned_text(strip_extension(fl) + ".stm"),
                }) for audio_extension in audio_extensions_to_try
                for fl in (get_files(self.location, audio_extension) if self.
                           location else [])
                if (os.path.exists(strip_extension(fl) + ".stm"))
            ]

            # gather all exemplars from /stm and /sph subdirectories if present
            self.exemplars += [
                exemplar({
                    "audio_file":
                    audio_file(fl),
                    "transcript_file":
                    time_aligned_text(self.location + "/stm/" +
                                      basename(strip_extension(fl)) + ".stm"),
                }) for audio_extension in audio_extensions_to_try for fl in
                (get_files(self.location +