How to use sacremoses - common examples

To help you get started, we’ve selected a few sacremoses examples based on popular ways it is used in public projects.
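
Before the project snippets, here is a minimal round trip with the two classes that appear most often below, MosesTokenizer and MosesDetokenizer:

from sacremoses import MosesDetokenizer, MosesTokenizer

mt = MosesTokenizer(lang="en")
md = MosesDetokenizer(lang="en")

tokens = mt.tokenize("Hello World, this is sacremoses!")
# ['Hello', 'World', ',', 'this', 'is', 'sacremoses', '!']
print(md.detokenize(tokens))
# Hello World, this is sacremoses!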

Findus23 / se-simulator / markov.py (view on GitHub)
import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

# Both default to English ('en') when no language code is passed.
tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


# Word-level Markov chain: Moses handles splitting and rejoining.
class MarkovText(markovify.Text):
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


# Character-level Markov chain for generating user names.
class MarkovUserName(markovify.Text):
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        return "".join(characters)
alvations / sacremoses / sacremoses / cli.py (view on GitHub)
# Excerpt from the sacremoses CLI; the click decorators that define the
# command-line options are not shown here.
from functools import partial

import click
from tqdm import tqdm

from sacremoses import MosesDetokenizer
from sacremoses.util import parallelize_preprocess


def detokenize_file(language, processes, xml_unescape, encoding, quiet):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=(not quiet)
                ):
                    print(outline, end="\n", file=fout)
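
The per-line worker wrapped by partial above can also be exercised directly; a minimal sketch of what each call does:

from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
# One pre-tokenized input line, split into tokens just as the CLI does with str.split.
print(md.detokenize("Hello , world !".split(), return_str=True, unescape=True))
# Hello, world!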
alvations / sacremoses / sacremoses / truecase.py (view on GitHub)
# Method of MosesTruecaser (excerpt); it relies on collections.defaultdict,
# collections.Counter and the grouper helper from sacremoses.util.
def _load_model(self, filename):
    """
    Loads a pre-trained truecasing model file.

    :returns: A dictionary with the 'best' and 'known' objects from `_casing_to_model()`.
    :rtype: {'best': dict, 'known': Counter}
    """
    casing = defaultdict(Counter)
    with open(filename, encoding=self.encoding) as fin:
        for line in fin:
            # Each line holds token/count pairs such as "The (50/60) the (10)".
            line = line.strip().split()
            for token, count in grouper(line, 2):
                count = count.split("/")[0].strip("()")
                casing[token.lower()][token] = int(count)
    # Return the best and known objects from `_casing_to_model()`.
    return self._casing_to_model(casing)
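
_load_model runs when a saved model path is passed to MosesTruecaser. A minimal sketch of training, saving, and reloading a truecaser (file names are placeholders):

from sacremoses import MosesTokenizer, MosesTruecaser

# 'big.txt' is a placeholder corpus; train() expects pre-tokenized documents.
mtok = MosesTokenizer(lang="en")
mtr = MosesTruecaser()
tokenized_docs = [mtok.tokenize(line) for line in open("big.txt")]
mtr.train(tokenized_docs, save_to="big.truecasemodel")

# Reloading the saved model goes through _load_model() shown above.
mtr = MosesTruecaser("big.truecasemodel")
print(mtr.truecase("THE ADVENTURES OF SHERLOCK HOLMES", return_str=True))
# the adventures of Sherlock Holmes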
alvations / sacremoses / sacremoses / tokenize.py (view on GitHub)
        # Excerpt from the detokenizer's main loop (MosesDetokenizer).
        # Keep track of the number of quotation marks.
        quote_counts = {u"'": 0, u'"': 0, u"``": 0, u"`": 0, u"''": 0}

        # The *prepend_space* variable is used to control the "effects" of
        # detokenization as the function loops through the list of tokens and
        # changes the *prepend_space* accordingly as it sequentially checks
        # through the language specific and language independent conditions.
        prepend_space = " "
        detokenized_text = ""
        tokens = text.split()
        # Iterate through every token and apply language specific detokenization rule(s).
        for i, token in enumerate(iter(tokens)):
            # Check if the first char is CJK.
            if is_cjk(token[0]) and self.lang != "ko":
                # Perform left shift if this is a second consecutive CJK word.
                if i > 0 and is_cjk(tokens[i - 1][-1]):
                    detokenized_text += token
                # But do nothing special if this is a CJK word that doesn't follow a CJK word
                else:
                    detokenized_text += prepend_space + token
                prepend_space = " "
            # If it's a currency symbol.
            elif re.search(u"^[" + self.IsSc + u"\(\[\{\¿\¡]+$", token):
                # Perform right shift on currency and other random punctuation items
                detokenized_text += prepend_space + token
                prepend_space = ""

            elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
                # In French, these punctuations are prefixed with a non-breakable space.
                if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
                    detokenized_text += " "
                # Perform left shift on punctuation items.
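
The effect of these shift rules is easiest to see on a small input; a minimal sketch:

from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
# Currency symbols shift right (attach to the next token); commas and
# periods shift left (attach to the previous token).
print(md.detokenize("It costs $ 5 , not 10 .".split()))
# It costs $5, not 10.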
freewym / espresso / fairseq / data / encoders / moses_tokenizer.py (view on GitHub)
# __init__ of fairseq's Moses tokenizer wrapper (excerpt from the encoder class).
def __init__(self, args):
    self.args = args

    # Fall back to the generic source/target languages, then to English.
    if getattr(args, 'moses_source_lang', None) is None:
        args.moses_source_lang = getattr(args, 'source_lang', 'en')
    if getattr(args, 'moses_target_lang', None) is None:
        args.moses_target_lang = getattr(args, 'target_lang', 'en')

    try:
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(args.moses_source_lang)
        self.detok = MosesDetokenizer(args.moses_target_lang)
    except ImportError:
        raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
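
A hypothetical way to construct this wrapper outside of fairseq's CLI, using a bare Namespace in place of the parsed arguments (attribute names follow the getattr calls above):

from argparse import Namespace

# Hypothetical stand-in for fairseq's parsed args; the unset moses_* languages
# fall back to source_lang/target_lang, then to 'en'.
args = Namespace(source_lang="de", target_lang="en")
tokenizer = MosesTokenizer(args)  # the wrapper class whose __init__ is shown above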
krasserm / fairseq-image-captioning / preprocess / tokenize_captions.py (view on GitHub)
from sacremoses import MosesTokenizer


def tokenize_captions(captions, lang='en'):
    """Tokenizes a list of captions with the Moses tokenizer."""
    tokenizer = MosesTokenizer(lang=lang)
    return [tokenizer.tokenize(caption, return_str=True) for caption in captions]
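
An illustrative call (the caption is made up); with return_str=True each caption comes back as a single space-joined string:

captions = ["A man riding a wave on a surfboard."]
print(tokenize_captions(captions))
# ['A man riding a wave on a surfboard .']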
BLKSerene / Wordless / src / wordless_text / wordless_text_processing.py (view on GitHub)
            # (Excerpt begins mid-dispatch, inside the NLTK NIST tokenizer branch.)
            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape = False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
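
The two Sacremoses branches above differ only in the rule set applied; a minimal comparison (contractions are where the two schemes visibly diverge):

import sacremoses

moses_tokenizer = sacremoses.MosesTokenizer(lang="en")
sentence = "Isn't this great?"
# Moses rules, without XML-escaping special characters.
print(moses_tokenizer.tokenize(sentence, escape=False))
# Penn Treebank-style rules via the same object.
print(moses_tokenizer.penn_tokenize(sentence))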