How to use the contractions.Contractions class in contractions

To help you get started, we’ve selected a few contractions examples, based on popular ways the package is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github andre-martins / TurboParser / python / tokenizer / italian_contractions.py View on Github external
# -*- coding: utf-8 -*-

import regex
import os
import codecs
from contractions import Contractions


class ItalianContractions(Contractions):
    def __init__(self):
        """Populate ``self.verbs`` from the lexicon shipped beside this module.

        The file ``italian_verbs.txt`` is looked up in the directory that
        contains this source file and decoded as UTF-8. Every line must
        consist of exactly three whitespace-separated fields (enforced with
        an ``assert``); only the first field of each line is stored.
        """
        # Resolve the lexicon relative to this module, not the working dir.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        lexicon_path = os.sep.join([module_dir, 'italian_verbs.txt'])

        self.verbs = set()
        with codecs.open(lexicon_path, encoding='utf8') as lexicon:
            for entry in lexicon:
                columns = entry.rstrip('\n').split()
                assert len(columns) == 3
                self.verbs.add(columns[0])

    def split_if_contraction(self, word):
        original_word = word

        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^([A|a])l$', ur'\1 il', word)
github andre-martins / TurboParser / python / tokenizer / english_contractions.py View on Github external
# -*- coding: utf-8 -*-

import regex
from contractions import Contractions


class EnglishContractions(Contractions):
    def __init__(self):
        """Compile the three groups of English contraction patterns.

        Each group is stored as a list of compiled ``regex`` patterns with
        two capture groups, in the same order as listed below:

        * ``self.CONTRACTIONS``  -- apostrophe suffixes ('s, 'm, 'd, 'll,
          're, 've, n't and a bare apostrophe).
        * ``self.CONTRACTIONS2`` -- case-insensitive fused words such as
          "cannot", "gimme", "gonna".
        * ``self.CONTRACTIONS3`` -- case-insensitive "'tis" / "'twas",
          matched only after a space.
        """
        # List of contractions adapted from Robert MacIntyre's tokenizer.
        # These were in turn collected from the TreebankWordTokenizer in NLTK.
        suffix_patterns = (
            r"([^' ])('[sS]|'[mM]|'[dD]|')\b",
            r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b",
        )
        fused_word_patterns = (
            r"(?i)\b(can)(not)\b",
            r"(?i)\b(d)('ye)\b",
            r"(?i)\b(gim)(me)\b",
            r"(?i)\b(gon)(na)\b",
            r"(?i)\b(got)(ta)\b",
            r"(?i)\b(lem)(me)\b",
            r"(?i)\b(mor)('n)\b",
            # NOTE(review): this one ends in a literal space rather than \b.
            r"(?i)\b(wan)(na) ",
        )
        leading_t_patterns = (
            r"(?i) ('t)(is)\b",
            r"(?i) ('t)(was)\b",
        )

        self.CONTRACTIONS = [regex.compile(p) for p in suffix_patterns]
        self.CONTRACTIONS2 = [regex.compile(p) for p in fused_word_patterns]
        self.CONTRACTIONS3 = [regex.compile(p) for p in leading_t_patterns]
github andre-martins / TurboParser / python / tokenizer / portuguese_contractions.py View on Github external
# -*- coding: utf-8 -*-

import regex
from contractions import Contractions


class PortugueseContractions(Contractions):
    def __init__(self):
        """Initialise the flags and lookup tables for Portuguese.

        Sets ``non_contractions`` and ``mark_enclitics`` first, then builds
        ``contractions`` via ``_generate_contractions()`` and the pair
        ``clitics`` / ``clitic_suffixes`` via ``_generate_clitics()``.
        """
        # A blacklist of words that should not be confused with contractions
        # (starts out empty).
        self.non_contractions = {}

        # If True, mark consonants removed due to enclitics with symbols # and
        # -CL- for mesoclitics.
        self.mark_enclitics = False

        self.contractions = self._generate_contractions()

        clitics, clitic_suffixes = self._generate_clitics()
        self.clitics = clitics
        self.clitic_suffixes = clitic_suffixes

    def _generate_contractions(self):
        """
        Generate contractions for Portuguese, along with the words
        and lemmas that are contracted (e.g. contraction "das" is composed by
        words "de" + "as", with corresponding lemmas "de" + "o".
        Return a dictionary of contractions, each entry containing a list of
        words and a list of lemmas (typically lists of length two).
github andre-martins / TurboParser / python / tokenizer / spanish_contractions.py View on Github external
# -*- coding: utf-8 -*-

import codecs
import os

import regex

from contractions import Contractions


class SpanishContractions(Contractions):
    def __init__(self):
        # A blacklist of words that should not be confused with contractions.
        self.non_contractions = {}  # {u'perla', u'perlas', u'arte', u'parte', \
                                    # u'aparte'}
        # A whitelist of frequent words that regexes are not getting but are
        # contractions.
        self.contractions = {}
        verbs = []  # [u'convencer', u'haber', u'hacer', u'meter', u'vender', \
                    # u'poner', u'tener', u'comer', u'mover', u'atender', \
                    # u'responder', u'devolver', u'dar']
        for verb in verbs:
            for suffix in [u'me', u'te', u'nos', u'os']:
                self.contractions[verb + suffix] = [verb, suffix]

        # Load Spanish verbs and their inflections from a lexicon.
        filepath = os.sep.join([os.path.dirname(os.path.realpath(__file__)),
github andre-martins / TurboParser / python / tokenizer / french_contractions.py View on Github external
# -*- coding: utf-8 -*-

import regex
from contractions import Contractions


class FrenchContractions(Contractions):
    def __init__(self):
        """Create the splitter; no instance attributes are set here.

        The base-class initializer is not invoked.
        """

    def split_if_contraction(self, word):
        # Handle preposition+determiner contractions.
        word = regex.sub(ur'^(A|a)u$', ur'à le', word)
        word = regex.sub(ur'^(A|a)uquel$', ur'à lequel', word)
        word = regex.sub(ur'^(A|a)ux$', ur'à les', word)
        word = regex.sub(ur'^(A|a)uxquels$', ur'à lesquels', word)
        word = regex.sub(ur'^(A|a)uxquelles$', ur'à lesquelles', word)
        word = regex.sub(ur'^(D|d)u$', ur'de le', word)
        word = regex.sub(ur'^(D|d)uquel$', ur'de lequel', word)
        word = regex.sub(ur'^(D|d)es$', ur'de les', word)
        word = regex.sub(ur'^(D|d)esquels$', ur'de lesquels', word)
        word = regex.sub(ur'^(D|d)esquelles$', ur'de lesquelles', word)

contractions

Fixes contractions such as `you're` to `you are`

MIT
Latest version published 2 years ago

Package Health Score

55 / 100
Full package analysis