How to use the scrubadub.utils.CanonicalStringSet class in scrubadub

To help you get started, we’ve selected a few scrubadub examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_init(self):
        """Mixed-case spellings given to the constructor collapse into a
        single lower-case member.
        """
        s = CanonicalStringSet(['TKTK', 'tKtK', 'Tktk'])
        # assertIn names the missing member and the container on failure,
        # whereas assertTrue on a precomputed boolean only reports
        # "False is not true".
        self.assertIn('tktk', s)
        self.assertEqual(len(s), 1)
github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_add(self):
        """Adding several mixed-case spellings one at a time leaves a single
        lower-case member in the set.
        """
        s = CanonicalStringSet()
        for spelling in ('TKTK', 'tKtK', 'Tktk'):
            s.add(spelling)
        # assertIn gives a failure message that shows the member and the
        # set contents instead of a bare "False is not true".
        self.assertIn('tktk', s)
        self.assertEqual(len(s), 1)
github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_update(self):
        """Bulk ``update`` with mixed-case spellings leaves a single
        lower-case member in the set.
        """
        s = CanonicalStringSet()
        s.update(['TKTK', 'tKtK', 'Tktk'])
        # assertIn reports the member and the container when it fails.
        self.assertIn('tktk', s)
        self.assertEqual(len(s), 1)
github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_contains(self):
        """Membership checks succeed for any casing of a stored string."""
        s = CanonicalStringSet(['tktk'])
        # Same spellings and order as before; assertIn identifies exactly
        # which spelling was not found if the check fails.
        for spelling in ('TKTK', 'Tktk', 'tKtK'):
            self.assertIn(spelling, s)
github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_remove(self):
        """Removing with a different casing deletes the stored member."""
        s = CanonicalStringSet(['tktk'])
        s.remove('TKTK')
        # assertNotIn shows the offending member and the set on failure.
        self.assertNotIn('tktk', s)
github datascopeanalytics / scrubadub / tests / test_canonical_string_set.py View on Github external
def test_discard(self):
        """Discarding with different casings deletes the stored member and
        repeated discards do not raise.
        """
        s = CanonicalStringSet(['tktk'])
        # Only the first call can find the member; the test relies on the
        # later calls completing without an exception.
        for spelling in ('TKTK', 'TkTk', 'Tktk'):
            s.discard(spelling)
        # assertNotIn shows the offending member and the set on failure.
        self.assertNotIn('tktk', s)
github datascopeanalytics / scrubadub / scrubadub / utils.py View on Github external
def add(self, element):
        """Insert ``element`` into the set after passing it through
        ``self._cast_as_lower``; returns whatever the parent ``add`` returns.
        """
        canonical = self._cast_as_lower(element)
        return super(CanonicalStringSet, self).add(canonical)
github datascopeanalytics / scrubadub / scrubadub / detectors / name.py View on Github external
import textblob

from .base import RegexDetector
from ..filth import NameFilth
from ..utils import CanonicalStringSet


class NameDetector(RegexDetector):
    """Use part of speech tagging to clean proper nouns out of the dirty dirty
    ``text``. Disallow particular nouns by adding them to the
    ``NameDetector.disallowed_nouns`` set.
    """
    filth_cls = NameFilth

    # Words in this set are never collected as proper nouns by iter_filth,
    # even when the tagger labels them as such.
    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text):
        """Collect the proper nouns that textblob finds in ``text``.

        Raises ``TypeError`` if ``disallowed_nouns`` has been replaced with
        something that is not a ``CanonicalStringSet``.

        NOTE(review): this excerpt ends after the collection loop; how the
        collected nouns are turned into filth is not visible here.
        """

        # Guard against callers overwriting the class attribute with a plain
        # set or list.
        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'NameDetector.disallowed_nouns must be CanonicalStringSet'
            )

        # find the set of proper nouns using textblob.
        proper_nouns = set()
        blob = textblob.TextBlob(text)
        for word, part_of_speech in blob.tags:
            # "NNP" / "NNPS" are the singular / plural proper-noun tags.
            is_proper_noun = part_of_speech in ("NNP", "NNPS")
            # The word is lower-cased for the disallowed check, but the
            # original casing is what gets stored.
            if is_proper_noun and word.lower() not in self.disallowed_nouns:
                proper_nouns.add(word)
github datascopeanalytics / scrubadub / scrubadub / detectors / name.py View on Github external
def iter_filth(self, text):

        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'NameDetector.disallowed_nouns must be CanonicalStringSet'
            )

        # find the set of proper nouns using textblob.
        proper_nouns = set()
        blob = textblob.TextBlob(text)
        for word, part_of_speech in blob.tags:
            is_proper_noun = part_of_speech in ("NNP", "NNPS")
            if is_proper_noun and word.lower() not in self.disallowed_nouns:
                proper_nouns.add(word)

        # use a regex to replace the proper nouns by first escaping any
        # lingering punctuation in the regex
        # http://stackoverflow.com/a/4202559/564709
        if proper_nouns: