How to use the scrubadub.detectors.base.RegexDetector class in scrubadub

To help you get started, we’ve selected a few scrubadub examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datascopeanalytics / scrubadub / tests / test_detector.py View on Github external
def test_regex_filth(self):
        """Iterating a RegexDetector whose ``filth_cls`` is a plain ``Filth``
        subclass must raise ``UnexpectedFilth``.
        """
        sample = 'dirty dirty text'

        class MyFilth(Filth):
            pass

        class MyDetector(RegexDetector):
            filth_cls = MyFilth

        regex_detector = MyDetector()
        with self.assertRaises(UnexpectedFilth):
            # Exhaust the iterator; the yielded values are irrelevant here.
            for _ in regex_detector.iter_filth(sample):
                pass
github datascopeanalytics / scrubadub / scrubadub / detectors / name.py View on Github external
import re

import textblob

from .base import RegexDetector
from ..filth import NameFilth
from ..utils import CanonicalStringSet


class NameDetector(RegexDetector):
    """Use part of speech tagging to clean proper nouns out of the dirty dirty
    ``text``. Disallow particular nouns by adding them to the
    ``NameDetector.disallowed_nouns`` set.
    """
    # Filth class associated with this detector.
    filth_cls = NameFilth

    # Class-level set of nouns, seeded with "skype". ``iter_filth`` refuses to
    # run unless this is still a ``CanonicalStringSet``.
    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text):
        """Validate the detector's configuration before scanning ``text``.

        :raises TypeError: if ``disallowed_nouns`` is not a
            ``CanonicalStringSet``.

        NOTE(review): only the validation step is visible in this excerpt;
        the proper-noun search announced by the trailing comment is not shown.
        """

        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'NameDetector.disallowed_nouns must be CanonicalStringSet'
            )

        # find the set of proper nouns using textblob.
github datascopeanalytics / scrubadub / scrubadub / detectors / email.py View on Github external
import re

from .base import RegexDetector
from ..filth import EmailFilth


class EmailDetector(RegexDetector):
    """Use regular expression magic to remove email addresses from dirty
    dirty ``text``. This detector also catches email addresses like ``john at
    gmail.com``.
    """
    # Filth class associated with this detector.
    # NOTE(review): the regular expression itself is not visible in this
    # excerpt.
    filth_cls = EmailFilth
github datascopeanalytics / scrubadub / scrubadub / detectors / skype.py View on Github external
import re

import nltk
import textblob

from .base import RegexDetector
from ..filth import SkypeFilth


class SkypeDetector(RegexDetector):
    """Skype usernames tend to be used inline in dirty dirty text quite
    often but also appear as ``skype: {{SKYPE}}`` quite a bit. This method
    looks at words within ``word_radius`` words of "skype" for things that
    appear to be misspelled or have punctuation in them as a means to
    identify skype usernames.

    Default ``word_radius`` is 10, corresponding with the rough scale of
    half of a sentence before or after the word "skype" is used. Increasing
    the ``word_radius`` will increase the false positive rate and
    decreasing the ``word_radius`` will increase the false negative rate.
    """
    filth_cls = SkypeFilth

    word_radius = 10

    def iter_filth(self, text):
github datascopeanalytics / scrubadub / scrubadub / detectors / url.py View on Github external
from .base import RegexDetector
from ..filth import UrlFilth


class UrlDetector(RegexDetector):
    """Use regular expressions to remove URLs that begin with ``http://``,
    ``https://`` or ``www.`` from dirty dirty ``text``.

    With ``keep_domain=True``, this detector only obfuscates the path on a
    URL, not its domain. For example,
    ``http://twitter.com/someone/status/234978haoin`` becomes
    ``http://twitter.com/{{replacement}}``.
    """
    # Filth class associated with this detector.
    # NOTE(review): the regular expression and the ``keep_domain`` handling
    # described above are not visible in this excerpt.
    filth_cls = UrlFilth