How to use the scrubadub.filth.base.RegexFilth class in scrubadub

To help you get started, we’ve selected a few scrubadub examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datascopeanalytics / scrubadub / scrubadub / filth / skype.py View on Github external
import re

from .base import RegexFilth


class SkypeFilth(RegexFilth):
    """Filth type for Skype usernames.

    Unlike the other filth classes shown alongside it, this class does not
    define a single ``regex`` attribute here; it exposes two compiled
    patterns (``SKYPE_TOKEN`` and ``SKYPE_USERNAME``) built from the shared
    ``_SKYPE`` fragment.
    """
    type = 'skype'

    # _SKYPE is the shared pattern fragment: one ASCII letter followed by a
    # character class of letters, digits, underscore, hyphen, comma and
    # period. The fragment deliberately ends on the character class so that a
    # quantifier appended to it applies to that class only.
    #
    # SKYPE_TOKEN appends "+" and is used to chunk text into tokens so that
    # every valid skype username ends up as a single token. Note that the
    # literal word "skype" itself matches SKYPE_TOKEN.
    # NOTE(review): presumably the detector relies on that to locate the word
    # "skype" among the tokens -- confirm against the detector code.
    #
    # SKYPE_USERNAME appends "{5,31}", i.e. the leading letter plus 5-31
    # further characters, which imposes the 6-32 character limit on the
    # username. adapted from http://bit.ly/1FQs1hD
    _SKYPE = r'[a-zA-Z][a-zA-Z0-9_\-\,\.]'
    SKYPE_TOKEN = re.compile(_SKYPE+'+')
    SKYPE_USERNAME = re.compile(_SKYPE+'{5,31}')
github datascopeanalytics / scrubadub / scrubadub / filth / email.py View on Github external
import re

from .base import RegexFilth


class EmailFilth(RegexFilth):
    type = 'email'

    # there may be better solutions than this out there and this certainly
    # doesn't do that great of a job with people that spell out the
    # hyphenation of their email address, but it's a pretty solid start.
    #
    # adapted from https://gist.github.com/dideler/5219706
    regex = re.compile((
        "[a-z0-9!#$%&'*+\/=?^_`{|}~-]+"             # start with this character
        "(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*"      # valid next characters
        "(@|\sat\s)"                                # @ or at fanciness
        "(?:"
        "[a-z0-9]"                                  # domain starts like this
        "(?:[a-z0-9-]*[a-z0-9])?"                   # might have this
        "(\.|\sdot\s)"                              # . or dot fanciness
        ")+"                                        # repeat as necessary
github datascopeanalytics / scrubadub / scrubadub / filth / name.py View on Github external
from .base import RegexFilth


class NameFilth(RegexFilth):
    """Filth type labelled ``'name'``.

    The class only sets its ``type`` tag; it declares no pattern of its own
    in this module.
    """
    # label identifying this kind of filth
    type = 'name'
github datascopeanalytics / scrubadub / scrubadub / filth / url.py View on Github external
import re

from .base import RegexFilth


class UrlFilth(RegexFilth):
    type = 'url'

    # This allows you to keep the domain
    keep_domain = False

    # this can be used to customize the output, particularly when
    # keep_domain=True
    url_placeholder = type.upper()

    # this regular expression is convenient for captures the domain name
    # and the path separately, which is useful for keeping the domain name
    # but sanitizing the path altogether
    regex = re.compile(r'''
        (?P<domain>
            (https?:\/\/(www\.)?|www\.)          # protocol http://, etc
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
github datascopeanalytics / scrubadub / scrubadub / filth / credential.py View on Github external
import re

from .base import RegexFilth


class CredentialFilth(RegexFilth):
    type = 'credential'

    # specify how the username/password are replaced
    username_placeholder = 'USERNAME'
    password_placeholder = 'PASSWORD'

    # this regular expression searches for patterns like
    #     "username: root password: root"
    # that tend to occur very frequently in text. This does not currently catch
    # things like "username / password is root / root"
    regex = re.compile(r'''
        (username|login|u:)\s*:?\s*    # username might have : and whitespace
        (?P<username>[\w\-\.@+]*)      # capture the username for replacement
        \s+                            # some whitespace between
        (password|pw|p:)\s*:?\s*       # password might have : and whitespace
        (?P<password>.*)               # password can be anything until EOL
github datascopeanalytics / scrubadub / scrubadub / filth / ssn.py View on Github external
import re

from .base import RegexFilth


class SSNFilth(RegexFilth):
    """Filth type for U.S. Social Security numbers.

    ``regex`` matches three digits, a separator, two digits, a separator and
    four digits, where each separator is a hyphen, a period or a space.
    """
    type = 'ssn'

    # please note that this not only captures valid SSNs but also invalid ones.
    # This choice is deliberate in that we want to be biased toward replacing
    # any filth with a cleaner alternative.
    # https://en.wikipedia.org/wiki/Social_Security_number#Valid_SSNs
    #
    # The pieces are raw strings so that "\-" reaches the regex engine as
    # written instead of being an invalid string escape (which newer Python
    # versions warn about). The compiled pattern text is unchanged.
    #
    # re.VERBOSE only ignores whitespace outside character classes, so the
    # space inside "[\-. ]" is still a valid separator.
    regex = re.compile((
        r"[0-9][0-9][0-9]"       # first three digits
        r"[\-. ]"                # separator
        r"[0-9][0-9]"            # next two digits
        r"[\-. ]"                # separator
        r"[0-9][0-9][0-9][0-9]"  # last four digits
    ), re.VERBOSE)
github datascopeanalytics / scrubadub / scrubadub / filth / base.py View on Github external
def __init__(self, match):
        """Initialise the filth from a regex match object.

        The match is kept on ``self.match``; its start offset, end offset and
        the matched slice of the searched string are handed to the parent
        initialiser as ``beg``, ``end`` and ``text``.
        """
        self.match = match
        start = match.start()
        stop = match.end()
        matched_text = match.string[start:stop]
        super(RegexFilth, self).__init__(
            beg=start, end=stop, text=matched_text,
        )
github datascopeanalytics / scrubadub / scrubadub / filth / __init__.py View on Github external
def iter_filths():
    """Yield one instance of every filth class.

    Classes that are not ``RegexFilth`` subclasses are instantiated with no
    arguments. ``RegexFilth`` subclasses need a match object, so each one is
    given a throwaway match produced from a fixed dummy string.
    """
    for filth_cls in iter_filth_clss():
        if not issubclass(filth_cls, RegexFilth):
            yield filth_cls()
            continue
        # first whitespace run in the dummy text serves as a stand-in match
        placeholder = re.search(r"\s+", "fake pattern string")
        yield filth_cls(placeholder)