Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import re
from .base import RegexFilth
class SkypeFilth(RegexFilth):
    """Filth type for Skype usernames, plus the patterns used to spot them."""

    type = 'skype'

    # ``_SKYPE`` is the shared building block for both compiled patterns: a
    # single leading ASCII letter followed by a character class of letters,
    # digits, underscore, hyphen, comma and period. The two patterns differ
    # only in the quantifier applied to that trailing character class:
    #
    # * ``SKYPE_TOKEN`` allows one or more trailing characters and is used to
    #   chunk text into tokens, so that every valid skype username ends up
    #   as a single token. Importantly, the word "skype" itself must pass
    #   this regex.
    # * ``SKYPE_USERNAME`` allows 5 to 31 trailing characters, which imposes
    #   the 6-32 character limit on the username as a whole.
    #
    # Adapted from http://bit.ly/1FQs1hD
    _SKYPE = r'[a-zA-Z][a-zA-Z0-9_\-\,\.]'
    SKYPE_TOKEN = re.compile(_SKYPE + '+')
    SKYPE_USERNAME = re.compile(_SKYPE + '{5,31}')
import re
from .base import RegexFilth
class EmailFilth(RegexFilth):
# Filth class labelled 'email'. Its `regex` attribute is built from the
# adjacent string pieces below; the pieces accept both a literal "@" / "."
# and the spelled-out " at " / " dot " forms.
type = 'email'
# there may be better solutions than this out there and this certainly
# doesn't do that great of a job with people that spell out the
# hyphenation of their email address, but it's a pretty solid start.
#
# adapted from https://gist.github.com/dideler/5219706
#
# NOTE(review): the `re.compile((` call opened below is never closed within
# this excerpt, so the tail of the pattern and any flags passed to
# re.compile are not visible here -- confirm against the full source.
# NOTE(review): the pattern pieces are ordinary (non-raw) string literals
# that contain backslash sequences such as "\/", "\." and "\s"; raw strings
# would presumably be the safer spelling -- verify before changing.
regex = re.compile((
"[a-z0-9!#$%&'*+\/=?^_`{|}~-]+" # start with this character
"(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*" # valid next characters
"(@|\sat\s)" # @ or at fanciness
"(?:"
"[a-z0-9]" # domain starts like this
"(?:[a-z0-9-]*[a-z0-9])?" # might have this
"(\.|\sdot\s)" # . or dot fanciness
")+" # repeat as necessary
from .base import RegexFilth
class NameFilth(RegexFilth):
    """Filth type labelled ``'name'``.

    The class only sets the ``type`` label; it defines no pattern of its
    own here.
    """

    type = 'name'
import re
from .base import RegexFilth
class UrlFilth(RegexFilth):
# Filth class labelled 'url'. Besides the label it defines two class-level
# settings (`keep_domain`, `url_placeholder`) and the URL-matching `regex`.
type = 'url'
# This allows you to keep the domain (off by default). How the flag is
# consumed is not visible in this excerpt.
keep_domain = False
# this can be used to customize the output, particularly when
# keep_domain=True. It is derived from the class-level `type` defined
# above, so it evaluates to 'URL' here.
url_placeholder = type.upper()
# this regular expression is convenient for capturing the domain name
# and the path separately, which is useful for keeping the domain name
# but sanitizing the path altogether
#
# NOTE(review): the triple-quoted pattern opened below is not terminated
# within this excerpt, so the rest of the pattern and the re.compile flags
# are not visible here.
# NOTE(review): the "(?P" on the first pattern line has no "<name>" after
# it; presumably a named group whose name was lost when this snippet was
# extracted -- confirm against the full source.
regex = re.compile(r'''
(?P
(https?:\/\/(www\.)?|www\.) # protocol http://, etc
[\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
import re
from .base import RegexFilth
class CredentialFilth(RegexFilth):
# Filth class labelled 'credential'. It defines the two placeholder strings
# used for usernames and passwords and the `regex` that looks for
# "username ... password ..." pairs.
type = 'credential'
# specify how the username/password are replaced
username_placeholder = 'USERNAME'
password_placeholder = 'PASSWORD'
# this regular expression searches for patterns like
# "username: root password: root"
# that tend to occur very frequently in text. This does not currently catch
# things like "username / password is root / root"
#
# NOTE(review): the triple-quoted pattern opened below is not terminated
# within this excerpt, so the re.compile flags are not visible here. The
# in-pattern "#" comments suggest verbose mode is intended -- TODO confirm.
# NOTE(review): both "(?P" groups below have no "<name>" after "(?P";
# presumably named groups (for the username and the password) whose names
# were lost when this snippet was extracted -- confirm against the full
# source.
regex = re.compile(r'''
(username|login|u:)\s*:?\s* # username might have : and whitespace
(?P[\w\-\.@+]*) # capture the username for replacement
\s+ # some whitespace between
(password|pw|p:)\s*:?\s* # password might have : and whitespace
(?P.*) # password can be anything until EOL
import re
from .base import RegexFilth
class SSNFilth(RegexFilth):
    """Filth type for text shaped like a US Social Security number.

    The pattern is three digits, a separator, two digits, a separator and
    four digits, where each separator is a hyphen, a period or a space.
    """

    type = 'ssn'

    # Please note that this captures not only valid SSNs but also invalid
    # ones. This choice is deliberate: we want to be biased toward replacing
    # any filth with a cleaner alternative.
    # https://en.wikipedia.org/wiki/Social_Security_number#Valid_SSNs
    #
    # The pieces are raw strings so that ``\-`` reaches the regex engine
    # as written instead of relying on Python keeping an unrecognised escape
    # sequence (which emits an invalid-escape warning). The resulting
    # pattern text is exactly the same as before.
    #
    # re.VERBOSE is kept as in the original; whitespace inside a character
    # class stays significant in verbose mode, so the space separator in
    # ``[\-. ]`` still matches.
    regex = re.compile((
        r"[0-9][0-9][0-9]"        # first three digits
        r"[\-. ]"                 # separator: hyphen, period or space
        r"[0-9][0-9]"             # next two digits
        r"[\-. ]"                 # separator: hyphen, period or space
        r"[0-9][0-9][0-9][0-9]"   # last four digits
    ), re.VERBOSE)
def __init__(self, match):
    """Remember the regex match and initialise the parent from its span.

    :param match: a match object; it must provide ``start()``, ``end()``
        and ``string``, which are the only members used here.
    """
    self.match = match
    # Read the span once and reuse it for both the offsets and the slice.
    first = match.start()
    last = match.end()
    super(RegexFilth, self).__init__(
        beg=first,
        end=last,
        text=match.string[first:last],
    )
def iter_filths():
    """Yield one instance of every filth class from ``iter_filth_clss()``.

    Classes that are not ``RegexFilth`` subclasses are instantiated with no
    arguments. ``RegexFilth`` subclasses need a match object, so each one is
    given a throwaway match of the whitespace pattern against a fixed
    placeholder string.
    """
    for cls in iter_filth_clss():
        if not issubclass(cls, RegexFilth):
            yield cls()
            continue
        # First whitespace run in the fixed string; it always matches.
        placeholder_match = re.search(r"\s+", "fake pattern string")
        yield cls(placeholder_match)