How to use the scrubadub.Scrubber class in scrubadub

To help you get started, we’ve selected a few scrubadub examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datascopeanalytics / scrubadub / tests / test_advanced.py View on Github external
def test_disable_email(self):
        """
        BEFORE: contact Joe Duffy at joe@example.com
        AFTER:  contact {{NAME}} {{NAME}} at joe@example.com
        """
        # NOTE(review): the BEFORE/AFTER lines above look like the fixture
        # that get_before_after() hands back -- confirm before editing them.
        before, after = self.get_before_after()
        import scrubadub

        # Build a scrubber without the email detector; per the AFTER line the
        # address is expected to come through unchanged.
        email_free_scrubber = scrubadub.Scrubber()
        email_free_scrubber.remove_detector('email')
        cleaned = email_free_scrubber.clean(before)
        self.check_equal(after, cleaned)
github datascopeanalytics / scrubadub / tests / test_scrubbers.py View on Github external
def test_filth_merge_placeholder(self):
        """filths should be merged into the biggest filth"""
        text = "you can skype me at john.doe"
        scrubber = scrubadub.Scrubber()
        filths = list(scrubber.iter_filth(text))
        # Guard against a vacuous pass: with no filth the loop below would
        # never run and the test would succeed without checking anything.
        self.assertTrue(filths, "no filth detected in %r" % text)
        for filth in filths:
            self.assertIsInstance(filth, MergedFilth)
            # The merged placeholder must mention both overlapping types;
            # assertIn reports both operands when it fails.
            self.assertIn('SKYPE', filth.placeholder)
            self.assertIn('EMAIL', filth.placeholder)
github datascopeanalytics / scrubadub / tests / test_scrubbers.py View on Github external
def test_filth_ordering(self):
        """Filth must be yielded in the order it appears in the text."""
        text = (
            "Alan can be reached by email alan@example.com or "
            "phone +1.312.456.6421"
        )
        scrubber = scrubadub.Scrubber()
        # Flatten each filth's (beg, end) pair into one list of offsets.
        positions = [
            offset
            for filth in scrubber.iter_filth(text)
            for offset in (filth.beg, filth.end)
        ]
        # If the offsets are already sorted, the filth came back in order.
        self.assertEqual(sorted(positions), positions)
github datascopeanalytics / scrubadub / tests / test_advanced.py View on Github external
def test_customize_filth_identification(self):
        """
        BEFORE: contact Joe Duffy at joe@example.com
        AFTER:  contact <b>NAME</b> <b>NAME</b> at <b>EMAIL</b>
        """
        # NOTE(review): the BEFORE/AFTER lines above look like the fixture
        # that get_before_after() hands back -- confirm before editing them.
        before, after = self.get_before_after()
        import scrubadub
        # prefix/suffix are set on the Filth class itself, so the override is
        # shared state; remember the originals so they can be put back.
        prefix = scrubadub.filth.base.Filth.prefix
        suffix = scrubadub.filth.base.Filth.suffix
        try:
            scrubadub.filth.base.Filth.prefix = u'<b>'
            scrubadub.filth.base.Filth.suffix = u'</b>'
            scrubber = scrubadub.Scrubber()
            self.check_equal(after, scrubber.clean(before))
        finally:
            # Restore even when the check above fails; otherwise the <b>
            # markers would stay on the class for whatever runs next.
            scrubadub.filth.base.Filth.prefix = prefix
            scrubadub.filth.base.Filth.suffix = suffix
github datascopeanalytics / scrubadub / tests / test_scrubbers.py View on Github external
def test_add_non_detector(self):
        """Adding something that is not a Detector must raise TypeError."""
        scrubber = scrubadub.Scrubber()

        class NotDetector(object):
            pass

        # Callable form of assertRaises: it invokes add_detector(NotDetector)
        # and passes only if TypeError is raised.
        self.assertRaises(TypeError, scrubber.add_detector, NotDetector)
github datascopeanalytics / scrubadub / tests / test_scrubbers.py View on Github external
def test_filth_merge(self):
        """Overlapping filth should collapse into exactly one result."""
        # 'me at john.doe' looks like an email address, and the sentence also
        # looks like a skype reference
        scrubber = scrubadub.Scrubber()
        text = "you can skype me at john.doe"
        found = list(scrubber.iter_filth(text))
        self.assertEqual(len(found), 1)
github datascopeanalytics / scrubadub / tests / test_scrubbers.py View on Github external
def test_add_duplicate_detector(self):
        """Adding a detector that already exists must raise KeyError."""
        scrubber = scrubadub.Scrubber()
        with self.assertRaises(KeyError):
            duplicate = scrubadub.detectors.email.EmailDetector
            scrubber.add_detector(duplicate)
github datascopeanalytics / scrubadub / design / customize_filth_resolution.py View on Github external
"""scrubadub ships with a very good method for resolving conflicts between
overlapping pieces of filth. There may be cases where it is necessary to
resolve these conflicts in a customized way to account for additional
information that someone might have.

For example, a user may preferentially want to remove any hint of a name from
text.
"""

import scrubadub
from scrubadub.filth import NameFilth

class MyScrubber(scrubadub.Scrubber):
    """Scrubber that lets a NameFilth win any conflict between filths."""

    def resolve_conflicting_filth(self, *filths):
        """Return the first NameFilth among ``filths``, if there is one.

        When none of the candidates is a NameFilth, the decision is left to
        the parent class's ``resolve_conflicting_filth``.
        """
        name_filth = next(
            (candidate for candidate in filths
             if isinstance(candidate, NameFilth)),
            None
        )
        if name_filth is not None:
            return name_filth
        return super(MyScrubber, self).resolve_conflicting_filth(*filths)

# these methods on a Scrubber object should have identical behavior to the
# scrubadub.clean convenience function
# NOTE(review): `text` is not defined in the visible part of this file --
# presumably it stands for the caller's input string; confirm.
scrubber = MyScrubber()
scrubber.clean(text)
scrubber.clean(text, replace_with="placeholder")
scrubber.clean(text, replace_with="identifier")
scrubber.clean(text, replace_with="surrogate")
github datascopeanalytics / scrubadub / design / customize_filth_detection.py View on Github external
a product, then a user should be able to easily adapt how scrubadub identifies
names.
"""

import scrubadub

# fine-tune how scrubadub detects names and omit product names
# https://github.com/deanmalmgren/scrubadub/issues/6
class MyNameDetector(scrubadub.detectors.NameDetector):
    """Name detector that does not report the product name "iPhone"."""

    def iter_filth(self, text):
        """Yield the parent detector's filth, skipping matches of "iPhone".

        ``text`` is passed unchanged to the parent ``iter_filth``.
        """
        for filth in super(MyNameDetector, self).iter_filth(text):
            # Compare the matched string rather than the Filth object: an
            # object-vs-str comparison would not filter anything out.
            # NOTE(review): assumes Filth exposes the matched string as
            # `.text` -- confirm against the installed scrubadub version.
            if filth.text != "iPhone":
                yield filth

# instantiate a scrubber and change the name detector to use our custom class
scrubber = scrubadub.Scrubber()
scrubber.detectors['name'] = MyNameDetector()

# these methods on a Scrubber object should have identical behavior to the
# scrubadub.clean convenience function
# NOTE(review): `text` and `lookup` are not defined in the visible part of
# this file -- presumably supplied by the caller; confirm.
clean_text = scrubber.clean(text)
clean_text = scrubber.clean(text, replace_with="placeholder")
clean_text = scrubber.clean(text, replace_with="surrogate")
clean_text = scrubber.clean(text, replace_with="identifier", lookup=lookup)