How to use the scrubadub.filth.Filth class in scrubadub

To help you get started, we’ve selected a few scrubadub examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github datascopeanalytics / scrubadub / tests / test_filth.py View on Github external
def test_nonoverlapping_filth(self):
        """merging two filths whose spans do not overlap must fail

        The merge is attempted in both directions; each attempt is expected
        to raise ``FilthMergeError``.
        """
        first = Filth(beg=0, end=3, text="the")
        second = Filth(beg=4, end=7, text="end")
        # same check in both orders: first <- second, then second <- first
        for target, other in ((first, second), (second, first)):
            with self.assertRaises(FilthMergeError):
                target.merge(other)
github datascopeanalytics / scrubadub / tests / test_filth.py View on Github external
def test_text_merge(self):
        """merged filth should carry the full text spanned by both pieces

        Two overlapping slices of the same string ([0:3] and [1:7]) are
        merged, and the merged ``text`` is compared with the whole string.
        """
        class SomeFilth(Filth):
            type = 'something'

        text = "the end"
        head = SomeFilth(beg=0, end=3, text=text[:3])
        tail = SomeFilth(beg=1, end=7, text=text[1:])
        self.assertEqual(head.merge(tail).text, text)
github datascopeanalytics / scrubadub / tests / test_filth.py View on Github external
def test_nonoverlapping_filth(self):
        """non-overlapping filth cannot be merged, whichever side merges

        Uses the callable form of ``assertRaises`` to check that ``merge``
        raises ``FilthMergeError`` in both directions.
        """
        left = Filth(beg=0, end=3, text="the")
        right = Filth(beg=4, end=7, text="end")
        self.assertRaises(FilthMergeError, left.merge, right)
        self.assertRaises(FilthMergeError, right.merge, left)
github datascopeanalytics / scrubadub / tests / test_filth.py View on Github external
def test_disallowed_replace_with(self):
        """unsupported replace_with values should raise InvalidReplaceWith

        The same default-constructed ``Filth`` is asked to replace itself
        with each rejected value in turn.
        """
        filth = Filth()
        for rejected in ('surrogate', 'something_invalid'):
            with self.assertRaises(InvalidReplaceWith):
                filth.replace_with(rejected)
github datascopeanalytics / scrubadub / scrubadub / scrubbers.py View on Github external
def clean(self, text, **kwargs):
        """Return ``text`` with every piece of filth replaced.

        The filth spans come from ``self.iter_filth(text)``. The text between
        consecutive spans is kept as-is, and each span is substituted with
        the result of that filth's ``replace_with(**kwargs)``; all keyword
        arguments are forwarded to ``replace_with`` untouched.

        Raises ``exceptions.UnicodeRequired`` on Python 2 when ``text`` is
        not a ``unicode`` instance.
        """
        # Python 3 strings are already unicode, so the check (and the
        # ``unicode`` name) only applies on Python 2.
        if sys.version_info < (3, 0) and not isinstance(text, unicode):
            raise exceptions.UnicodeRequired

        pieces = []
        # A default-constructed Filth acts as the "previous" span before the
        # first real one, so the loop body needs no special first-iteration case.
        previous = Filth()
        for current in self.iter_filth(text):
            pieces.extend((
                text[previous.end:current.beg],   # untouched text in between
                current.replace_with(**kwargs),   # replacement for the span
            ))
            previous = current
        # whatever remains after the last span
        pieces.append(text[previous.end:])
        return u''.join(pieces)
github datascopeanalytics / scrubadub / scrubadub / scrubbers.py View on Github external
def iter_filth(self, text):
        """Iterate over the different types of filth that can exist.
        """
        # currently doing this by aggregating all_filths and then sorting
        # inline instead of with a Filth.__cmp__ method, which is apparently
        # much slower http://stackoverflow.com/a/988728/564709
        #
        # NOTE: we could probably do this in a more efficient way by iterating
        # over all detectors simultaneously. just trying to get something
        # working right now and we can worry about efficiency later
        all_filths = []
        for detector in self._detectors.values():
            for filth in detector.iter_filth(text):
                if not isinstance(filth, Filth):
                    raise TypeError('iter_filth must always yield Filth')
                all_filths.append(filth)

        # Sort by start position. If two filths start in the same place then
        # return the longer one first
        all_filths.sort(key=lambda f: (f.beg, -f.end))

        # this is where the Scrubber does its hard work and merges any
        # overlapping filths.
        if not all_filths:
            raise StopIteration
        filth = all_filths[0]
        for next_filth in all_filths[1:]:
            if filth.end < next_filth.beg:
                yield filth
                filth = next_filth