How to use the dedupe.api.ActiveMatching class in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / tests / test_api.py View on Github external
def test_markPair(self):
    # Exercises markPairs three ways: a label dict with an unexpected key
    # ('non_dupes') is expected to raise ValueError, a well-formed dict is
    # expected to populate training_data['label'], and a dict with two
    # empty lists is expected to emit exactly one warning.
    from collections import OrderedDict

    valid_labels = OrderedDict([
        ('distinct', DATA_SAMPLE[0:3]),
        ('match', DATA_SAMPLE[3:5]),
    ])
    invalid_labels = {
        'non_dupes': DATA_SAMPLE[0:3],
        'match': DATA_SAMPLE[3:5],
    }

    matcher = dedupe.api.ActiveMatching(self.field_definition)

    with self.assertRaises(ValueError):
        matcher.markPairs(invalid_labels)

    matcher.markPairs(valid_labels)

    # three 'distinct' pairs followed by two 'match' pairs, stored as bytes
    expected_labels = [b'distinct'] * 3 + [b'match'] * 2
    numpy.testing.assert_equal(matcher.training_data['label'],
                               expected_labels)

    with warnings.catch_warnings(record=True) as caught:
        # "always" keeps the warning from being suppressed by earlier filters
        warnings.simplefilter("always")
        matcher.markPairs({'match': [], 'distinct': []})
        assert len(caught) == 1
        latest = caught[-1]
        assert str(latest.message) == "Didn't return any labeled record pairs"
github dedupeio / dedupe / tests / test_api.py View on Github external
def test_initialize_fields(self):
        # Calling the constructor with no arguments is expected to raise
        # TypeError; an empty field definition is expected to be accepted.
        with self.assertRaises(TypeError):
            dedupe.api.ActiveMatching()

        empty_matcher = dedupe.api.ActiveMatching({})

        # a newly constructed matcher is expected to have no blocker yet
        assert empty_matcher.blocker is None
github dedupeio / dedupe / tests / test_api.py View on Github external
def test_add_training(self):
    # Adds the same labelled pairs twice and checks that the stored labels
    # grow by one full copy of the batch on each call.
    from collections import OrderedDict

    labelled = OrderedDict([
        ('distinct', DATA_SAMPLE[0:3]),
        ('match', DATA_SAMPLE[3:5]),
    ])
    matcher = dedupe.api.ActiveMatching(self.field_definition)

    # labels contributed by a single _addTrainingData call
    one_batch = [b'distinct'] * 3 + [b'match'] * 2

    for batches_added in (1, 2):
        matcher._addTrainingData(labelled)
        numpy.testing.assert_equal(matcher.training_data['label'],
                                   one_batch * batches_added)
github dedupeio / dedupe / tests / test_api.py View on Github external
def test_check_record(self):
    # _checkRecordPairType is expected to raise ValueError for each of the
    # malformed inputs below and to accept a pair of populated record dicts.
    matcher = dedupe.api.ActiveMatching(self.field_definition)

    malformed_inputs = [
        (),
        (1, 2),
        (1, 2, 3),
        ({}, {}),
    ]
    for candidate in malformed_inputs:
        with self.assertRaises(ValueError):
            matcher._checkRecordPairType(candidate)

    frank = {'name': 'Frank', 'age': '72'}
    bob = {'name': 'Bob', 'age': '27'}
    matcher._checkRecordPairType((frank, bob))
github dedupeio / dedupe / tests / test_api.py View on Github external
def test_initialize_fields(self):
    # The constructor is expected to reject a call with no arguments
    # (TypeError) and to accept an empty field definition.
    matcher_cls = dedupe.api.ActiveMatching
    with self.assertRaises(TypeError):
        matcher_cls()

    bare_matcher = dedupe.api.ActiveMatching({})

    # a newly constructed matcher is expected to have no blocker yet
    assert bare_matcher.blocker is None
github dedupeio / dedupe / dedupe / api.py View on Github external
self.activeLearner = training.ActiveLearning(self.data_sample,
                                                     self.data_model,
                                                     self.num_cores)

    def _loadSampledRecords(self, data_sample):
        """Override to load blocking data from data_sample.

        This implementation contains nothing but this docstring, so
        calling it does nothing and returns None.

        Arguments:
        data_sample -- the sample to load blocking data from; its
                       structure is not visible here (TODO confirm
                       against callers)
        """



class StaticDedupe(DedupeMatching, StaticMatching):
    """
    Mixin Class for Static Deduplication

    Combines DedupeMatching and StaticMatching. The class body defines
    no attributes or methods of its own, so all behavior is inherited
    from those two bases.
    """


class Dedupe(DedupeMatching, ActiveMatching):
    """
    Mixin Class for Active Learning Deduplication

    Public Methods
    - sample
    """
    canopies = True

    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Draw a sample of record pairs from the dataset
        (a mix of random pairs & pairs of similar records)
        and initialize active learning with this sample

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
github dedupeio / dedupe / dedupe / api.py View on Github external
record_pair[0].keys() and record_pair[1].keys()
        except AttributeError:
            raise ValueError("A pair of record_pairs must be made up of two "
                             "dictionaries ")

        self.data_model.check(record_pair[0])
        self.data_model.check(record_pair[1])


class StaticDedupe(DedupeMatching, StaticMatching):
    """
    Mixin Class for Static Deduplication

    Combines DedupeMatching and StaticMatching. The class body defines
    no attributes or methods of its own, so all behavior is inherited
    from those two bases.
    """


class Dedupe(DedupeMatching, ActiveMatching):
    """
    Mixin Class for Active Learning Deduplication

    Public Methods
    - sample
    """
    canopies = True

    def prepare_training(self,
                         data,
                         training_file=None,
                         sample_size=15000,
                         blocked_proportion=0.5,
                         original_length=None):
        '''
        Sets up the learner.
github dedupeio / dedupe / dedupe / api.py View on Github external
def _checkData(self, data):
        if len(data) == 0:
            raise ValueError(
                'Dictionary of records is empty.')

        self.data_model.check(next(iter(viewvalues(data))))


class StaticRecordLink(RecordLinkMatching, StaticMatching):
    """
    Mixin Class for Static Record Linkage

    Combines RecordLinkMatching and StaticMatching. The class body
    defines no attributes or methods of its own, so all behavior is
    inherited from those two bases.
    """


class RecordLink(RecordLinkMatching, ActiveMatching):
    """
    Mixin Class for Active Learning Record Linkage

    Public Methods
    - sample
    """
    canopies = False

    def prepare_training(self,
                         data_1,
                         data_2,
                         training_file=None,
                         sample_size=15000,
                         blocked_proportion=0.5,
                         original_length_1=None,
                         original_length_2=None):