How to use the dedupe.training module in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

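Most of the snippets below follow the same active-learning pattern from dedupe's older API: construct a Dedupe object from a field definition, seed it with any previously labeled pairs, label uncertain pairs interactively via dedupe.training.consoleLabel, and save the labels. A condensed sketch of that loop, assuming data_sample holds the record pairs produced by dedupe's sampling step (newer releases have since renamed several of these calls):

import os
import dedupe

fields = {'name': {'type': 'String'}}   # illustrative field definition
deduper = dedupe.Dedupe(fields)

# data_sample: record pairs from dedupe's sampling step (assumed given)
if os.path.exists('training.json'):
    # reuse labels from an earlier session ...
    deduper.train(data_sample, 'training.json')

# ... then label uncertain pairs interactively and save the results
deduper.train(data_sample, dedupe.training.consoleLabel)
deduper.writeTraining('training.json')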

github dedupeio/dedupe: tests/test_training.py (View on GitHub)
def test_uncovered_by(self):
        before = {1: {1, 2, 3}, 2: {1, 2}, 3: {3}}
        after = {1: {1, 2}, 2: {1, 2}}

        before_copy = before.copy()

        assert training.BranchBound.uncovered_by(before, set()) == before
        assert training.BranchBound.uncovered_by(before, {3}) == after
        assert before == before_copy
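The assertions above pin down the semantics of uncovered_by: given a map from candidate predicates to the training pairs they cover, drop the pairs that are already covered and discard any candidate with nothing left. A minimal pure-Python sketch of that behavior (not the library's implementation):

def uncovered_by(coverage, covered):
    # keep, per candidate, only the pairs not already covered;
    # drop candidates that no longer cover anything
    remaining = {}
    for candidate, pairs in coverage.items():
        uncovered = pairs - covered
        if uncovered:
            remaining[candidate] = uncovered
    return remaining

assert uncovered_by({1: {1, 2, 3}, 2: {1, 2}, 3: {3}}, {3}) == {1: {1, 2}, 2: {1, 2}}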
github dedupeio/dedupe: tests/test_blocking.py (View on GitHub)
def test_dedupe_coverage(self):
    predicates = self.data_model.predicates()
    blocker = dedupe.blocking.Blocker(predicates)
    dedupe.training.prepare_index(blocker, self.training_pairs, "Dedupe")

    coverage = dedupe.training.coveredBy(blocker.predicates, self.training)
    assert self.simple(coverage.keys()).issuperset(
          set(["SimplePredicate: (tokenFieldPredicate, name)", 
               "SimplePredicate: (commonSixGram, name)", 
               "TfidfTextCanopyPredicate: (0.4, name)", 
               "SimplePredicate: (sortedAcronym, name)",
               "SimplePredicate: (sameThreeCharStartPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.2, name)", 
               "SimplePredicate: (sameFiveCharStartPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.6, name)", 
               "SimplePredicate: (wholeFieldPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.8, name)", 
               "SimplePredicate: (commonFourGram, name)", 
               "SimplePredicate: (firstTokenPredicate, name)", 
               "SimplePredicate: (sameSevenCharStartPredicate, name)"]))
github dedupeio/dedupe: dedupe/labeler.py (View on GitHub)
    # (opening of the signature reconstructed from the super() call below)
    def __init__(self,
                 data_model,
                 candidates,
                 data_1,
                 data_2,
                 original_length_1,
                 original_length_2,
                 index_include):

        super().__init__(data_model, candidates)

        sampled_records_1 = Sample(data_1, 600, original_length_1)
        index_data = Sample(data_2, 50000, original_length_2)
        sampled_records_2 = Sample(index_data, 600, original_length_2)

        preds = self.data_model.predicates(canopies=False)

        self.block_learner = training.RecordLinkBlockLearner(preds,
                                                             sampled_records_1,
                                                             sampled_records_2,
                                                             index_data)

        # copy first so extending with index_include does not mutate
        # the caller's candidate list
        examples_to_index = candidates.copy()
        if index_include:
            examples_to_index += index_include

        self._index_predicates(examples_to_index)
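Sample above caps how many records feed the block learner (600 per side, 50,000 for the index data) while keeping the original length around so block statistics can be extrapolated. A rough sketch of the idea; the helper below is hypothetical and ignores the scaling bookkeeping:

import random

def sample(data, sample_size):
    # random subset of a {record_id: record} mapping
    if len(data) <= sample_size:
        return dict(data)
    keys = random.sample(list(data), sample_size)
    return {key: data[key] for key in keys}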
github dedupeio/dedupe: examples/sqlite_example/sqlite_blocking.py (View on GitHub)
    fields = {  # earlier field definitions elided from this excerpt
              'state': {'type': 'String'},
              'zip': {'type': 'String'},
              }
    deduper = dedupe.Dedupe(fields)

    # Sometimes we will want to add additional labeled examples to a
    # training file. To do this, we can just load the existing labeled
    # pairs...
    if os.path.exists(training_file):
        print('reading labeled examples from', training_file)
        deduper.train(data_samples, training_file)

    print('starting active labeling...')
    print('finding uncertain pairs...')
    # ... and then call training with our interactive function
    deduper.train(data_samples, dedupe.training.consoleLabel)
    deduper.writeTraining(training_file)

print('blocking...')
t_block = time.time()
blocker = deduper.blockingFunction(eta=0.001, epsilon=5)
deduper.writeSettings(settings_file)
print('blocked in', time.time() - t_block, 'seconds')

# So the learning is done and we have our blocker. However, we cannot
# block the data in memory. We have to pass through all the data and
# create a blocking map table.
#
# First though, if we learned a tf-idf predicate, we have to create
# tf-idf blocks for the full data set.
print('creating inverted index')
full_data = ((row['donor_id'], row) for row in con.execute(donor_select))
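With the inverted index in place, the pass the comments describe streams every record through the learned blocker and materializes a blocking map table. A hedged sketch of that step (table and column names are assumptions; a dedupe blocker called on a record stream yields (block_key, record_id) pairs):

con.execute("CREATE TABLE blocking_map (key TEXT, donor_id INTEGER)")
con.executemany("INSERT INTO blocking_map VALUES (?, ?)", blocker(full_data))
con.commit()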
github dedupeio/dedupe: dedupe/api.py (View on GitHub)
a record dictionary.

        In the record dictionary the keys are the names of the
        record fields and the values are the record values.
        """
        self.data_model = datamodel.DataModel(variable_definition)

        if num_cores is None:
            self.num_cores = multiprocessing.cpu_count()
        else:
            self.num_cores = num_cores

        if data_sample:
            self._checkDataSample(data_sample)
            self.data_sample = data_sample
            self.activeLearner = training.ActiveLearning(self.data_sample,
                                                         self.data_model,
                                                         self.num_cores)
        else:
            self.data_sample = []
            self.activeLearner = None

        # Override _loadSampledRecords() to load blocking data from
        # data_sample.
        self._loadSampledRecords(data_sample)

        training_dtype = [('label', 'S8'),
                          ('distances', 'f4',
                           (len(self.data_model), ))]

        self.training_data = numpy.zeros(0, dtype=training_dtype)
        self.training_pairs = OrderedDict({u'distinct': [],
                                           u'match': []})
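training_dtype above defines one row per labeled pair: a label string plus a vector of field distances sized to the data model. A small self-contained sketch of how rows accumulate in such a structured array (the three-field data model is illustrative):

import numpy

training_dtype = [('label', 'S8'),
                  ('distances', 'f4', (3,))]  # three fields, for illustration

training_data = numpy.zeros(0, dtype=training_dtype)
new_row = numpy.array([(b'match', (0.1, 0.9, 0.3))], dtype=training_dtype)
training_data = numpy.append(training_data, new_row)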
github markhuberty/psClean: code/dedupe/patstat_dedupe.py (View on GitHub)
    # Training data maps labels to example pairs, e.g.:
    # {
    #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
    # }
    if os.path.exists(training_file):
        print('reading labeled examples from', training_file)
        deduper.train(data_sample, training_file)

    # ## Active learning

    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.

    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')
    deduper.train(data_sample, dedupe.training.consoleLabel)

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

# Blocking
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print('blocking...')

# Initialize the blocker
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(ppc,
                                                                    dupes,
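The y/n/u/f loop the comments above describe is what dedupe.training.consoleLabel implements. A simplified sketch of such a labeling callback (structure assumed; the real helper also feeds labels back to the active learner):

def console_label(uncertain_pairs):
    labels = {'match': [], 'distinct': []}
    for record_1, record_2 in uncertain_pairs:
        print(record_1)
        print(record_2)
        answer = input("Duplicate? (y)es, (n)o, (u)nsure, (f)inished: ")
        if answer == 'y':
            labels['match'].append((record_1, record_2))
        elif answer == 'n':
            labels['distinct'].append((record_1, record_2))
        elif answer == 'f':
            break
        # 'u' records no label and moves on to the next pair
    return labels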
github markhuberty/psClean: code/dedupe/archive/be/patent_example_twostage_be.py (View on GitHub)
        # Training data maps labels to example pairs, e.g.:
        # {
        #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print('reading labeled examples from', r_training_file)
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

    # ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print('blocking...')
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,