How to use dedupe - 10 common examples

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / tests / canonical.py View on Github external
print('number of known duplicate pairs', len(duplicates_s))

# Reuse previously written settings when the settings file exists;
# otherwise build a new Dedupe model, train it and write its settings.
if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        # NOTE(review): the second positional argument (1) is presumably
        # the number of cores -- confirm against the StaticDedupe signature.
        deduper = dedupe.StaticDedupe(f, 1)
else:
    # Field definitions handed to Dedupe. 'name' is listed twice, once as
    # 'String' and once as 'Exact'; 'cuisine' is flagged 'has missing'.
    fields = [{'field' : 'name', 'type': 'String'},
              {'field' : 'name', 'type': 'Exact'},
              {'field' : 'address', 'type': 'String'},
              {'field' : 'cuisine', 'type': 'ShortString', 
               'has missing' : True},
              {'field' : 'city', 'type' : 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()
    # Write the settings in binary mode so that a later run can take the
    # StaticDedupe branch above instead of training again.
    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)


# The value returned by threshold() is passed to match() below as its
# ``threshold`` argument.
# NOTE(review): the second argument (1) is presumably a recall weight -- confirm.
alpha = deduper.threshold(data_d, 1)

# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold=alpha)

print('Evaluate Clustering')
# Starts empty; the code that fills it lies beyond this excerpt.
confirm_dupes = set([])
for dupes, score in clustered_dupes:
github dedupeio / dedupe / tests / test_core.py View on Github external
def test_exact_comparator(self):
    """An 'Exact' field yields distance 0.0 for differing names and 1.0 for equal ones."""
    field_definition = [{'field': 'name', 'type': 'Exact'}]
    deduper = dedupe.Dedupe(field_definition)

    differing_pair = ({'name': 'Shmoo'}, {'name': 'Shmee'})
    identical_pair = ({'name': 'Shmoo'}, {'name': 'Shmoo'})
    record_pairs = (differing_pair, identical_pair)

    # One row per record pair, one column for the single field definition.
    observed = deduper.data_model.distances(record_pairs)
    expected = numpy.array([[0.0], [1.0]])

    # Compare to three decimal places.
    numpy.testing.assert_array_almost_equal(observed, expected, 3)
github dedupeio / dedupe / tests / test_dedupe.py View on Github external
assert dedupe.predicates.commonIntegerPredicate(
            field) == set(['123', '16'])
        assert dedupe.predicates.commonIntegerPredicate('foo') == set([])
        assert dedupe.predicates.firstIntegerPredicate('foo') == ()
        assert dedupe.predicates.firstIntegerPredicate('1foo') == ('1',)
        assert dedupe.predicates.firstIntegerPredicate('f1oo') == ()
        assert dedupe.predicates.sameThreeCharStartPredicate(field) == ('123',)
        assert dedupe.predicates.sameThreeCharStartPredicate('12') == ('12', )
        assert dedupe.predicates.commonFourGram('12') == set([])
        assert dedupe.predicates.sameFiveCharStartPredicate(
            field) == ('12316',)
        assert dedupe.predicates.sameSevenCharStartPredicate(
            field) == ('12316th',)
        assert dedupe.predicates.nearIntegersPredicate(
            field) == set(['15', '17', '16', '122', '123', '124'])
        assert dedupe.predicates.commonFourGram(field) == set(
            ['1231', '2316', '316t', '16th', '6ths', 'thst'])
        assert dedupe.predicates.commonSixGram(field) == set(
            ['12316t', '2316th', '316ths', '16thst'])
        assert dedupe.predicates.initials(field, 12) == ('123 16th st',)
        assert dedupe.predicates.initials(field, 7) == ('123 16t',)
        assert dedupe.predicates.ngrams(
            field, 3) == ['123', '23 ', '3 1', ' 16', '16t', '6th', 'th ', 'h s', ' st']
        assert dedupe.predicates.commonTwoElementsPredicate(
            (1, 2, 3)) == set(('1 2', '2 3'))
        assert dedupe.predicates.commonTwoElementsPredicate((1,)) == set([])
        assert dedupe.predicates.commonThreeElementsPredicate(
            (1, 2, 3)) == set(('1 2 3',))
        assert dedupe.predicates.commonThreeElementsPredicate((1,)) == set([])

        assert dedupe.predicates.fingerprint(
            'time sandwich') == (u'sandwichtime',)
github dedupeio / dedupe / tests / test_dedupe.py View on Github external
assert dedupe.predicates.existsPredicate('') == ('0',)
        assert dedupe.predicates.existsPredicate(1) == ('1',)
        assert dedupe.predicates.existsPredicate(0) == ('0',)
        assert dedupe.predicates.sortedAcronym(field) == ('11s',)
        assert dedupe.predicates.wholeFieldPredicate(field) == ('123 16th st',)
        assert dedupe.predicates.firstTokenPredicate(field) == ('123',)
        assert dedupe.predicates.firstTokenPredicate('') == ()
        assert dedupe.predicates.firstTokenPredicate('123/') == ('123',)
        assert dedupe.predicates.tokenFieldPredicate(' ') == set([])
        assert dedupe.predicates.tokenFieldPredicate(
            field) == set(['123', '16th', 'st'])
        assert dedupe.predicates.commonIntegerPredicate(
            field) == set(['123', '16'])
        assert dedupe.predicates.commonIntegerPredicate('foo') == set([])
        assert dedupe.predicates.firstIntegerPredicate('foo') == ()
        assert dedupe.predicates.firstIntegerPredicate('1foo') == ('1',)
        assert dedupe.predicates.firstIntegerPredicate('f1oo') == ()
        assert dedupe.predicates.sameThreeCharStartPredicate(field) == ('123',)
        assert dedupe.predicates.sameThreeCharStartPredicate('12') == ('12', )
        assert dedupe.predicates.commonFourGram('12') == set([])
        assert dedupe.predicates.sameFiveCharStartPredicate(
            field) == ('12316',)
        assert dedupe.predicates.sameSevenCharStartPredicate(
            field) == ('12316th',)
        assert dedupe.predicates.nearIntegersPredicate(
            field) == set(['15', '17', '16', '122', '123', '124'])
        assert dedupe.predicates.commonFourGram(field) == set(
            ['1231', '2316', '316t', '16th', '6ths', 'thst'])
        assert dedupe.predicates.commonSixGram(field) == set(
            ['12316t', '2316th', '316ths', '16thst'])
        assert dedupe.predicates.initials(field, 12) == ('123 16th st',)
        assert dedupe.predicates.initials(field, 7) == ('123 16t',)
github dedupeio / dedupe / tests / test_dedupe.py View on Github external
def test_predicates_correctness(self):
    """Check several blocking predicates against the sample field '123 16th st'."""
    preds = dedupe.predicates
    field = '123 16th st'

    # Existence of a value.
    assert preds.existsPredicate(field) == ('1',)
    assert preds.existsPredicate('') == ('0',)
    assert preds.existsPredicate(1) == ('1',)
    assert preds.existsPredicate(0) == ('0',)

    # Whole-field and token based predicates.
    assert preds.sortedAcronym(field) == ('11s',)
    assert preds.wholeFieldPredicate(field) == ('123 16th st',)
    assert preds.firstTokenPredicate(field) == ('123',)
    assert preds.firstTokenPredicate('') == ()
    assert preds.firstTokenPredicate('123/') == ('123',)
    assert preds.tokenFieldPredicate(' ') == set()
    assert preds.tokenFieldPredicate(field) == {'123', '16th', 'st'}

    # Integer based predicates.
    assert preds.commonIntegerPredicate(field) == {'123', '16'}
    assert preds.commonIntegerPredicate('foo') == set()
    assert preds.firstIntegerPredicate('foo') == ()
    assert preds.firstIntegerPredicate('1foo') == ('1',)
    assert preds.firstIntegerPredicate('f1oo') == ()

    # Prefix and n-gram predicates.
    assert preds.sameThreeCharStartPredicate(field) == ('123',)
    assert preds.sameThreeCharStartPredicate('12') == ('12',)
    assert preds.commonFourGram('12') == set()
github dedupeio / dedupe / tests / test_api.py View on Github external
import dedupe
import unittest
import random
import numpy
import warnings
from collections import OrderedDict

# Five record pairs; both members of every pair are wrapped in
# dedupe.core.frozendict and carry the string fields 'age' and 'name'.
DATA_SAMPLE = tuple(
    (dedupe.core.frozendict(left), dedupe.core.frozendict(right))
    for left, right in (
        ({'age': '27', 'name': 'Kyle'}, {'age': '50', 'name': 'Bob'}),
        ({'age': '27', 'name': 'Kyle'}, {'age': '35', 'name': 'William'}),
        ({'age': '10', 'name': 'Sue'}, {'age': '35', 'name': 'William'}),
        ({'age': '27', 'name': 'Kyle'}, {'age': '20', 'name': 'Jimmy'}),
        ({'age': '75', 'name': 'Charlie'}, {'age': '21', 'name': 'Jimbo'}),
    )
)

data_dict = OrderedDict(((0, {'name' : 'Bob',         'age' : '51'}),
                         (1, {'name' : 'Linda',       'age' : '50'}),
                         (2, {'name' : 'Gene',        'age' : '12'}),
                         (3, {'name' : 'Tina',        'age' : '15'}),
                         (4, {'name' : 'Bob B.',      'age' : '51'}),
                         (5, {'name' : 'bob belcher', 'age' : '51'}),
github dedupeio / dedupe / tests / canonical_matching.py View on Github external
def canonicalImport(filename):
    """Load a CSV file into a dict of frozen records.

    Every value of every row is passed through ``exampleIO.preProcess``
    and the cleaned row is wrapped in ``dedupe.core.frozendict``. Records
    are keyed by ``filename`` concatenated with the zero-based row index.

    Returns a ``(data_d, fieldnames)`` tuple, where ``fieldnames`` is the
    header reported by ``csv.DictReader`` (``None`` for a completely
    empty file).
    """
    preProcess = exampleIO.preProcess
    data_d = {}

    with open(filename) as f:
        reader = csv.DictReader(f)
        # Fetch the header while the file is still open. On an empty file
        # DictReader never caches it, so asking for ``fieldnames`` after
        # the ``with`` block would attempt a read on a closed file.
        fieldnames = reader.fieldnames
        for i, row in enumerate(reader):
            clean_row = [(k, preProcess(v)) for (k, v) in
                         viewitems(row)]
            data_d[filename + str(i)] = dedupe.core.frozendict(clean_row)

    return data_d, fieldnames
github dedupeio / dedupe / tests / test_core.py View on Github external
def test_hash_is_order_insensitive(self):
    """frozendicts built from equal dicts compare equal and share a hash,
    whatever the key order of the source dict literal."""
    make_frozen = dedupe.core.frozendict

    forward = {'smtp': 21, 'dict': 2628}
    backward = {'dict': 2628, 'smtp': 21}
    # Plain dicts already ignore insertion order when compared.
    assert forward == backward

    frozen_forward = make_frozen(forward)
    frozen_backward = make_frozen(backward)

    # Equality is checked on freshly constructed instances ...
    assert make_frozen(forward) == make_frozen(backward)
    # ... and the hashes on the instances created above.
    assert hash(frozen_forward) == hash(frozen_backward)
github dedupeio / dedupe / tests / canonical.py View on Github external
def canonicalImport(filename):
    """Load a CSV file into a dict of frozen records keyed by row index.

    Every value of every row is passed through ``exampleIO.preProcess``
    and the cleaned row is wrapped in ``dedupe.core.frozendict``. Records
    are keyed by their zero-based row index.

    Returns a ``(data_d, fieldnames)`` tuple, where ``fieldnames`` is the
    header reported by ``csv.DictReader`` (``None`` for a completely
    empty file).
    """
    preProcess = exampleIO.preProcess

    data_d = {}

    with open(filename) as f:
        reader = csv.DictReader(f)
        # Fetch the header while the file is still open. On an empty file
        # DictReader never caches it, so asking for ``fieldnames`` after
        # the ``with`` block would attempt a read on a closed file.
        fieldnames = reader.fieldnames
        for (i, row) in enumerate(reader):
            clean_row = [(k, preProcess(v)) for (k, v) in
                         viewitems(row)]
            data_d[i] = dedupe.core.frozendict(clean_row)

    return data_d, fieldnames
github dedupeio / dedupe / tests / test_training.py View on Github external
def test_uncovered_by(self):
    """uncovered_by removes covered items, drops emptied keys and leaves its input intact.

    With an empty covered set the result equals the input mapping. With
    ``{3}`` covered, 3 is removed from every value set and key 3, whose
    set would become empty, is absent from the result.
    """
    before = {1: {1, 2, 3}, 2: {1, 2}, 3: {3}}
    after = {1: {1, 2}, 2: {1, 2}}

    # Snapshot the value sets too: dict.copy() is shallow, so a snapshot
    # made with it would share the sets with ``before`` and an in-place
    # change to them would go unnoticed by the final assertion.
    before_copy = {key: set(values) for key, values in before.items()}

    assert training.BranchBound.uncovered_by(before, set()) == before
    assert training.BranchBound.uncovered_by(before, {3}) == after
    # The input mapping must not have been modified by either call.
    assert before == before_copy