How to use the dedupe.Dedupe class in dedupe

To help you get started, we’ve selected a few dedupe.Dedupe examples based on popular ways it is used in public projects.

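Most of the snippets below follow the same basic workflow: define the fields dedupe should compare, create a dedupe.Dedupe object from them, train it on labeled pairs, and then cluster the records. As a quick orientation, here is a minimal sketch of that workflow in the style of the first example (tests/canonical.py); the restaurant records and field names are invented placeholders, and note that newer releases of dedupe have since renamed several of these methods.

import dedupe

# Hypothetical input: a dict mapping record ids to field dicts.
my_data = {
    0: {'name': 'Thai Cafe', 'address': '100 W Main St',
        'cuisine': 'thai', 'city': 'chicago'},
    1: {'name': 'Thai Café', 'address': '100 West Main Street',
        'cuisine': 'thai', 'city': 'chicago'},
}

fields = [{'field': 'name', 'type': 'String'},
          {'field': 'address', 'type': 'String'},
          {'field': 'cuisine', 'type': 'ShortString', 'has missing': True},
          {'field': 'city', 'type': 'ShortString'}]

deduper = dedupe.Dedupe(fields)      # build the data model from the field definitions
deduper.sample(my_data, 10000)       # draw candidate pairs to label
# ... label some pairs (e.g. with deduper.markPairs(...)), then:
deduper.train()
threshold = deduper.threshold(my_data, recall_weight=1)
clusters = deduper.match(my_data, threshold)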

github dedupeio / dedupe / tests / canonical.py
print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field' : 'name', 'type': 'String'},
              {'field' : 'name', 'type': 'Exact'},
              {'field' : 'address', 'type': 'String'},
              {'field' : 'cuisine', 'type': 'ShortString', 
               'has missing' : True},
              {'field' : 'city', 'type' : 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()
    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)


alpha = deduper.threshold(data_d, 1)

# print candidates
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold=alpha)

print('Evaluate Clustering')
confirm_dupes = set([])
for dupes, score in clustered_dupes:
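The canonical.py snippet is cut off inside its evaluation loop. A plausible sketch of what such a loop does, based only on the confirm_dupes set and duplicates_s pairs visible above and assuming duplicates_s holds frozenset record-id pairs (this is not the verbatim continuation of the test):

import itertools

for dupes, score in clustered_dupes:
    # each cluster is a collection of record ids; expand it into candidate pairs
    for pair in itertools.combinations(dupes, 2):
        confirm_dupes.add(frozenset(pair))

true_positives = confirm_dupes & duplicates_s
print('found duplicate pairs', len(confirm_dupes))
print('precision', len(true_positives) / len(confirm_dupes))
print('recall', len(true_positives) / len(duplicates_s))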
github dedupeio / dedupe / tests / test_core.py
def test_exact_comparator(self):
        deduper = dedupe.Dedupe([{'field': 'name',
                                  'type': 'Exact'}
                                 ])

        record_pairs = (({'name': 'Shmoo'}, {'name': 'Shmee'}),
                        ({'name': 'Shmoo'}, {'name': 'Shmoo'}))

        numpy.testing.assert_array_almost_equal(deduper.data_model.distances(record_pairs),
                                                numpy.array([[0.0],
                                                             [1.0]]),
                                                3)
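The assertion simply pins down the Exact comparator's behavior: a non-identical pair ('Shmoo' vs. 'Shmee') scores 0.0 and an identical pair scores 1.0.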
github dedupeio / dedupe / examples / patent_example / patent_example_twostage.py
'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        # The json file is of the form:
        # {0: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more nonmatch pairs)]
        #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
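The training-file comment above is easier to follow with a concrete example. Something of this shape is what these older dedupe examples expect, with key 0 holding labeled non-matches and key 1 holding labeled matches; the field values below are invented:

training_pairs = {
    0: [  # labeled non-matches
        [{'Name': 'ACME GMBH', 'Class': 'H01L'},
         {'Name': 'ACME FOODS', 'Class': 'A23L'}],
    ],
    1: [  # labeled matches
        [{'Name': 'ACME GMBH', 'Class': 'H01L'},
         {'Name': 'ACME G.M.B.H.', 'Class': 'H01L'}],
    ],
}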
github markhuberty / psClean / code / dedupe / archive / be / patent_example_twostage_be.py
data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
github dedupeio / dedupe / examples / patent_example / patent_example.py
'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',
            #                             'Interaction Fields': ['Coauthor_Count', 'Coauthor']
            #                             }
            }

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        # The json file is of the form:
        # {0: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more nonmatch pairs)]
        #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
github dedupeio / dedupe / examples / mysql_example / mysql_example.py
# account
    fields = {'name': {'type': 'String'},
              'address': {'type': 'String', 'Has Missing' : True},
              'city': {'type': 'String', 'Has Missing' : True},
              'state': {'type': 'String'},
              'zip': {'type': 'String', 'Has Missing' : True},
              'person' : {'type' : 'Categorical', 
                          'Categories' : [0, 1]},
              'person-address' : {'type' : 'Interaction',
                                  'Interaction Fields' : ['person', 'address']},
              'name-address' : {'type' : 'Interaction', 
                                'Interaction Fields' : ['name', 'address']}
              }

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields, data_sample, num_processes=4)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    #
    # __Note:__ if you want to train from
    # scratch, delete the training_file
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.readTraining(training_file)

    # ## Active learning

    print 'starting active labeling...'
    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
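In this generation of the library, the training loop described above was typically driven by dedupe's console labeling helper and then saved for reuse. A minimal sketch, assuming the camelCase helpers the old examples shipped with (settings_file is the analogous path used in the other examples on this page):

dedupe.consoleLabel(deduper)          # prompts you to mark uncertain pairs as duplicate or not
deduper.train()                       # learn weights and blocking rules from your labels
deduper.writeTraining(training_file)  # save the labeled pairs for the next run
deduper.writeSettings(settings_file)  # save the learned model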
github dedupeio / dedupe / examples / sqlite_example / sqlite_clustering.py
import itertools

os.chdir('./examples/sqlite_example/')
settings_file = 'sqlite_example_settings.json'

t0 = time.time()

con = sqlite3.connect("illinois_contributions.db")
con.row_factory = sqlite3.Row
con.execute("ATTACH DATABASE 'blocking_map.db' AS bm")
cur = con.cursor()


if os.path.exists(settings_file):
    print 'reading from ', settings_file
    deduper = dedupe.Dedupe(settings_file)
else:
    raise ValueError('Settings File Not Found')

# We grab all the block_keys with more than one record associated with
# it. These associated records will make up a block of records we will
# compare within.
blocking_key_sql = "SELECT key, COUNT(donor_id) AS num_candidates " \
                   "FROM bm.blocking_map GROUP BY key HAVING num_candidates > 1"

block_keys = (row['key'] for row in con.execute(blocking_key_sql))

# This grabs a block of records for comparison. We rely on the
# ordering of the donor_ids
donor_select = "SELECT donor_id, LOWER(city) AS city, " \
               "LOWER(first_name) AS first_name, " \
               "LOWER(last_name) AS last_name, " \
github markhuberty / psClean / code / dedupe / patstat_dedupe.py
else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing':True},
              'LatLong': {'type': 'LatLong', 'Has Missing':True},
              'Class': {'type': 'Custom', 'comparator':class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
              'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
              'patent_ct_name': {'type': 'Interaction',
                                 'Interaction Fields': ['Name', 'patent_ct']
                                 }
              }

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    # The json file is of the form:
    # {0: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more nonmatch pairs)]
    #  1: [[{field:val dict of record 1}, {field:val dict of record 2}], ...(more match pairs)]
    # }
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)

    # ## Active learning

    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
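This script wires several 'Custom' fields to comparator callables (class_comparator, coauthor_comparator, integer_diff) that are defined elsewhere in the file. A custom comparator is just a function of two field values that returns a numeric distance; a hypothetical sketch of what an integer_diff-style comparator could look like (not the project's actual implementation):

def integer_diff(x, y):
    # hypothetical: absolute difference between two count fields,
    # treating a missing value as zero distance
    if not x or not y:
        return 0
    return abs(int(x) - int(y))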
github dedupeio / dedupe / examples / patent_example / patent_example.py
)
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
            # 'Coauthor_Count': {'type': 'Custom', 'comparator': idf},
            # 'Class_Count_Class': {'type': 'Interaction',
            #                       'Interaction Fields': ['Class_Count', 'Class']
            #                       },
            # 'Coauthor_Count_Coauthor': {'type': 'Interaction',