How to use the dedupe.distance module in dedupe

To help you get started, we've selected a few dedupe.distance examples based on popular ways it is used in public projects.

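All of the excerpts below follow the same pattern, so here is a minimal sketch of it up front: fit a cosine comparator to a corpus of field values, then call it on a pair of values. The records and field values here are hypothetical, and the dedupe.distance.cosine namespace belongs to the older dedupe releases these projects were written against.

import dedupe

# Hypothetical records: {record_id: field_dict}
data_d = {
    0: {'Class': 'H01L 21/00 H01L 29/78', 'Name': 'acme b.v.'},
    1: {'Class': 'H01L 29/78 G06F 17/30', 'Name': 'acme bv'},
}

# Corpus of raw field values, one per record, as in the excerpts below
classes = [row['Class'] for record_id, row in data_d.items()]

# Fit a cosine comparator to that corpus
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)

# dedupe calls a 'Custom' comparator with a pair of field values and
# expects a numeric similarity back
score = class_comparator(data_d[0]['Class'], data_d[1]['Class'])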

github dedupeio / dedupe / examples / patent_example / patent_example_twostage.py View on Github
            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
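The excerpt cuts off inside the fields definition. Patterned on the fuller excerpts further down the page, a hedged reconstruction of how it continues; the closing dedupe.Dedupe(fields) call is an assumption about this older API, not part of the quoted source.

# class_comparator and coauthor_comparator come from the excerpt above
fields = {
    'Name': {'type': 'String', 'Has Missing': True},
    'LatLong': {'type': 'LatLong', 'Has Missing': True},
    'Class': {'type': 'Custom', 'comparator': class_comparator},
    'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
}
deduper = dedupe.Dedupe(fields)  # assumed continuation, not quoted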
github markhuberty / psClean / code / dedupe / archive / be / patent_example_twostage_be.py View on Github
            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
github markhuberty / psClean / code / dedupe / archive / nl / patent_example_twostage_nl.py View on Github
            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 3 * input_df.shape[0])
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
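The only substantive difference from the previous two excerpts is the sampling line: the training sample here is sized relative to the input (3 * input_df.shape[0]) rather than fixed at 600,000.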
github markhuberty / psClean / code / dedupe / gb_weighted / patent_example_twostage_gb.py View on Github
            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
github markhuberty / psClean / code / dedupe / patstat_dedupe.py View on Github
# Import the data
print 'importing data ...'
input_df = pd.read_csv(input_file)
input_df.Class.fillna('', inplace=True)
input_df.Coauthor.fillna('', inplace=True)
input_df.Lat.fillna('0.0', inplace=True)
input_df.Lng.fillna('0.0', inplace=True)
input_df.Name.fillna('', inplace=True)

# Read the data into a format dedupe can use
data_d = patent_util.readDataFrame(input_df)

# Build the comparators for class and coauthor
coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
classes = [row['Class'] for cidx, row in data_d.items()]
class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# Training
if os.path.exists(settings_file):
    print 'reading from', settings_file
    deduper = dedupe.Dedupe(settings_file)

else:
    # To train dedupe, we feed it a random sample of records.
    data_sample = dedupe.dataSample(data_d, 10 * input_df.shape[0])
    # Define the fields dedupe will pay attention to
    fields = {'Name': {'type': 'String', 'Has Missing':True},
              'LatLong': {'type': 'LatLong', 'Has Missing':True},
              'Class': {'type': 'Custom', 'comparator':class_comparator},
              'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator},
              'patent_ct':{'type': 'Custom', 'comparator': integer_diff},
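The integer_diff comparator referenced on the last line is defined elsewhere in patstat_dedupe.py and is not shown in the excerpt. A purely illustrative stand-in, assuming it compares integer-valued fields:

def integer_diff(x, y):
    # Illustrative only: absolute difference of two integer-valued
    # fields. The real helper in patstat_dedupe.py may differ.
    return abs(int(x) - int(y))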
github dedupeio / dedupe / examples / patent_example / patent_example.py View on Github
        #input_file.set_index(cluster_name)
        consolidated_input = patent_util.consolidate(input_df,
                                                     cluster_name,
                                                     cluster_agg_dict
                                                     )
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
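The cluster_agg_dict handed to patent_util.consolidate is truncated out of the excerpt. Assuming, as the call suggests, that it maps column names to aggregation functions, a hypothetical example of its shape:

# Hypothetical column -> aggregator map for patent_util.consolidate;
# the real dict in patent_example.py is not shown in the excerpt.
cluster_agg_dict = {
    'Name': lambda vals: max(vals, key=len),  # keep the longest variant
    'Coauthor': ' '.join,                     # pool coauthor tokens
    'Class': ' '.join,                        # pool class codes
}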
github dedupeio / dedupe / dedupe / variables / fieldclasses.py View on Github
def __init__(self, definition) :
        super(TextType, self).__init__(definition)

        if 'corpus' not in definition :
            definition['corpus'] = []

        self.comparator = dedupe.distance.CosineTextSimilarity(definition['corpus'])
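This snippet from fieldclasses.py shows why the hand-built comparators above eventually became unnecessary: later dedupe releases let you declare a field of type 'Text', and TextType wires in the CosineTextSimilarity comparator itself, defaulting to an empty corpus. A minimal sketch of that newer-style usage, with an illustrative field name and corpus:

import dedupe

# Small illustrative corpus of text values drawn from the records
corpus = ['H01L 21/00 H01L 29/78', 'H01L 29/78 G06F 17/30']

# A 'Text' field hands the corpus to CosineTextSimilarity internally,
# so no hand-built 'Custom' comparator is needed
fields = [
    {'field': 'Class', 'type': 'Text', 'corpus': corpus},
]
deduper = dedupe.Dedupe(fields)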