How to use the dedupe.distance.cosine module (and its CosineSimilarity comparator) in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github markhuberty / psClean / code / dedupe / archive / fi / patent_example_twostage_fi.py View on Github external
)

            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
github markhuberty / psClean / code / dedupe / archive / es / patent_example_twostage_es.py View on Github external
)

            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
import time
import sys
import pandas as pd
import math
import datetime
import numpy as np
sys.path.append('/home/markhuberty/Documents/dedupe/examples/patent_example')

## Load up local libraries
import patent_util
import AsciiDammit

# Finally load dedupe 
import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience.  To enable verbose logging, run `python
# examples/csv_example/csv_example.py -v`

# Parse the command line; every `-v` increments the verbosity counter.
optp = optparse.OptionParser()
# default=0 keeps `opts.verbose` an int when `-v` is never passed. Without
# it optparse leaves the attribute as None, and the `>= 2` comparison
# below raises TypeError on Python 3.
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                default=0,
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
# Map the counter onto a logging level:
# no flag -> WARNING, one -v -> INFO, two or more -> DEBUG.
log_level = logging.WARNING
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
github markhuberty / psClean / code / dedupe / archive / nl / patent_example_twostage_nl.py View on Github external
import optparse
import time
import sys
import pandas as pd
import math
import datetime
sys.path.append('/home/markhuberty/Documents/dedupe/examples/patent_example')

## Load up local libraries
import patent_util
import AsciiDammit

# Finally load dedupe 
import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience.  To enable verbose logging, run `python
# examples/csv_example/csv_example.py -v`

# Parse the command line; every `-v` increments the verbosity counter.
optp = optparse.OptionParser()
# default=0 keeps `opts.verbose` an int when `-v` is never passed. Without
# it optparse leaves the attribute as None, and the `>= 2` comparison
# below raises TypeError on Python 3.
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                default=0,
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
# Map the counter onto a logging level:
# no flag -> WARNING, one -v -> INFO, two or more -> DEBUG.
log_level = logging.WARNING
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
)

            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
github dedupeio / dedupe / examples / patent_example / patent_example.py View on Github external
import os
import csv
import re
import collections
import logging
import optparse
import time
import sys
import pandas as pd
import patent_util
import math
import AsciiDammit

import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

def idf(i, j) :
    """Return a log-scaled inverse-frequency weight for a pair of counts.

    Both arguments are coerced with ``int()``. The larger of the two is
    divided into the number of records in the module-level ``data_d``,
    and the natural log of that ratio is returned.

    Raises ``ZeroDivisionError`` when both counts are zero, and
    ``ValueError`` when an argument cannot be converted to ``int`` or
    the resulting ratio is not positive.
    """
    largest = max(int(i), int(j))
    # Force true division: under Python 2 `len(data_d) / largest` floors,
    # which truncates the ratio and turns it into log(0) -> ValueError
    # whenever `largest` exceeds the record count.
    return math.log(float(len(data_d)) / largest)


# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience.  To enable verbose logging, run `python
# examples/csv_example/csv_example.py -v`

optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
# Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},
github markhuberty / psClean / code / dedupe / gb_weighted / patent_example_twostage_gb.py View on Github external
import collections
import logging
import optparse
import time
import sys
import pandas as pd
import math
import datetime
import numpy as np
import patent_util
import AsciiDammit

# Finally load dedupe 
import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

def integer_diff(a, b):
    """Return a closeness score for two numbers: 1 / (|a - b| + 1).

    Equal inputs score 1.0; the score shrinks toward 0 as the gap between
    ``a`` and ``b`` grows.
    """
    gap = abs(a - b)
    return 1.0 / (gap + 1)

# Finally load dedupe 
import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience.  To enable verbose logging, run `python
# examples/csv_example/csv_example.py -v`

optp = optparse.OptionParser()
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
import optparse
import time
import sys
import pandas as pd
import math
import datetime
sys.path.append('/home/markhuberty/Documents/dedupe/examples/patent_example')

## Load up local libraries
import patent_util
import AsciiDammit

# Finally load dedupe 
import dedupe
from dedupe.distance import cosine
sys.modules['cosine'] = cosine

# ## Logging
# Dedupe uses Python logging to show or suppress verbose output. Added
# for convenience.  To enable verbose logging, run `python
# examples/csv_example/csv_example.py -v`

# Parse the command line; every `-v` increments the verbosity counter.
optp = optparse.OptionParser()
# default=0 keeps `opts.verbose` an int when `-v` is never passed. Without
# it optparse leaves the attribute as None, and the `>= 2` comparison
# below raises TypeError on Python 3.
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                default=0,
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
# Map the counter onto a logging level:
# no flag -> WARNING, one -v -> INFO, two or more -> DEBUG.
log_level = logging.WARNING
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG