How to use the dedupe.distance.cosine.CosineSimilarity class in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / dedupe / distance / cosine.py View on Github external
return numerator/(norm_1 * norm_2)

        else :
            return numpy.nan

    def __getstate__(self):
        """Return the state to pickle, with the ``vectors`` entry emptied.

        The attributes are shallow-copied out of ``self.__dict__`` and the
        ``vectors`` key is replaced by a fresh empty dict in that copy, so
        the live instance itself is not modified.
        """
        return dict(self.__dict__, vectors={})


class CosineTextSimilarity(CosineSimilarity):
    """Cosine similarity variant that tokenises string documents."""

    def _list(self, document):
        """Return the whitespace-separated tokens of ``document``."""
        tokens = document.split()
        return tokens

class CosineSetSimilarity(CosineSimilarity):
    """Cosine similarity variant that takes documents as given."""

    def _list(self, document):
        """Return ``document`` unchanged; no tokenisation is applied."""
        return document
github dedupeio / dedupe / dedupe / distance / cosine.py View on Github external
numerator = 0.0
            for word in set(vector_1) & set(vector_2) :
                numerator += vector_1[word] * vector_2[word]

            return numerator/(norm_1 * norm_2)

        else :
            return numpy.nan

    def __getstate__(self):
        """Return the state to pickle, with the ``vectors`` entry emptied.

        The attributes are shallow-copied out of ``self.__dict__`` and the
        ``vectors`` key is replaced by a fresh empty dict in that copy, so
        the live instance itself is not modified.
        """
        return dict(self.__dict__, vectors={})


class CosineTextSimilarity(CosineSimilarity):
    """Cosine similarity variant that tokenises string documents."""

    def _list(self, document):
        """Return the whitespace-separated tokens of ``document``."""
        tokens = document.split()
        return tokens

class CosineSetSimilarity(CosineSimilarity):
    """Cosine similarity variant that takes documents as given."""

    def _list(self, document):
        """Return ``document`` unchanged; no tokenisation is applied."""
        return document
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
)

            # Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, np.round(3 * input_df.shape[0], -1))
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
# Reset the index so that it is sequential. Then
            # store the new:old map
            consolidated_input.reset_index(inplace=True)
            index_map = consolidated_input['index'].to_dict()
        
        data_d = patent_util.readDataFrame(consolidated_input)
        del consolidated_input
        input_df.set_index(cluster_name, inplace=True)
        

## Build the comparators
    coauthors = [row['Coauthor'] for cidx, row in data_d.items()]
    classes = [row['Class'] for cidx, row in data_d.items()]
    class_comparator = dedupe.distance.cosine.CosineSimilarity(classes)
    coauthor_comparator = dedupe.distance.cosine.CosineSimilarity(coauthors)

# ## Training
    if os.path.exists(r_settings_file):
        print 'reading from', r_settings_file
        deduper = dedupe.Dedupe(r_settings_file)

    else:
        # To train dedupe, we feed it a random sample of records.
        data_sample = dedupe.dataSample(data_d, 600000)
        # Define the fields dedupe will pay attention to
        fields = {
            'Name': {'type': 'String', 'Has Missing':True},
            'LatLong': {'type': 'LatLong', 'Has Missing':True},
            'Class': {'type': 'Custom', 'comparator':class_comparator},
            'Coauthor': {'type': 'Custom', 'comparator': coauthor_comparator}# ,
            # 'Class_Count': {'type': 'Custom', 'comparator': idf},