How to use the dedupe.StaticDedupe class in dedupe

To help you get started, we’ve selected a few dedupe.StaticDedupe examples based on popular ways the class is used in public projects.
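
Every example below follows the same pattern: train a dedupe.Dedupe model once, write the learned settings to disk, and on later runs reload them with dedupe.StaticDedupe, which skips labeling and training entirely. Here is a minimal sketch of that pattern, assuming the dedupe 2.x snake_case API; the settings path and toy records are placeholders:

import os

import dedupe

settings_file = 'learned_settings'   # hypothetical path
data_d = {
    1: {'name': 'Bob Smith', 'city': 'Chicago'},
    2: {'name': 'Robert Smith', 'city': 'Chicago'},
}

if os.path.exists(settings_file):
    # Reuse the trained model: StaticDedupe reads learned weights and
    # blocking predicates from an open binary file.
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Train interactively once, then persist the settings for reuse.
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'city', 'type': 'ShortString'}]
    deduper = dedupe.Dedupe(fields)
    deduper.prepare_training(data_d)
    dedupe.console_label(deduper)    # label candidate pairs by hand
    deduper.train()
    with open(settings_file, 'wb') as f:
        deduper.write_settings(f)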


github dedupeio / dedupe / tests / canonical.py
data_d, header = canonicalImport(raw_data)

training_pairs = dedupe.trainingDataDedupe(data_d, 
                                           'unique_id', 
                                           5000)

duplicates_s = set(frozenset(pair) for pair in training_pairs['match'])

t0 = time.time()

print('number of known duplicate pairs', len(duplicates_s))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)
else:
    fields = [{'field' : 'name', 'type': 'String'},
              {'field' : 'name', 'type': 'Exact'},
              {'field' : 'address', 'type': 'String'},
              {'field' : 'cuisine', 'type': 'ShortString', 
               'has missing' : True},
              {'field' : 'city', 'type' : 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.sample(data_d, 10000)
    deduper.markPairs(training_pairs)
    deduper.train()
    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)
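
The next snippet is a later revision of the same test: it derives the known duplicate pairs directly from the unique_id field, replaces deduper.sample() with prepare_training(), and disables index predicates during training, but otherwise follows the same settings-file pattern.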
github dedupeio / dedupe / tests / canonical.py
duplicates = set()
for _, pair in groupby(sorted(data_d.items(),
                              key=lambda x: x[1]['unique_id']),
                       key=lambda x: x[1]['unique_id']):
    pair = list(pair)
    if len(pair) == 2:
        a, b = pair
        duplicates.add(frozenset((a[0], b[0])))

t0 = time.time()

print('number of known duplicate pairs', len(duplicates))

if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f, 1)

else:
    fields = [{'field': 'name', 'type': 'String'},
              {'field': 'name', 'type': 'Exact'},
              {'field': 'address', 'type': 'String'},
              {'field': 'cuisine', 'type': 'ShortString',
               'has missing': True},
              {'field': 'city', 'type': 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    deduper.prepare_training(data_d, sample_size=10000)
    deduper.markPairs(training_pairs)
    deduper.train(index_predicates=False)
    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)
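
Note that this variant mixes the newer prepare_training() call with the older camelCase method names from dedupe 1.x. In dedupe 2.x the remaining training steps are spelled in snake_case; a hedged equivalent of the calls above:

deduper.mark_pairs(training_pairs)
deduper.train(index_predicates=False)
with open(settings_file, 'wb') as f:
    deduper.write_settings(f)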
github aliasrobotics / RVD / rvd_tools / database / duplicates.py
def find_duplicates(self, train, push, label):
        """
        Find duplicates and print them via stdout
        """
        # data_d = self.read_data()
        data_d = self.read_data(label, invalid=False)
        # pprint.pprint(data_d)

        if train:
            deduper = self.train(data_d)
        else:
            if os.path.exists(self.settings_file):
                print("reading from", self.settings_file)
                with open(self.settings_file, "rb") as f:
                    deduper = dedupe.StaticDedupe(f)
            else:
                red("Error: settings file does not exist, stoping")
                sys.exit(1)

        cyan("Finding the threshold for data...")
        threshold = deduper.threshold(data_d, recall_weight=1)

        cyan("Clustering...")
        clustered_dupes = deduper.match(data_d, threshold)

        cyan("Number of duplicate sets: " + str(len(clustered_dupes)))
        for aset in clustered_dupes:
            yellow("Found a duplicated pair...")
            ids, values = aset
            primary_issue = None  # the primary ticket in a set of
            # duplicates; all the duplicates should point to it
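
This project targets the dedupe 1.x clustering API, where threshold() estimates a score cutoff and match() clusters with it. In dedupe 2.x both steps were folded into a single partition() call; a hedged equivalent of the two calls above:

clustered_dupes = deduper.partition(data_d, threshold=0.5)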
github aliasrobotics / RVD / rvd_tools / database / duplicates.py
NOTE: should be called from the RVD repository directory.

        :param flaw: Flaw
        :return: list
        """
        data_d = self.read_data(None, invalid=False)  # data dict
        # pprint.pprint(data_d)

        # Append the flaw to the data dictionary with the ID 0
        data_d[0] = flaw.document_duplicates()
        # pprint.pprint(data_d)

        if os.path.exists(self.settings_file):
            print("reading from", self.settings_file)
            with open(self.settings_file, "rb") as f:
                deduper = dedupe.StaticDedupe(f)
        else:
            red("Error: settings file does not exist, stoping")
            sys.exit(1)

        cyan("Finding the threshold for data...")
        threshold = deduper.threshold(data_d, recall_weight=1)

        cyan("Clustering...")
        clustered_dupes = deduper.match(data_d, threshold)
        # pprint.pprint(clustered_dupes)  # debug purposes

        #  If ID 0 (the flaw passed as an argument) appears in a cluster,
        #  the flaw is a duplicate
        for cluster in clustered_dupes:
            ids, values = cluster
            if 0 in ids:
                return list(ids)
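
Seeding data_d with the incoming flaw under ID 0 turns clustering into a membership test: if any resulting cluster contains ID 0, the flaw duplicates existing records, and their IDs are returned to the caller.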
github dedupeio / dedupe / examples / csv_example / csv_example.py
for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d


print('importing data ...')
data_d = readData(input_file)

# ## Training

if os.path.exists(settings_file):
    print('reading from', settings_file)
    deduper = dedupe.StaticDedupe(settings_file)

else:
    # Define the fields dedupe will pay attention to
    #
    # Notice how we are telling dedupe to use a custom field comparator
    # for the 'Zip' field. 
    fields = {
        'Site name': {'type': 'String'},
        'Address': {'type': 'String'},
        'Zip': {'type': 'Custom', 
                'comparator' : sameOrNotComparator, 
                'Has Missing' : True},
        'Phone': {'type': 'String', 'Has Missing' : True},
        }

    # Create a new deduper object and pass our data model to it.
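
The snippet refers to sameOrNotComparator, which is defined elsewhere in the example file. In this old dict-style API, a custom comparator takes two field values and returns a numeric distance; a minimal sketch of the expected shape (the exact return convention here is an assumption):

def sameOrNotComparator(field_1, field_2):
    # Treat the return value as a distance: 0 for identical ZIPs,
    # 1 for different ones; missing values fall through to None,
    # which the 'Has Missing' flag handles.
    if field_1 and field_2:
        return 0 if field_1 == field_2 else 1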
github Lyonk71 / pandas-dedupe / pandas_dedupe / dedupe_dataframe.py
Returns
        -------
        dedupe.Dedupe
            A dedupe model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, field_properties)
    
    if not update_model:
        
        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticDedupe(f)
        
        # Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Dedupe(fields)
            
            # Launch active learning
            deduper = _active_learning(data, sample_size, deduper, training_file, settings_file)
            
    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Dedupe(fields)
        
        # Import existing model
        print('reading labeled examples from ', training_file)
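
The excerpt ends mid-branch. In dedupe 2.x, previously labeled examples are usually fed back in through prepare_training(); a hedged sketch of how the loaded training file might be consumed, not the library's verbatim continuation:

with open(training_file) as tf:
    deduper.prepare_training(data, training_file=tf)
deduper.train()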