How to use the dedupe.consoleLabel function in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.
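
All of these snippets follow the same basic loop: build a matcher (dedupe.Dedupe, dedupe.Gazetteer, or dedupe.RecordLink), load any training data saved from a previous run, call dedupe.consoleLabel() to label candidate pairs interactively, then train and persist the results. Here is a minimal sketch of that loop against the dedupe 1.x API these snippets use (the field definition, file names, and toy records below are illustrative, not taken from any of the projects):

import os
import dedupe

# Toy records for illustration; real data would come from a CSV or database.
data_d = {
    0: {'name': 'Acme Corp', 'address': '123 Main St'},
    1: {'name': 'ACME Corporation', 'address': '123 Main Street'},
}

fields = [
    {'field': 'name', 'type': 'String'},
    {'field': 'address', 'type': 'String', 'has missing': True},
]
training_file = 'training.json'
settings_file = 'learned_settings'

deduper = dedupe.Dedupe(fields)
deduper.sample(data_d, 100)

# Reuse labels from a previous session if they exist.
if os.path.exists(training_file):
    with open(training_file) as tf:
        deduper.readTraining(tf)

# Interactive labeling: dedupe shows the pair it is least certain about;
# answer 'y', 'n' or 'u', and press 'f' when finished.
dedupe.consoleLabel(deduper)

deduper.train()

# Persist the labels and the learned weights/predicates for the next run.
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)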

github open-apparel-registry / open-apparel-registry / research / dedupe / gazetteer / oar_gazeteer_example.py
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file) as tf:
            gazetteer.readTraining(tf)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.consoleLabel(gazetteer)

    gazetteer.train()

    # When finished, save our training away to disk
    with open(training_file, 'w') as tf:
        gazetteer.writeTraining(tf)

    # Make the canonical set
    gazetteer.index(canonical)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    sys.setrecursionlimit(3000)
    with open(settings_file, 'wb') as sf:
        gazetteer.writeSettings(sf, index=True)
github dedupeio / dedupe / examples / csv_example / csv_example.py
    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        deduper.readTraining(training_file)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.consoleLabel(deduper)

    deduper.train()

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    deduper.writeSettings(settings_file)


# ## Blocking

print('blocking...')
github aliasrobotics / RVD / rvd_tools / database / duplicates.py
        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(self.training_file):
            gray("Reading labeled examples from ", self.training_file)
            with open(self.training_file, "rb") as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        gray("Starting active labeling...")
        dedupe.consoleLabel(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        gray("Training...")
        deduper.train()

        # When finished, save our training to disk
        gray("Saving results to training file...")
        with open(self.training_file, "w+") as tf:
            deduper.writeTraining(tf)

        # Save our weights and predicates to disk.  If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        gray("Saving weights and predicates to settings file...")
        with open(self.settings_file, "wb+") as sf:
            deduper.writeSettings(sf)
github Lyonk71 / pandas-dedupe / pandas_dedupe / dedupe_dataframe.py
            from.
        settings_file : str
            A path to a settings file that will be loaded if it exists.
            
        Returns
        -------
        dedupe.Dedupe
            A trained dedupe model instance.
    """
    # To train dedupe, we feed it a sample of records.
    sample_num = math.floor(len(data) * sample_size)
    deduper.sample(data, sample_num)

    print('starting active labeling...')

    dedupe.consoleLabel(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)
    
    return deduper
github titipata / grant_database / dedupe / link_affiliation.py
    grid_dict = pickle.load(open('grid_dict.pickle', 'rb'))

    fields = [{'field' : 'insti_city', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_name', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_code', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_country', 'type': 'String', 'has missing': True},
              ]

    linker = dedupe.RecordLink(fields, num_cores=args.cores)

    linker.sample(grant_affils_dict, grid_dict, args.n)
    if os.path.exists(args.training):
        linker = read_training_file(linker, args.training)

    if not args.skiplabel:
        dedupe.consoleLabel(linker)

    if args.verbose:
        print('training linker...')
    linker.train(ppc=None, index_predicates=not args.nopredicates)
    write_training_file(linker, args.training) # update training file

    if args.verbose:
        print('finding threshold...')
    if args.threshold == 0:
        args.threshold = linker.threshold(grid_dict, grant_affils_dict,
                                            recall_weight=0.5)


    linked_records = linker.match(grid_dict, grant_affils_dict,
                                  threshold=args.threshold)
    # add grid_id to grant_affils_dict
github dedupeio / dedupe / examples / record_linkage_example / record_linkage_example.py
    # If we have training data saved from a previous run of linker,
    # look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        linker.readTraining(training_file)

    # ## Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as matches
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')

    dedupe.consoleLabel(linker)

    linker.train()

    # When finished, save our training away to disk
    linker.writeTraining(training_file)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    linker.writeSettings(settings_file)


# ## Blocking

# ## Clustering
github titipata / grant_database / dedupe / link_grant.py
if __name__ == '__main__':
    params = Parameters()
    n_sample = params.n_sample

    print('prepare dataset...')
    nih_linkage_dict, nsf_linkage_dict = prepare_linkage_dict()

    fields = [{'field' : 'full_name', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_city', 'type': 'String', 'has missing' : True},
              {'field' : 'insti_name', 'type': 'String', 'has missing' : True}]
    linker = dedupe.RecordLink(fields)
    linker.sample(nih_linkage_dict, nsf_linkage_dict, params.n_sample)
    if os.path.exists(params.training_file):
        linker = read_training_file(linker, params.training_file)

    dedupe.consoleLabel(linker)
    print('training linker...')
    linker.train(ppc=None)
    write_training_file(linker, params.training_file) # update training file

    print('finding threshold...')
    if params.threshold is None:
        params.threshold = linker.threshold(nih_linkage_dict, nsf_linkage_dict,
                                            recall_weight=2.0)

    linked_records = linker.match(nih_linkage_dict, nsf_linkage_dict,
                                  threshold=params.threshold)
    print('# of record linkage = %s' % len(linked_records))
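
These snippets all target the dedupe 1.x API. In dedupe 2.x the camelCase convenience functions were renamed, so consoleLabel and its companions no longer exist under those names. A rough sketch of the 2.x equivalents, reusing the illustrative names from the sketch at the top (check the API of the version you actually have installed):

import dedupe

# Assumed dedupe 2.x equivalents of the 1.x calls used above.
deduper = dedupe.Dedupe(fields)
deduper.prepare_training(data_d)     # replaces deduper.sample(...)
dedupe.console_label(deduper)        # replaces dedupe.consoleLabel(deduper)
deduper.train()

with open(training_file, 'w') as tf:
    deduper.write_training(tf)       # replaces deduper.writeTraining(tf)
with open(settings_file, 'wb') as sf:
    deduper.write_settings(sf)       # replaces deduper.writeSettings(sf)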