How to use the dedupe.training.consoleLabel function in dedupe

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / examples / patent_example / patent_example_twostage.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github markhuberty / psClean / code / dedupe / gb_weighted / patent_example_twostage_gb.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github dedupeio / dedupe / examples / csv_example / csv_example.py View on Github external
# look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)

    # ## Active learning

    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.

    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print 'starting active labeling...'
    deduper.train(data_sample, dedupe.training.consoleLabel)

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

# ## Blocking

print 'blocking...'
# Initialize our blocker. We'll learn our blocking rules if we haven't
# loaded them from a saved settings file.
blocker = deduper.blockingFunction()

# Save our weights and predicates to disk.  If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
deduper.writeSettings(settings_file)
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github dedupeio / dedupe / examples / csv_example / csv_data_matching_example.py View on Github external
# look for it and load it in.
    # __Note:__ if you want to train from scratch, delete the training_file
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)

    # ## Active learning

    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.

    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print 'starting active labeling...'
    deduper.train(data_sample, dedupe.training.consoleLabel)

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

# ## Blocking

print 'blocking...'
# Initialize our blocker. We'll learn our blocking rules if we haven't
# loaded them from a saved settings file.
blocker = deduper.blockingFunction(constrained_matching)

# Save our weights and predicates to disk.  If the settings file
# exists, we will skip all the training and learning next time we run
# this file.
deduper.writeSettings(settings_file)
github markhuberty / psClean / code / amadeus / amadeus_dedupe.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
    # }
    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        deduper.train(data_sample, training_file)

    # ## Active learning

    # Starts the training loop. Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.

    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print 'starting active labeling...'
    deduper.train(data_sample, dedupe.training.consoleLabel)

    # When finished, save our training away to disk
    deduper.writeTraining(training_file)

# Blocking
deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                         dedupe.predicates.commonSetElementPredicate),
                              'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                              }
                             )
time_start = time.time()
print 'blocking...'

# Initialize the blocker
blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(ppc,
                                                                    dupes,
github markhuberty / psClean / code / dedupe / archive / fi / patent_example_twostage_fi.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github dedupeio / dedupe / examples / patent_example / patent_example.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
github markhuberty / psClean / code / dedupe / archive / nl / patent_example_twostage_nl.py View on Github external
#  1: [[{field:val dict of record 1}, {field_val dict of record 2}], ...(more match pairs)]
        # }
        if os.path.exists(r_training_file):
            print 'reading labeled examples from ', r_training_file
            deduper.train(data_sample, r_training_file)

        # ## Active learning

        # Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,