How to use the dedupe.predicates.latLongGridPredicate function

To help you get started, we’ve selected a few dedupe examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / tests / test_predicates.py View on Github external
def test_precise_latlong(self):
        """Grid predicate yields the expected cell key and rejects (0, 0)."""
        # self.latlong1 (fixture set up elsewhere) maps onto one grid-cell key.
        result = predicates.latLongGridPredicate(self.latlong1)
        assert result == (u'[42.5, -5.0]',)
        # The (0, 0) coordinate produces no block key at all.
        result = predicates.latLongGridPredicate((0, 0))
        assert result == ()
github dedupeio / dedupe / tests / test_predicates.py View on Github external
def test_precise_latlong(self):
        """latLongGridPredicate blocks a real coordinate but skips (0, 0)."""
        # A genuine coordinate fixture produces exactly one grid-cell string.
        keys = predicates.latLongGridPredicate(self.latlong1)
        assert keys == (u'[42.5, -5.0]',)
        # (0, 0) is treated as missing data: no key is emitted.
        keys = predicates.latLongGridPredicate((0, 0))
        assert keys == ()
github markhuberty / psClean / code / dedupe / archive / it / patent_example_twostage_it.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
github dedupeio / dedupe / dedupe / variables / latlong.py View on Github external
from math import sqrt

from .base import FieldType
from dedupe import predicates
from haversine import haversine


class LatLongType(FieldType):
    """Coordinate field compared by the square root of haversine distance."""

    type = "LatLong"

    _predicate_functions = [predicates.latLongGridPredicate]

    @staticmethod
    def comparator(x, y):
        # sqrt compresses large great-circle distances — presumably to
        # reduce the skew of far-apart pairs; confirm with training docs.
        distance = haversine(x, y)
        return sqrt(distance)
github markhuberty / psClean / code / dedupe / archive / dk / patent_example_twostage_dk.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
github dedupeio / dedupe / dedupe / variables / fieldclasses.py View on Github external
class TextType(StringType) :
    """String field compared via cosine similarity over a reference corpus."""

    type = "Text"

    def __init__(self, definition) :
        super(TextType, self).__init__(definition)

        # Fall back to an empty corpus when the field definition omits one;
        # setdefault also records the default back onto the definition,
        # matching the original in-place mutation.
        corpus = definition.setdefault('corpus', [])
        self.comparator = dedupe.distance.CosineTextSimilarity(corpus)

class LatLongType(FieldType) :
    """Coordinate field compared by haversine distance.

    A (0.0, 0.0) coordinate is treated as missing data.
    """

    type = "LatLong"

    _predicate_functions = [predicates.latLongGridPredicate]

    @staticmethod
    def comparator(field_1, field_2) :
        # (0.0, 0.0) acts as the missing-coordinate sentinel: the distance
        # is undefined, so report NaN instead of a spurious zero distance.
        missing = (0.0, 0.0)
        if missing in (field_1, field_2):
            return numpy.nan
        return haversine(field_1, field_2)

class SetType(FieldType) :
    type = "Set"

    _predicate_functions = (dedupe.predicates.wholeSetPredicate,
                            dedupe.predicates.commonSetElementPredicate,
                            dedupe.predicates.lastSetElementPredicate,
                            dedupe.predicates.commonTwoElementsPredicate,
                            dedupe.predicates.commonThreeElementsPredicate,
github markhuberty / psClean / code / dedupe / archive / fi / patent_example_twostage_fi.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
github markhuberty / psClean / code / dedupe / archive / be / patent_example_twostage_be.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
github markhuberty / psClean / code / dedupe / archive / nl / patent_example_twostage_nl.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes
github markhuberty / psClean / code / dedupe / archive / es / patent_example_twostage_es.py View on Github external
# Starts the training loop. Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.

        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        # NOTE(review): Python 2 syntax (print statement). This snippet is
        # excerpted mid-script; the mixed indentation reflects enclosing
        # blocks that are not visible here.
        print 'starting active labeling...'
        deduper.train(data_sample, dedupe.training.consoleLabel)

        # When finished, save our training away to disk
        deduper.writeTraining(r_training_file)

# ## Blocking
    # Register extra blocker types: set predicates for the 'Custom' fields
    # and the lat/long grid predicate for coordinate fields.
    deduper.blocker_types.update({'Custom': (dedupe.predicates.wholeSetPredicate,
                                             dedupe.predicates.commonSetElementPredicate),
                                  'LatLong' : (dedupe.predicates.latLongGridPredicate,)
                                  }
                                 )
    time_start = time.time()
    print 'blocking...'
    # Initialize our blocker, which determines our field weights and blocking 
    # predicates based on our training data
    #blocker = deduper.blockingFunction(r_ppc, r_uncovered_dupes)
    # Wrapper presumably searches for blocking settings that satisfy the
    # ppc / uncovered-dupes constraints and returns a falsy blocker on
    # failure — confirm against patent_util.blockingSettingsWrapper.
    blocker, ppc_final, ucd_final = patent_util.blockingSettingsWrapper(r_ppc,
                                                                        r_uncovered_dupes,
                                                                        deduper
                                                                        )

    if not blocker:
        print 'No valid blocking settings found'
        print 'Starting ppc value: %s' % r_ppc
        print 'Starting uncovered_dupes value: %s' % r_uncovered_dupes