How to use the lingpy.util.pb function in lingpy

To help you get started, we've selected a few lingpy examples based on popular ways the library is used in public projects.
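Judging from the examples below, util.pb is a thin tqdm-style progress bar: you can either wrap an iterable directly, or use it as a context manager and drive it yourself with update(). A minimal sketch of both calling patterns (the word pairs are toy data, invented for illustration):

from lingpy import util

pairs = [('kalu', 'kalo'), ('kalu', 'galo'), ('kalo', 'galo')]  # toy data

# pattern 1: wrap an iterable; the bar advances automatically
for w1, w2 in util.pb(pairs, desc='ITERATING PAIRS'):
    pass  # process each pair here

# pattern 2: context manager with manual updates
with util.pb(desc='MANUAL UPDATES', total=len(pairs)) as progress:
    for w1, w2 in pairs:
        progress.update(1)
        # process each pair here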


github lingpy / lingpy / lingpy/compare/lexstat.py (View on Github)
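Here util.pb appears twice as a context manager: first to track threshold estimation across language pairs, then to track per-concept sequence clustering; each loop iteration advances the bar with progress.update(1).
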
                with util.pb(
                        desc='THRESHOLD DETERMINATION',
                        total=len(self.pairs)-len(self.cols)) as progress:
                    for l1, l2 in self.pairs:
                        progress.update(1)
                        if l1 != l2:
                            pairs = self.pairs[l1, l2]
                            for p1, p2 in pairs:
                                dx = [align(p1, pairs[random.randint(
                                    0, len(pairs) - 1)][1])
                                      for i in range(len(pairs) // 20 or 5)]
                                thresholds.extend(dx)
            if thresholds:
                threshold = sum(thresholds) / len(thresholds) * 0.5
                self._meta['guessed_threshold'] = threshold

        with util.pb(
                desc='SEQUENCE CLUSTERING',
                total=len(self.rows)) as progress:
            for concept, indices, matrix in matrices:
                progress.update(1)

                # check for keyword to guess the threshold
                if kw['guess_threshold'] and kw['gt_mode'] == 'item':
                    t = clustering.best_threshold(matrix, kw['gt_trange'])
                # FIXME: considering new function here JML
                # elif kw['guess_threshold'] and kw['gt_mode'] == 'nullditem':
                #    pass
                else:
                    t = threshold

                c = fclust(matrix, t)
github lingpy / lingpy / lingpy/compare/lexstat.py (View on Github)
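In this excerpt the bar wraps the random correspondence calculation: tasks (computed elsewhere in the method) serves as the total, and the bar advances once per pair of language columns.
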
                        while len(words) < kw['rands']:
                            words += [words[random.randint(0, len(words)-1)]]

                    seqs[taxon], pros[taxon], weights[taxon] = [], [], []
                    for w in words:
                        cls = tokens2class(w.split(' '), self.model,
                                cldf=self._cldf)
                        pros[taxon].append(prosodic_string(w.split(' ')))
                        weights[taxon].append(prosodic_weights(pros[taxon][-1]))
                        seqs[taxon].append([
                            '{0}.{1}'.format(c, p) for c, p in zip(
                                 cls,
                                 [self._transform[pr] for pr in pros[taxon][-1]]
                                 )])

            with util.pb(
                    desc='RANDOM CORRESPONDENCE CALCULATION',
                    total=tasks) as progress:
                for (i, tA), (j, tB) in util.multicombinations2(
                        enumerate(self.cols)):
                    progress.update(1)
                    log.info(
                        "Calculating random alignments"
                        " for pair {0}/{1}.".format(tA, tB)
                    )
                    corrdist[tA, tB] = defaultdict(float)
                    for mode, gop, scale in kw['modes']:
                        corrs, included = calign.corrdist(
                            10.0,
                            [(seqs[tA][x], seqs[tB][y]) for x, y in sample],
                            [(weights[tA][x], weights[tB][y]) for x, y in
                                sample],
github lingpy / lingpy / lingpy/sequence/profile.py (View on Github)
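This example shows the second calling convention: pb wraps an iterable directly (the profile entries sorted by frequency), so the bar advances automatically and no update() calls are needed.
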
        # segmentize the raw word with clean_string (from lingpy.sequence.sound_classes)
        cleaned_string = clean_string(word,
                merge_vowels=merge_vowels, brackets=None, ignore_brackets=False,
                split_entries=False, preparse=None, rules=None,
                merge_geminates=merge_geminates)[0]

        # retain whole word if there are splitters in the word
        if [x for x in cleaned_string if x in brackets + splitters]:
            profile[word] += 1
            bad_words.add(word)
        else:
            for segment in cleaned_string.split(' '):
                profile[segment] += 1
            for segment in [x for x in word if x not in cleaned_string]:
                profile[segment] += 1
                nulls.add(segment)

    for s, f in pb(sorted(profile.items(), key=lambda x: x[1], reverse=True),
            desc='preparing profile'):
        sclass = token2class(s, 'dolgo')
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0' and s not in nulls:
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts:
            sound = clts.get(s, False)
            if not sound:
                ipa = '!'+s
            else:
                ipa = text_type(sound)
        else:
            ipa = s
github lingpy / lingpy / lingpy/compare/partial.py (View on Github)
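The same context-manager pattern as in lexstat.py, here for partial cognate detection; tasks approximates the number of language pairs and is passed as total.
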
        kw = dict(
            modes=rcParams['lexstat_modes'],
            factor=rcParams['align_factor'],
            restricted_chars=rcParams['restricted_chars'],
            runs=rcParams['lexstat_runs'],
            rands=rcParams['lexstat_rands'],
            limit=rcParams['lexstat_limit'],
            method=rcParams['lexstat_scoring_method'])
        kw.update(keywords)

        # determine the mode
        method = 'markov' if kw['method'] in ['markov', 'markov-chain', 'mc'] \
            else 'shuffle'

        corrdist = {}
        tasks = (self.width ** 2) / 2
        with util.pb(
                desc='RANDOM CORRESPONDENCE CALCULATION',
                total=tasks) as progress:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                progress.update(1)
                log.info(
                    "Calculating random alignments"
                    "for pair {0}/{1}.".format(tA, tB)
                )
                corrdist[tA, tB] = defaultdict(float)
                
                # create morpheme-segmented pairs
                pairs = self.pairs[tA, tB]
                new_nums, new_weights, new_pros = [], [], []
                for idxA, idxB in pairs:
                    for iA, iB in self._slices[idxA]:
github lingpy / lingpy / lingpy/sequence/profile.py (View on Github)
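Another iterable-wrapping example; since the length of an enumerate object cannot be inferred, total=len(profile) is passed explicitly.
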
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string)-1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post, cleaned_string):
                        profile[ctxA+segment+ctxB] += [(language, word)]
                    for segment in [x for x in word if x not in ' '.join(cleaned_string)]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))
    
    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for idx, (s, entries) in pb(
            enumerate(sorted(profile.items(), key=lambda x: len(x[1]),
                reverse=True)),
            desc='yielding entries', total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(sorted(set(langs), key=lambda x: langs.count(x),
            reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(sorted(set(words), key=lambda x: words.count(x),
            reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
github lingpy / lingpy / lingpy/align/sca.py (View on Github)
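During consensus computation the total is the number of cognate sets in the etymological dictionary, with one update(1) per set.
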
        rcParams['ref'] = keywords['ref']

        # reassign ref for convenience
        ref = keywords['ref']

        # check for existing alignments
        test = list(self.msa[ref].keys())[0]
        if 'alignment' not in self.msa[ref][test]:
            log.error(
                "No alignments could be found. You should carry out"
                " an alignment analysis first!")
            return

        # go on with the analysis
        cons_dict = {}
        with util.pb(desc='CONSENSUS', total=len(self.etd[ref])) as progress:
            for cog in self.etd[ref]:
                progress.update(1)

                if cog in self.msa[ref]:
                    log.debug("Analyzing cognate set number '{0}'...".format(cog))

                    # temporary solution for sound-class integration
                    if classes:
                        _classes = []
                        if weights:
                            keywords['weights'] = prosodic_weights(
                                prosodic_string(self.msa[ref][cog]['_sonority_consensus'])
                            )
                        else:
                            keywords['weights'] = [
                                1.0 for i in range(len(self.msa[ref][cog]['alignment']))]
github lingpy / lingpy / lingpy/compare/partial.py (View on Github)
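Progress reporting for partial sequence clustering: the bar spans len(self.rows) concepts and advances once per concept matrix.
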
        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(method=method, scale=scale,
                factor=factor, restricted_chars=restricted_chars, mode=mode,
                gop=gop, imap_mode=kw['imap_mode'],
                split_on_tones=split_on_tones)
        k = 0
        C = defaultdict(list) # stores the pcogids
        G = {} # stores the graphs
        with util.pb(desc='PARTIAL SEQUENCE CLUSTERING', total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(threshold, matrix,
                            taxa=list(range(len(matrix))), revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                            matrix, taxa=list(range(len(matrix))), 
                            revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold, matrix, 
                            taxa = list(range(len(matrix))),
                            max_steps=kw['max_steps'],
                            inflation=kw['inflation'],
                            expansion=kw['expansion'],
github lingpy / lingpy / lingpy/compare/partial.py (View on Github)
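Note that here the bar is bound to the name pb itself (as pb), shadowing the imported helper inside the with block; pb.update(1) then advances it once per language pair.
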
            subset=False)
        kw.update(keywords)

        self._included = {}
        corrdist = {}

        if kw['preprocessing']:
            if kw['ref'] not in self.header:
                self.cluster(
                    method=kw['preprocessing_method'],
                    threshold=kw['preprocessing_threshold'],
                    gop=kw['gop'],
                    cluster_method=kw['cluster_method'],
                    ref=kw['ref'])

        with util.pb(
                desc='CORRESPONDENCE CALCULATION',
                total=self.width ** 2 / 2) as pb:
            for (i, tA), (j, tB) in util.multicombinations2(
                    enumerate(self.cols)):
                pb.update(1)
                log.info("Calculating alignments for pair {0} / {1}.".format(
                    tA, tB))

                corrdist[tA, tB] = defaultdict(float)
                for mode, gop, scale in kw['modes']:
                    pairs = self.pairs[tA, tB]
                    if kw['subset']:
                        pairs = [
                                pair for pair in pairs if pair in
                                self.subsets[tA, tB]]