How to use the lingpy.util.charstring function in lingpy

To help you get started, we’ve selected a few examples of lingpy.util.charstring, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
# create an index
        if not hasattr(self, 'freqs'):
            self.chars = set()
            self.freqs = {}
            for taxon in self.cols:
                self.freqs[taxon] = Counter()
                for word in self.get_list(
                        col=taxon, entry=self._numbers, flat=True):
                    self.freqs[taxon].update(word)
                self.chars = self.chars.union(self.freqs[taxon].keys())

            self.rchars = sorted(
                    set(char.split('.', 1)[1] for char in self.chars))
            self.chars = sorted(self.chars) \
                + [charstring(i + 1) for i in range(self.width)]
            if not self.chars:
                raise ValueError("Your input data contains no entries!")
            self.bad_chars = [char for char in self.chars if char[2] == '0']
            if len(self.bad_chars) / len(self.chars) > \
                    rcParams['lexstat_bad_chars_limit']:
                raise ValueError(
                    "{0:.0f}% of the unique characters in your word "
                    "list are not "
                    "recognized by {1}. You should set check=True!".format(
                        100 * len(self.bad_chars) / len(self.chars),
                        util.PROG))

        if not hasattr(self, "scorer"):
            self._meta['scorer'] = {}

        # create a scoring dictionary
github lingpy / lingpy / lingpy / compare / partial.py View on Github external
threshold,
                        new_nums,
                        new_weights,
                        new_pros,
                        gop,
                        scale,
                        kw['factor'],
                        self.bscorer,
                        mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
[self[pair, self._weights] for pair in pairs],
                        [self[pair, self._prostrings] for pair in pairs],
                        gop,
                        scale,
                        kw['factor'],
                        self.bscorer,
                        mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = charstring(i + 1)
                        elif b == '-':
                            b = charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
def lexstat_align(x, y):
            """
            Align the entries ``x`` and ``y`` with ``calign.align_pair``.

            ``x`` and ``y`` are keys used to index ``self``. Both the
            enclosing ``self`` and the keyword dictionary ``kw`` are taken
            from the surrounding scope.

            Returns the element at index 2 of the value returned by
            ``calign.align_pair`` (presumably the alignment score or
            distance -- TODO confirm against the calign documentation).
            """
            def scores_against(tokens, other):
                # For every token, look up self.cscorer under the key
                # (charstring of the *other* entry's 'langid', token).
                return [
                    self.cscorer[charstring(self[other, 'langid']), token]
                    for token in tokens]

            numbers_x = self[x, self._numbers]
            numbers_y = self[y, self._numbers]

            result = calign.align_pair(
                numbers_x,
                numbers_y,
                scores_against(numbers_x, y),
                scores_against(numbers_y, x),
                self[x, self._prostrings],
                self[y, self._prostrings],
                1,
                kw['scale'],
                kw['factor'],
                self.cscorer,
                kw['mode'],
                kw['restricted_chars'],
                1)
            return result[2]
github lingpy / lingpy / lingpy / compare / partial.py View on Github external
new_weights,
                        new_pros,
                        gop,
                        scale,
                        kw['factor'],
                        self.bscorer,
                        mode,
                        kw['restricted_chars'])

                    # change representation of gaps
                    for (a, b), d in corrs.items():
                        # XXX check for bias XXX
                        if a == '-':
                            a = util.charstring(i + 1)
                        elif b == '-':
                            b = util.charstring(j + 1)
                        corrdist[tA, tB][a, b] += d / float(len(kw['modes']))

        return corrdist
github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
# get the correspondence distribution
        self._corrdist = self._get_corrdist(**kw)
        # get the random distribution
        self._randist = self._get_randist(**kw)

        # get the average gop
        gop = sum([m[1] for m in kw['modes']]) / len(kw['modes'])

        # create the new scoring matrix
        matrix = [[c for c in line] for line in self.bscorer.matrix]
        char_dict = self.bscorer.chars2int

        for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)):
            for charA, charB in product(
                list(self.freqs[tA]) + [charstring(i + 1)],
                list(self.freqs[tB]) + [charstring(j + 1)]
            ):
                exp = self._randist.get(
                        (tA, tB), {}).get((charA, charB), False)
                att = self._corrdist.get(
                        (tA, tB), {}).get((charA, charB), False)
                # in the following we follow the former lexstat protocol
                if att <= kw['smooth'] and i != j:
                    att = False

                if att and exp:
                    score = np.log2((att ** 2) / (exp ** 2))
                elif att and not exp:
                    score = np.log2((att ** 2) / kw['unexpected'])
                elif exp and not att:
                    score = kw['unattested']  # XXX gop ???
github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
gop,
                            scale,
                            kw['factor'],
                            self.bscorer,
                            mode,
                            kw['restricted_chars'])

                        # change representation of gaps
                        for a, b in list(corrs.keys()):
                            # get the correspondence count
                            d = corrs[a, b] * self._included[tA, tB] / included
                            # XXX check XXX* len(self.pairs[tA,tB]) / runs

                            # check for gaps
                            if a == '-':
                                a = charstring(i + 1)
                            elif b == '-':
                                b = charstring(j + 1)

                            corrdist[tA, tB][a, b] += d / len(kw['modes'])
        return corrdist