How to use lingpy - 10 common examples

To help you get started, we’ve selected a few lingpy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github lingpy / lingpy / lingpy / compare / lexstat.py View on Github external
desc='THRESHOLD DETERMINATION',
                        total=len(self.pairs)-len(self.cols)) as progress:
                    for l1, l2 in self.pairs:
                        progress.update(1)
                        if l1 != l2:
                            pairs = self.pairs[l1, l2]
                            for p1, p2 in pairs:
                                dx = [align(p1, pairs[random.randint(
                                    0, len(pairs) - 1)][1])
                                      for i in range(len(pairs) // 20 or 5)]
                                thresholds.extend(dx)
            if thresholds:
                threshold = sum(thresholds) / len(thresholds) * 0.5
                self._meta['guessed_threshold'] = threshold

        with util.pb(
                desc='SEQUENCE CLUSTERING',
                total=len(self.rows)) as progress:
            for concept, indices, matrix in matrices:
                progress.update(1)

                # check for keyword to guess the threshold
                if kw['guess_threshold'] and kw['gt_mode'] == 'item':
                    t = clustering.best_threshold(matrix, kw['gt_trange'])
                # FIXME: considering new function here JML
                # elif kw['guess_threshold'] and kw['gt_mode'] == 'nullditem':
                #    pass
                else:
                    t = threshold

                c = fclust(matrix, t)
github lingpy / lingpy / lingpy / basic / parser.py View on Github external
def read_conf(conf=''):
    """Parse a QLC configuration file into name/class lookup dictionaries.

    Parameters
    ----------
    conf : str (default = '')
        Path to the configuration file. When empty, the bundled
        ``conf/qlc.rc`` file shipped with the package is used instead.
    """
    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf; each line is expected to
    # hold three tab-separated fields: name, class expression, and a
    # comma-separated alias list (unpacked in the loop below)
    tmp = [line.split('\t') for line in util.read_config_file(conf)]
    
    # NOTE(review): alias2D is initialized but never populated in the code
    # visible here -- presumably it is filled further down; this view may
    # be truncated.
    aliasD, classD, class_stringD, alias2D = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there, keyed both lower- and upper-case
        aliasD[name.lower()] = aliasD[name.upper()] = name
        # NOTE(review): eval() executes arbitrary code from the config file;
        # safe only as long as qlc.rc is a trusted, package-controlled file
        classD[name.lower()] = classD[name.upper()] = eval(cls)
        class_stringD[name.lower()] = class_stringD[name.upper()] = cls

        # add the aliases, mapping each (in both cases) to the same name/class
        for a in alias.split(','):
            aliasD[a.lower()] = aliasD[a.upper()] = name
            classD[a.lower()] = classD[a.upper()] = eval(cls)
            class_stringD[a.lower()] = class_stringD[a.upper()] = cls
github lingpy / lingpy / lingpy / sequence / tokenizer.py View on Github external
def _init_rules(self, f):
        """Load the orthography rules file *f* into two parallel lists.

        Each line of the file must contain a regex pattern and its
        replacement string, separated by a single tab.  Compiled patterns
        are appended to ``self.op_rules`` and the replacement strings to
        ``self.op_replacements``.
        """
        for entry in util.read_config_file(f, normalize='NFD'):
            pattern, target = entry.split("\t")
            # strip stray whitespace from both fields before storing them
            self.op_rules.append(re.compile(pattern.strip()))
            self.op_replacements.append(target.strip())

        # sanity check: both lists must have grown in lockstep
        if len(self.op_rules) != len(self.op_replacements):
            raise ValueError("Number of inputs does not match number of outputs in the rules file.")
github lingpy / lingpy / lingpy / convert / html.py View on Github external
# open the infile
    if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''
github lingpy / lingpy / lingpy / convert / html.py View on Github external
if not os.path.exists(infile):
        infile = infile + '.alm'
    data = util.read_text_file(infile)

    # create the outfile
    if not filename:
        filename = rcParams['filename']

    # read in the templates
    html = util.read_text_file(main_template or template_path('alm2html.html'))
    if not table_template:
        table_template = template_path(
            'alm2html.table.js.html' if confidence else 'alm2html.table.html')
    table = util.read_text_file(table_template)
    css = util.read_text_file(template_path('alm.css'))
    js = util.read_text_file(template_path('alm.js'))

    # define a label function for the taxa
    label = lambda x: keywords['labels'][x] if x in keywords['labels'] else x

    # check for windows-compatibility
    data = data.replace(os.linesep, '\n')[:-1]

    # split the data into blocks
    blocks = data.split('\n\n')

    # retrieve the dataset
    dataset = dataset or blocks[0]

    # create the outstring
    tmp_str = ''
github lingpy / lingpy / lingpy / convert / html.py View on Github external
"""
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
github lingpy / lingpy / lingpy / read / csv.py View on Github external
A list-representation of the CSV file.

    """
    # check for correct fileformat
    if fileformat:
        infile = filename + '.' + fileformat
    else:
        infile = filename

    if dtype is None:
        dtype = []

    l = []

    # open the file
    infile = read_text_file(infile, lines=True, normalize="NFC")

    # check for header
    idx = 0 if header else -1

    for i, line in enumerate(infile):
        if line and (not comment or not line.startswith(comment)) and idx != i:
            if strip_lines:
                cells = [c.strip() for c in line.strip().split(sep)]
            else:
                cells = [c.strip() for c in line.split(sep)]
            if not dtype:
                l += [cells]
            else:
                l += [[f(c) for f, c in zip(dtype, cells)]]

    return l
github lingpy / lingpy / lingpy / read / phylip.py View on Github external
set this value to 0 and make sure to use tabstops as separators between
        values in your matrix file.
    comment : str (default = '#')
        The comment character to be used if your file contains additional
        information which should be ignored.

    Returns
    -------
    data : tuple
        A tuple consisting of a list of taxa and a matrix.

    """
    if '\n' in filename:
        lines = [f for f in filename.split('\n') if f.strip()]
    else:
        lines = read_text_file(filename, normalize="NFC", lines=True)

    taxa, matrix = [], []

    for line in lines[1:]:
        if not line.startswith(comment):
            if taxlen > 0:
                taxa.append(line[:taxlen].strip())
                matrix.append([float(val) for val in
                               re.split(r'\s+', line[taxlen + 1:].strip())])
            else:
                splits = line.split('\t')
                taxa.append(splits[0])
                matrix.append([float(val.strip()) for val in splits[1:]])

    return taxa, matrix
github lingpy / lingpy / lingpy / sequence / tokenizer.py View on Github external
# if no orthography profile is specified, simply return 
        # Unicode grapheme clusters, regex pattern "\X"
        if self.orthography_profile == None:
            return self.grapheme_clusters(string)

        parses = []
        for word in string.split():
            parse = getParse(self.root, word)

            # case where the parsing fails
            if len(parse) == 0:
                # replace characters in string but not in orthography profile with 
                parse = " "+self.find_missing_characters(self.characters(word))
                # write problematic stuff to standard error
                log.debug("The string '{0}' does not parse given the specified orthography profile {1}.\n".format(word, self.orthography_profile))
            
            parses.append(parse)

        # remove the outter word boundaries
        result = "".join(parses).replace("##", "#")
        result = result.rstrip("#")
        result = result.lstrip("#")
        return result.strip()
github lingpy / lingpy / lingpy / sequence / tokenizer.py View on Github external
tokens = line.split("\t") 
            grapheme = tokens[0].strip()

            # check for duplicates in the orthography profile (fail if dups)
            if not grapheme in self.op_graphemes:
                self.op_graphemes[grapheme] = 1
            else:
                raise Exception("You have a duplicate in your orthography profile.")

            if len(tokens) == 1:
                continue

            for i, token in enumerate(tokens):
                token = token.strip()
                self.mappings[grapheme, self.column_labels[i].lower()] = token
                log.debug('%s %s' % (grapheme, self.column_labels[i].lower()))

        # print the tree structure if debug mode is on
        if log.get_logger().getEffectiveLevel() <= logging.INFO:
            log.debug("A graphical representation of your orthography profile in a tree ('*' denotes sentinels):\n")
            printTree(self.root, "")
            print()