How to use the gtdbtk.biolib_lite.seq_io.read_fasta function in gtdbtk

To help you get started, we’ve selected a few gtdbtk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
make_sure_path_exists(out_root)
        result = {}

        if marker_set_id == 'bac120':
            out_pplacer = os.path.join(
                out_dir, PATH_BAC120_PPLACER_CLASS.format(prefix=prefix))
        elif marker_set_id == 'ar122':
            out_pplacer = os.path.join(
                out_dir, PATH_AR122_PPLACER_CLASS.format(prefix=prefix))
        else:
            self.logger.error('There was an error determining the marker set.')
            raise GenomeMarkerSetUnknown

        # We get the pplacer taxonomy for comparison
        with open(out_pplacer, 'w') as pplaceout:
            user_genome_ids = set(read_fasta(user_msa_file).keys())
            for leaf in tree.leaf_node_iter():
                if leaf.taxon.label in user_genome_ids:
                    taxa = []
                    cur_node = leaf
                    while cur_node.parent_node:
                        _support, taxon, _aux_info = parse_label(
                            cur_node.label)
                        if taxon:
                            for t in taxon.split(';')[::-1]:
                                taxa.append(t.strip())
                        cur_node = cur_node.parent_node
                    taxa_str = ';'.join(taxa[::-1])
                    pplaceout.write('{}\t{}\n'.format(
                        leaf.taxon.label, self.standardise_taxonomy(taxa_str, marker_set_id)))
                    result[leaf.taxon.label] = self.standardise_taxonomy(
                        taxa_str, marker_set_id)
github Ecogenomics / GTDBTk / gtdbtk / markers.py View on Github external
concatenated_file
            The path to the MSA.
        gtdb_taxonomy
            A dictionary mapping the accession to the 7 rank taxonomy.
        taxa_filter
            A comma separated list of taxa to include.
        outgroup_taxon
            If using an outgroup (de novo workflow), ensure this is retained.

        Returns
        -------
        Dict[str, str]
            The genome id to msa of those genomes specified in the filter.
        """

        msa = read_fasta(concatenated_file)
        msa_len = len(msa)
        self.logger.info(f'Read concatenated alignment for {msa_len:,} GTDB genomes.')

        if taxa_filter is not None:
            taxa_to_keep = set(taxa_filter.split(','))

            if outgroup_taxon not in taxa_to_keep and outgroup_taxon is not None:
                taxa_to_keep.add(outgroup_taxon)

            filtered_genomes = 0
            for genome_id, taxa in gtdb_taxonomy.items():
                common_taxa = taxa_to_keep.intersection(taxa)
                if len(common_taxa) == 0:
                    if genome_id in msa:
                        del msa[genome_id]
                        filtered_genomes += 1
github Ecogenomics / GTDBTk / scripts / verify_official_package.py View on Github external
with open(os.path.join(self.pack_dir, 'metadata', 'metadata.txt')) as metafile:
            for line in metafile:
                if line.startswith('VERSION_DATA'):
                    version = line.strip().split('=')[1]

        # List genomes in fastani folder
        list_genomes = [os.path.basename(x) for x in glob.glob(os.path.join(
            self.pack_dir, 'fastani', 'database/*.gz'))]
        list_genomes = [x.replace('_genomic.fna.gz', '').replace('GCA_', 'GB_GCA_').replace('GCF_', 'RS_GCF_') for x in
                        list_genomes]

        # Archaeal genome MSA is untrimmed
        ar_msa_file = glob.glob(os.path.join(
            self.pack_dir, 'msa/*ar122.faa'))[0]
        ar_msa = read_fasta(ar_msa_file)
        first_seq = ar_msa.get(list(ar_msa.keys())[0])
        if len(first_seq) != 32675:
            print('ERROR: len(first_seq) != 32675')

        # Bacterial genome MSA is untrimmed
        bac_msa_file = glob.glob(os.path.join(
            self.pack_dir, 'msa/*bac120.faa'))[0]
        bac_msa = read_fasta(bac_msa_file)
        first_seq = bac_msa.get(list(bac_msa.keys())[0])
        if len(first_seq) != 41155:
            print('ERROR: len(first_seq) != 41155')

        # Bacterial MASK is same length as the untrimmed bacterial genomes
        bac_mask_file = glob.glob(os.path.join(
            self.pack_dir, 'masks/*bac120.mask'))[0]
        bac_mask = ''
github Ecogenomics / GTDBTk / gtdbtk / trim_msa.py View on Github external
def run(self, msa_file, marker_list):
        """Randomly select a subset of columns from the MSA of each marker."""

        # read multiple sequence alignment
        self.logger.info('Reading multiple sequence alignment.')
        msa = read_fasta(msa_file, False)
        self.logger.info('Read MSA for %d genomes.' % len(msa))

        filtered_seqs, pruned_seqs = self.trim(msa, marker_list)

        self.logger.info('Removed %d taxa have amino acids in <%.1f%% of columns in filtered MSA.' % (
            len(pruned_seqs),
            self.min_perc_aa))

        # write out trimmed sequences
        with open(os.path.join(
            self.output_dir, "filtered_msa.faa"), 'w') as filter_file:
            for gid, seq in filtered_seqs.items():
                fasta_outstr = ">%s\n%s\n" % (gid, seq)
                filter_file.write(fasta_outstr)

        self.logger.info('Done.')
github Ecogenomics / GTDBTk / scripts / rename_UBAs / prepare_gtdbtk_package.py View on Github external
preserve_underscores=True)
            for n in tree.leaf_node_iter():
                if n.taxon.label.startswith("U_"):
                    subdict = uba_acc.get(n.taxon.label)
                    if "gca" in subdict.keys():
                        n.taxon.label = subdict.get("gca")
                    else:
                        n.taxon.label = subdict.get("uba")
            tree.write_to_path(os.path.join(dirout, dom, 'pplacer', dom + "_r" + release + ".tree"),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

            trimmed_seqout = open(os.path.join(
                dirout, dom, 'pplacer', 'trimmed_msa_' + dom + '.faa'), 'w')
            trimmed_fasta = read_fasta(seqfile)
            for gid, seq in trimmed_fasta.items():
                if gid in genomes_to_retain:
                    if gid.startswith("U_"):
                        subdict = uba_acc.get(gid)
                        if "gca" in subdict.keys():
                            trimmed_seqout.write(
                                ">{0}\n{1}\n".format(subdict.get("gca"), seq))
                        else:
                            trimmed_seqout.write(
                                ">{0}\n{1}\n".format(subdict.get("uba"), seq))
                    else:
                        trimmed_seqout.write(">{0}\n{1}\n".format(gid, seq))
            trimmed_seqout.close()

            logoutf = open(os.path.join(dirout, dom, 'pplacer',
                                        'fitting_' + dom + '.log'), 'w')
github Ecogenomics / GTDBTk / gtdbtk / classify.py View on Github external
# we run a fastani comparison for each user genomes against the
        # selected genomes in the same genus
        if len(fastani_verification) > 0:
            fastani = FastANI(cpus=self.cpus, force_single=True)
            self.logger.info(f'fastANI version: {fastani.version}')
            d_ani_compare, d_paths = self._get_fastani_genome_path(
                fastani_verification, genomes)
            all_fastani_dict = fastani.run(d_ani_compare, d_paths)

        classified_user_genomes, unclassified_user_genomes = self._sort_fastani_results(
            fastani_verification, pplacer_taxonomy_dict, all_fastani_dict, msa_dict, percent_multihit_dict,
            trans_table_dict, bac_ar_diff, summaryfout)

        self.logger.info('{0} genome(s) have been classified using FastANI and pplacer.'.format(
            len(classified_user_genomes)))
        user_genome_ids = set(read_fasta(user_msa_file).keys())
        # we remove ids already classified with FastANI
        user_genome_ids = user_genome_ids.difference(
            set(classified_user_genomes))
        for leaf in tree.leaf_node_iter():
            if leaf.taxon.label in user_genome_ids:
                # In some cases , pplacer can associate 2 user genomes
                # on the same parent node so we need to go up the tree
                # to find a node with a reference genome as leaf.
                cur_node = leaf.parent_node
                list_subnode = [subnd.taxon.label.replace(
                    "'", '') for subnd in cur_node.leaf_iter()]
                while len(set(list_subnode) & set(self.reference_ids)) < 1:
                    cur_node = cur_node.parent_node
                    list_subnode = [subnd.taxon.label.replace(
                        "'", '') for subnd in cur_node.leaf_iter()]
github Ecogenomics / GTDBTk / gtdbtk / misc.py View on Github external
"""
        if maskid == 'bac' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
        elif maskid == 'arc' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
        elif mask_type == 'file':
            mask = maskid
        else:
            self.logger.error('Command not understood.')
            raise GTDBTkException('Command not understood.')

        with open(mask, 'r') as f:
            maskstr = f.readline()

        with open(output_file, 'w') as outfwriter:
            dict_genomes = read_fasta(untrimmed_msa, False)

            for k, v in dict_genomes.items():
                aligned_seq = ''.join([v[i] for i in range(0, len(maskstr)) if maskstr[i] == '1'])
                fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
                outfwriter.write(fasta_outstr)
github Ecogenomics / GTDBTk / scripts / verify_official_package.py View on Github external
print('ERROR: len(bac_mask) != 41155')

        # Archaeal MASK is same length as the untrimmed archaeal genomes
        ar_mask_file = glob.glob(os.path.join(
            self.pack_dir, 'masks/*ar122.mask'))[0]
        ar_mask = ''
        with open(ar_mask_file) as amf:
            ar_mask = amf.readline()
        if len(ar_mask) != 32675:
            print('ERROR: len(ar_mask) != 32675')

        # Archaeal Pplacer MSA should have the same number of genomes as the
        # Archaeal untrimmed MSA
        ar_pplacer_msa_file = glob.glob(os.path.join(
            self.pack_dir, 'pplacer', 'gtdb_' + version + '_ar122.refpkg', 'ar122_msa_r95.faa'))[0]
        ar_pplacer_msa = read_fasta(ar_pplacer_msa_file)
        if len(ar_pplacer_msa) != len(ar_msa):
            print('ERROR: len(ar_pplacer_msa) != len(ar_msa)')
            print('len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)))
            print('len(ar_msa): {}'.format(len(ar_msa)))
            print('difference genomes: {}'.format(list(set(ar_msa.keys()).difference(set(ar_pplacer_msa.keys())))))
        first_seq = ar_pplacer_msa.get(list(ar_pplacer_msa.keys())[0])
        # Archaeal Pplacer MSA should have the same length as the Archaeal mask
        if len(first_seq) != len([a for a in ar_mask if a == '1']):
            print('ERROR: len(first_seq) != len([a for a in ar_mask if a ==1])')
            print('len(first_seq): {}'.format(len(first_seq)))
            print('len([a for a in ar_mask if a ==1]): {}'.format(len([a for a in ar_mask if a == '1'])))

        # Bacterial Pplacer MSA should have the same number of genomes as the
        # Bacterial untrimmed MSA
        bac_pplacer_msa_file = os.path.join(
            self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg', 'bac120_msa_r95.faa')
github Ecogenomics / GTDBTk / scripts / verify_official_package.py View on Github external
self.pack_dir, 'fastani', 'database/*.gz'))]
        list_genomes = [x.replace('_genomic.fna.gz', '').replace('GCA_', 'GB_GCA_').replace('GCF_', 'RS_GCF_') for x in
                        list_genomes]

        # Archaeal genome MSA is untrimmed
        ar_msa_file = glob.glob(os.path.join(
            self.pack_dir, 'msa/*ar122.faa'))[0]
        ar_msa = read_fasta(ar_msa_file)
        first_seq = ar_msa.get(list(ar_msa.keys())[0])
        if len(first_seq) != 32675:
            print('ERROR: len(first_seq) != 32675')

        # Bacterial genome MSA is untrimmed
        bac_msa_file = glob.glob(os.path.join(
            self.pack_dir, 'msa/*bac120.faa'))[0]
        bac_msa = read_fasta(bac_msa_file)
        first_seq = bac_msa.get(list(bac_msa.keys())[0])
        if len(first_seq) != 41155:
            print('ERROR: len(first_seq) != 41155')

        # Bacterial MASK is same length as the untrimmed bacterial genomes
        bac_mask_file = glob.glob(os.path.join(
            self.pack_dir, 'masks/*bac120.mask'))[0]
        bac_mask = ''
        with open(bac_mask_file) as bmf:
            bac_mask = bmf.readline()
        if len(bac_mask) != 41155:
            print('ERROR: len(bac_mask) != 41155')

        # Archaeal MASK is same length as the untrimmed archaeal genomes
        ar_mask_file = glob.glob(os.path.join(
            self.pack_dir, 'masks/*ar122.mask'))[0]