How to use biolib - 10 common examples

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / main.py View on Github external
def split(self, options):
        """Split command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(1)
        cluster.split(scaffold_stats,
                        options.criteria1,
                        options.criteria2,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / main.py View on Github external
def taxon_profile(self, options):
        """Call genes command"""

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        taxon_profile = TaxonProfile(options.cpus, options.output_dir)
        taxon_profile.run(gene_files,
                            options.scaffold_stats_file,
                            options.db_file,
                            options.taxonomy_file,
                            options.per_to_classify,
                            options.evalue,
github dparks1134 / RefineM / refinem / main.py View on Github external
def manual(self, options):
        """Manual command"""
        
        check_file_exists(options.cluster_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)
        
        genome_id = remove_extension(options.genome_file)

        seqs = seq_io.read(options.genome_file)
        fout = {}
        with open(options.cluster_file) as f:
            f.readline()
            
            for line in f:
                line_split = line.rstrip().split('\t')
                scaffold_id = line_split[0]
                cluster_id = int(line_split[1])
                
                if cluster_id < 0:
github dparks1134 / RefineM / refinem / main.py View on Github external
def taxon_profile(self, options):
        """Call genes command"""

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        taxon_profile = TaxonProfile(options.cpus, options.output_dir)
        taxon_profile.run(gene_files,
                            options.scaffold_stats_file,
                            options.db_file,
                            options.taxonomy_file,
                            options.per_to_classify,
                            options.evalue,
                            options.per_identity,
                            options.per_aln_len,
github dparks1134 / RefineM / refinem / main.py View on Github external
def split(self, options):
        """Split command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(1)
        cluster.split(scaffold_stats,
                        options.criteria1,
                        options.criteria2,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / CompareM / comparem / classify.py View on Github external
keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.
        """
        
        # read taxonomic identification of each genome
        taxonomy = {}
        if taxonomy_file:
            for line in open(taxonomy_file):
                genome_id, taxa_str = line.rstrip().split('\t')
                taxonomy[genome_id] = taxa_str

        # calculate AAI between query and target genomes
        aai_output_dir = os.path.join(output_dir, 'aai')
        make_sure_path_exists(aai_output_dir)
        aai_calculator = AAICalculator(self.cpus)
        aai_output_file, rbh_output_file = aai_calculator.run(query_gene_file,
                                                                target_gene_file,
                                                                sorted_hit_table,
                                                                evalue_threshold,
                                                                per_iden_threshold,
                                                                per_aln_len_threshold,
                                                                keep_rbhs,
                                                                aai_output_dir)

        # determine matches to each query genomes
        aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
        with open(aai_results_file) as f:
            f.readline()
            
            hits = defaultdict(list)
github dparks1134 / RefineM / refinem / ssu.py View on Github external
Concatenate hits within the specified number of base pairs.
        output_dir : str
            Output directory.

        Returns
        -------
        dict : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        """
        
        self.logger.info('Identifying SSU rRNA genes.')
        best_hits = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)
            make_sure_path_exists(genome_dir)
            
            # identify 16S reads from contigs/scaffolds
            self._hmm_search(genome_file, evalue_threshold, genome_dir)

            # read HMM hits
            hits_per_domain = {}
            for domain in ['archaea', 'bacteria', 'euk']:
                seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)

                hits = {}
                if len(seq_info) > 0:
                    for seq_id, seq_hits in seq_info.items():
                        for hit in seq_hits:
                            self._add_hit(hits, seq_id, hit, concatenate_threshold)

                hits_per_domain[domain] = hits
github dparks1134 / RefineM / refinem / main.py View on Github external
def modify_bin(self, options):
        """Modify bin command"""

        make_sure_path_exists(os.path.dirname(options.output_genome))

        options.compatible_file = None #*** Removed with Python 3 port       
        if not (options.add or options.remove or options.outlier_file or options.compatible_file):
            self.logger.warning('No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
            self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
github dparks1134 / RefineM / scripts / make_database.py View on Github external
continue

            line_split = line.split('\t')
            scaffold_id = line_split[0]
            info = line_split[8]
            if info != '':  # this will be empty for non-protein coding genes
                gene_id = info.split(';')[0].replace('ID=', '')

                gene_number[scaffold_id] += 1
                gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

        fout = open(os.path.join(output_dir, genome_id + '.faa'), 'w')
        for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):

            annotation = annotation[annotation.find(' ') + 1:]  # remove additional gene id from annotation
            annotation += ' [IMG Gene ID: ' + gene_id + ']'  # append IMG gene id for future reference

            fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
            fout.write(seq + '\n')
        fout.close()
github dparks1134 / RefineM / refinem / outliers.py View on Github external
Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = set()
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()