How to use the biolib.common.remove_extension function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / outliers.py View on Github external
Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = set()
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
t = Taxonomy()
        taxonomy = t.read(taxonomy_file)
        if not t.validate(taxonomy, 
                            check_prefixes=True, 
                            check_ranks=True, 
                            check_hierarchy=False, 
                            check_species=False,
                            check_group_names=False,
                            check_duplicate_names=False, 
                            report_errors=True):
            self.logger.error('Invalid taxonomy file.')
            sys.exit(-1)
            
        # record length and number of genes in each scaffold
        for aa_file in gene_files:
            genome_id = remove_extension(aa_file)
            self.profiles[genome_id] = Profile(genome_id, percent_to_classify, taxonomy)

            for seq_id, seq in seq_io.read_seq(aa_file):
                scaffold_id = seq_id[0:seq_id.rfind('_')]
                self.profiles[genome_id].genes_in_scaffold[scaffold_id] += 1
                self.profiles[genome_id].coding_bases[scaffold_id] += len(seq) * 3  # length in nucleotide space

        # run diamond and create taxonomic profile for each genome
        self.logger.info('Running DIAMOND blastp with {:,} processes (be patient!)'.format(self.cpus))

        diamond = Diamond(self.cpus)
        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        if not os.path.exists(diamond_table_out):
            diamond.blastp(gene_file, 
                            db_file, 
                            evalue,
github dparks1134 / RefineM / refinem / ssu.py View on Github external
E-value threshold for defining valid hits.
        concatenate_threshold : int
            Concatenate hits within the specified number of base pairs.
        output_dir : str
            Output directory.

        Returns
        -------
        dict : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        """
        
        self.logger.info('Identifying SSU rRNA genes.')
        best_hits = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)
            make_sure_path_exists(genome_dir)
            
            # identify 16S reads from contigs/scaffolds
            self._hmm_search(genome_file, evalue_threshold, genome_dir)

            # read HMM hits
            hits_per_domain = {}
            for domain in ['archaea', 'bacteria', 'euk']:
                seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)

                hits = {}
                if len(seq_info) > 0:
                    for seq_id, seq_hits in seq_info.items():
                        for hit in seq_hits:
                            self._add_hit(hits, seq_id, hit, concatenate_threshold)
github dparks1134 / RefineM / refinem / main.py View on Github external
def manual(self, options):
        """Manual command"""
        
        check_file_exists(options.cluster_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)
        
        genome_id = remove_extension(options.genome_file)

        seqs = seq_io.read(options.genome_file)
        fout = {}
        with open(options.cluster_file) as f:
            f.readline()
            
            for line in f:
                line_split = line.rstrip().split('\t')
                scaffold_id = line_split[0]
                cluster_id = int(line_split[1])
                
                if cluster_id < 0:
                    # negative values indicate scaffolds that should
                    # not be placed in a cluster
                    continue
github dparks1134 / RefineM / refinem / main.py View on Github external
def filter_bins(self, options):
        """Filter bins command"""
        
        make_sure_path_exists(options.output_dir)
        
        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        outliers = Outliers()
        for genome_file in genome_files:
            gf = remove_extension(genome_file) + '.filtered.' + options.genome_ext
            out_genome = os.path.join(options.output_dir, gf)
            outliers.remove_outliers(genome_file, options.filter_file, out_genome, options.modified_only)

        self.logger.info('Modified genome written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / taxon_profile.py View on Github external
for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(Taxonomy.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(taxa)] += profile.genes_in_scaffold[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir, 'gene_profiles.scaffolds.html')
        krona.create(krona_profiles, krona_output_file)

        # create Krona plot based on best hit of each gene
        krona_profiles = defaultdict(lambda: defaultdict(int))

        for aa_file in gene_files:
            genome_id = remove_extension(aa_file)

            profile = self.profiles[genome_id]
            for gene_id, _seq in seq_io.read_seq(aa_file):

                taxa_str = Taxonomy.unclassified_taxon
                if gene_id in profile.gene_hits:
                    taxa_str, _hit_info = profile.gene_hits[gene_id]

                krona_profiles[genome_id][taxa_str] += 1

        krona_output_file = os.path.join(self.output_dir, 'gene_profiles.genes.html')
        krona.create(krona_profiles, krona_output_file)
github dparks1134 / RefineM / scripts / make_database.py View on Github external
~

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : float
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()
github dparks1134 / CompareM / comparem / similarity_search.py View on Github external
def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
        """Prefix all gene IDs with genome IDs: ~.
        
        Parameters
        ----------
        gene_files : list of str
            Genes in fasta files to modify.
        keep_headers : boolean
            If True, indicates FASTA headers already have the format ~.
        output_file : str
            Name of FASTA file to contain modified genes.
        """
        
        fout = open(output_file, 'w')
        for gf in gene_files:           
            genome_id = remove_extension(gf)
            if genome_id.endswith('_genes'):
                genome_id = genome_id[0:genome_id.rfind('_genes')]
                
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                if keep_headers:
                    fout.write('>' + seq_id  + ' ' + annotation + '\n')
                else:
                    fout.write('>' + genome_id + '~' + seq_id  + ' ' + annotation + '\n')
                fout.write(seq + '\n')
        fout.close()
github dparks1134 / CompareM / comparem / reciprocal_blast.py View on Github external
def _producer_blast(self, genome_pair):
        """Apply reciprocal blast to a pair of genomes.

        Parameters
        ----------
        genome_pair : list
            Identifier of genomes to process.
        """

        blast = Blast(cpus=self.producer_cpus)

        aa_gene_fileA, aa_gene_fileB = genome_pair

        genome_idA = remove_extension(aa_gene_fileA)
        genome_idB = remove_extension(aa_gene_fileB)

        dbA = os.path.join(self.output_dir, genome_idA + '.db')
        dbB = os.path.join(self.output_dir, genome_idB + '.db')

        output_fileAB = os.path.join(self.output_dir, genome_idA + '-' + genome_idB + '.blastp.tsv')
        blast.blastp(aa_gene_fileA, dbB, output_fileAB, self.evalue)

        output_fileBA = os.path.join(self.output_dir, genome_idB + '-' + genome_idA + '.blastp.tsv')
        blast.blastp(aa_gene_fileB, dbA, output_fileBA, self.evalue)

        return True