How to use the biolib.common.make_sure_path_exists function in biolib

To help you get started, we’ve selected a few biolib examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / main.py View on Github external
def split(self, options):
        """Split command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(1)
        cluster.split(scaffold_stats,
                        options.criteria1,
                        options.criteria2,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / CompareM / comparem / classify.py View on Github external
keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.
        """
        
        # read taxonomic identification of each genome
        taxonomy = {}
        if taxonomy_file:
            for line in open(taxonomy_file):
                genome_id, taxa_str = line.rstrip().split('\t')
                taxonomy[genome_id] = taxa_str

        # calculate AAI between query and target genomes
        aai_output_dir = os.path.join(output_dir, 'aai')
        make_sure_path_exists(aai_output_dir)
        aai_calculator = AAICalculator(self.cpus)
        aai_output_file, rbh_output_file = aai_calculator.run(query_gene_file,
                                                                target_gene_file,
                                                                sorted_hit_table,
                                                                evalue_threshold,
                                                                per_iden_threshold,
                                                                per_aln_len_threshold,
                                                                keep_rbhs,
                                                                aai_output_dir)

        # determine matches to each query genomes
        aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
        with open(aai_results_file) as f:
            f.readline()
            
            hits = defaultdict(list)
github dparks1134 / RefineM / refinem / ssu.py View on Github external
Concatenate hits within the specified number of base pairs.
        output_dir : str
            Output directory.

        Returns
        -------
        dict : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        """
        
        self.logger.info('Identifying SSU rRNA genes.')
        best_hits = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)
            make_sure_path_exists(genome_dir)
            
            # identify 16S reads from contigs/scaffolds
            self._hmm_search(genome_file, evalue_threshold, genome_dir)

            # read HMM hits
            hits_per_domain = {}
            for domain in ['archaea', 'bacteria', 'euk']:
                seq_info = self._read_hits(os.path.join(genome_dir, 'ssu' + '.hmm_' + domain + '.txt'), domain, evalue_threshold)

                hits = {}
                if len(seq_info) > 0:
                    for seq_id, seq_hits in seq_info.items():
                        for hit in seq_hits:
                            self._add_hit(hits, seq_id, hit, concatenate_threshold)

                hits_per_domain[domain] = hits
github dparks1134 / RefineM / refinem / main.py View on Github external
def modify_bin(self, options):
        """Modify bin command"""

        make_sure_path_exists(os.path.dirname(options.output_genome))

        options.compatible_file = None #*** Removed with Python 3 port       
        if not (options.add or options.remove or options.outlier_file or options.compatible_file):
            self.logger.warning('No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
            self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
github dparks1134 / CompareM / comparem / aai_calculator.py View on Github external
per_identity_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        output_dir : str
            Directory to store AAI results.
        """

        self.blast_dir = blast_dir

        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.output_dir = output_dir

        shared_genes_dir = os.path.join(output_dir, self.shared_genes)
        make_sure_path_exists(shared_genes_dir)
        self.shared_genes_dir = shared_genes_dir

        # calculate length of genes and number of genes each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths  = {}
        genes_in_genomes = defaultdict(int)
        for seq_id, seq in seq_io.read_fasta_seq(os.path.join(self.blast_dir, self.gene_file)):
            if seq[-1] == '*':
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)
                
            genome_id = seq_id[0:seq_id.find('~')]
            genes_in_genomes[genome_id] += 1

        # get byte offset of hits from each genome
github dparks1134 / RefineM / refinem / deprecated / taxonomic_profile.py View on Github external
diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.iteritems():
            seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.iteritems():
            seq_assignments = profile.classify_seqs(taxonomy)
github Ecogenomics / GTDBTk / src / gtdbtk / gtdbtk.py View on Github external
def infer(self, options):
        """Infer tree from MSA."""
        
        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)
        
        if (options.cpus > 1):
            check_dependencies(['FastTreeMP'])
        else:
            check_dependencies(['FastTree'])
        
        self.logger.info('Inferring tree with FastTree using %s+GAMMA.' % options.prot_model)
        fasttree = FastTree(multithreaded=(options.cpus > 1))

        tree_unrooted_output = os.path.join(options.out_dir, options.prefix + '.unrooted.tree')
        tree_log = os.path.join(options.out_dir, options.prefix + '.tree.log')
        tree_output_log = os.path.join(options.out_dir, 'fasttree.log')
        fasttree.run(options.msa_file, 
                        'prot', 
                        options.prot_model, 
                        tree_unrooted_output,
github dparks1134 / RefineM / refinem / main.py View on Github external
genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                                      options.gc_perc, options.td_perc,
                                      options.cov_corr, options.cov_perc,
                                      options.report_type, outlier_file)
        self.logger.info('Outlier information written to: ' + outlier_file)

        # create outlier plots
        if options.create_plots:
            plot_dir = os.path.join(options.output_dir, 'plots')
            make_sure_path_exists(plot_dir)
        
            outliers.plot(scaffold_stats,
                            genome_stats,
                            outliers.gc_dist,
                            outliers.td_dist,
                            options, 
                            options.highlight_file,
                            options.links_file,
                            options.individual_plots,
                            plot_dir)
            
            self.logger.info('Outlier plots written to: ' + plot_dir)
github Ecogenomics / GTDBTk / src / gtdbtk / gtdbtk.py View on Github external
def identify(self, options):
        """Identify marker genes in genomes."""
        
        try:
        
            if options.genome_dir:
                check_dir_exists(options.genome_dir)
                
            if options.batchfile:
                check_file_exists(options.batchfile)
                
            make_sure_path_exists(options.out_dir)
                
            markers = Markers(options.cpus)
            markers.identify(options.genome_dir,
                                options.batchfile,
                                options.proteins,
                                options.out_dir, 
                                options.prefix)
                                
            self.logger.info('Done.')
        
        except Exception as e:
            self.logger.info('GTDB-Tk has encountered an error.')