How to use refinem - 10 common examples

To help you get started, we’ve selected a few refinem examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dparks1134 / RefineM / refinem / main.py View on Github external
def filter_bins(self, options):
        """Filter bins command"""
        
        make_sure_path_exists(options.output_dir)
        
        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        outliers = Outliers()
        for genome_file in genome_files:
            gf = remove_extension(genome_file) + '.filtered.' + options.genome_ext
            out_genome = os.path.join(options.output_dir, gf)
            outliers.remove_outliers(genome_file, options.filter_file, out_genome, options.modified_only)

        self.logger.info('Modified genome written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / main.py View on Github external
sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
        if options.add or options.remove:
            failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                               options.scaffold_file,
                                                               options.add,
                                                               options.remove,
                                                               options.output_genome)
        elif options.outlier_file:
            outliers = Outliers()
            outliers.remove_outliers(options.genome_file, 
                                        options.outlier_file, 
                                        options.output_genome, 
                                        False)
        elif options.compatible_file:
            outliers = Outliers()
            if options.unique_only:
                outliers.add_compatible_unique(options.scaffold_file, 
                                                options.genome_file, 
                                                options.compatible_file,
                                                options.min_len, 
                                                options.output_genome)
            elif options.closest_only:
                outliers.add_compatible_closest(options.scaffold_file, 
                                                    options.genome_file, 
                                                    options.compatible_file,
github dparks1134 / RefineM / refinem / tetranucleotide.py View on Github external
signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))

                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != self.canonical_order():
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(-1)
        except ParsingError:
github dparks1134 / RefineM / refinem / scaffold_stats.py View on Github external
"""Read statistics for scaffolds.

        Parameters
        ----------
        stats_file : str
            File with statistics for individual scaffolds.
        """

        try:
            sig = {}
            self.genome_ids = set()
            with open(stats_file) as f:
                header = f.readline().split('\t')

                if 'AAAA' not in header:
                    raise ParsingError("[Error] Statistics file is missing tetranucleotide signature data: %s" % stats_file)

                tetra_index = header.index('AAAA')
                self.signature_headers = [x.strip() for x in header[tetra_index:]]
                self.coverage_headers = [x.strip() for x in header[4:tetra_index]]

                self.scaffolds_in_genome = defaultdict(set)
                self.stats = {}
                for line in f:
                    line_split = line.split('\t')
                    scaffold_id = line_split[0]
                    genome_id = line_split[1]
                    gc = float(line_split[2])
                    scaffold_len = int(line_split[3])

                    coverage = []
                    for cov in line_split[4:tetra_index]:
github dparks1134 / RefineM / refinem / coverage.py View on Github external
for line in f:
                    line_split = line.split('\t')
                    scaffold_id = line_split[0]
                    scaffold_len = int(line_split[1])

                    length[scaffold_id] = scaffold_len

                    for i, cov in enumerate(line_split[2:]):
                        coverage[scaffold_id][bam_ids[i]] = float(cov)
        except IOError:
            self.logger.error('Failed to open signature file: %s' % coverage_file)
            sys.exit(-1)
        except:
            print(traceback.format_exc())
            print('')
            raise ParsingError("[Error] Failed to process coverage file: " + coverage_file)
            sys.exit(-1)

        return coverage, length
github dparks1134 / RefineM / refinem / scaffold_stats.py View on Github external
coverage.append(float(cov))

                    signature = []
                    for freq in line_split[tetra_index:]:
                        signature.append(float(freq))

                    self.stats[scaffold_id] = self.ScaffoldStats(genome_id, gc, scaffold_len, coverage, signature)

                    if genome_id != self.unbinned:
                        self.scaffolds_in_genome[genome_id].add(scaffold_id)

            return sig
        except IOError:
            print('[Error] Failed to open scaffold statistics file: %s' % stats_file)
            sys.exit(-1)
        except ParsingError:
            sys.exit(-1)
github dparks1134 / RefineM / refinem / tetranucleotide.py View on Github external
Count of each kmer.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly {:,} tetranucleotide columns.".format(len(self.canonical_order())))

                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != self.canonical_order():
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(-1)
        except ParsingError:
            sys.exit(-1)

github dparks1134 / RefineM / refinem / main.py View on Github external
def split(self, options):
        """Split command"""
        
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(1)
        cluster.split(scaffold_stats,
                        options.criteria1,
                        options.criteria2,
                        options.genome_file,
                        options.output_dir)

        self.logger.info('Partitioned sequences written to: ' + options.output_dir)
github dparks1134 / RefineM / refinem / main.py View on Github external
def taxon_profile(self, options):
        """Call genes command"""

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        taxon_profile = TaxonProfile(options.cpus, options.output_dir)
        taxon_profile.run(gene_files,
                            options.scaffold_stats_file,
                            options.db_file,
                            options.taxonomy_file,
                            options.per_to_classify,
                            options.evalue,
                            options.per_identity,
                            options.per_aln_len,
                            options.tmpdir)

        self.logger.info('Results written to: %s' % options.output_dir)
github dparks1134 / RefineM / refinem / main.py View on Github external
def reference(self, options):
        """Reference command"""

        check_file_exists(options.scaffold_prot_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(ref_gene_files):
            self.logger.warning('All files must contain amino acid sequences.')
            sys.exit()

        reference = Reference(options.cpus, options.output_dir)
        reference_out = reference.run(options.scaffold_prot_file,
                                        options.scaffold_stats_file,
                                        ref_gene_files,
                                        options.db_file,
                                        options.evalue,
                                        options.per_identity,
                                        options.per_aln_len)

        self.logger.info('Results written to: ' + reference_out)