How to use the biolib.common.find_nearest function in biolib

To help you get started, we’ve selected a few examples of biolib.common.find_nearest, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
cov_corr,
                        cov_perc):

        genomic_signature = GenomicSignature(0)
        
        # make sure distributions have been loaded
        self.read_distributions()
        
        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
        sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
        d = self.gc_dist[closest_gc][sample_seq_len]
        gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

        td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
        
        outlying_stats = {}
        outlying_dists = defaultdict(list)
        for scaffold_id in scaffold_ids:
            base_scaffold_id = scaffold_id
            if '-#' in scaffold_id:
                base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
            stats = scaffold_stats.stats[base_scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
            gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
Example from dparks1134/RefineM, file refinem/plots/gc_plots.py (view on GitHub):
link_scaffold_ids,
                                                             xlabel, ylabel)

        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # draw vertical line at x=0
        axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)

        # plot reference distributions
        closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
        for percentile in percentiles_to_plot:
            # find closest distribution values
            temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
            d = gc_dist[closest_gc][temp_scaffold_len]
            gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
            gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)

            xL = []
            xU = []
            y = []
            for window_size in gc_dist[closest_gc]:
                xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
                xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            xL = np.array(xL)[sort_indexY]
            xU = np.array(xU)[sort_indexY]
            y = np.array(y)[sort_indexY]
            axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
Example from dparks1134/RefineM, file refinem/plots/gc_plots.py (view on GitHub):
pts = self.data_pts(genome_scaffold_stats, mean_gc)

        scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
                                                             pts,
                                                             highlight_scaffold_ids,
                                                             link_scaffold_ids,
                                                             xlabel, ylabel)

        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # draw vertical line at x=0
        axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)

        # plot reference distributions
        closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
        for percentile in percentiles_to_plot:
            # find closest distribution values
            temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
            d = gc_dist[closest_gc][temp_scaffold_len]
            gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
            gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)

            xL = []
            xU = []
            y = []
            for window_size in gc_dist[closest_gc]:
                xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
                xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
                y.append(window_size / 1000.0)

            # sort by y-values
Example from dparks1134/RefineM, file refinem/plots/gc_plots.py (view on GitHub):
xlabel, ylabel)

        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # draw vertical line at x=0
        axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)

        # plot reference distributions
        closest_gc = find_nearest(list(gc_dist.keys()), mean_gc / 100)
        for percentile in percentiles_to_plot:
            # find closest distribution values
            temp_scaffold_len = list(gc_dist[closest_gc].keys())[0]
            d = gc_dist[closest_gc][temp_scaffold_len]
            gc_lower_bound_key = find_nearest(list(d.keys()), (100 - percentile) / 2.0)
            gc_upper_bound_key = find_nearest(list(d.keys()), (100 + percentile) / 2.0)

            xL = []
            xU = []
            y = []
            for window_size in gc_dist[closest_gc]:
                xL.append(gc_dist[closest_gc][window_size][gc_lower_bound_key] * 100)
                xU.append(gc_dist[closest_gc][window_size][gc_upper_bound_key] * 100)
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            xL = np.array(xL)[sort_indexY]
            xU = np.array(xU)[sort_indexY]
            y = np.array(y)[sort_indexY]
            axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
            axes_scatter.plot(xU, y, 'r--', lw=1.0, zorder=0)
Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
processed_scaffolds,
                                    len(scaffold_stats.stats),
                                    processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
                sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.items():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (ss.gc - gs.median_gc) / 100.0
                delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)
Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
len(scaffold_stats.stats),
                                    processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
                sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.items():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (ss.gc - gs.median_gc) / 100.0
                delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                # determine if scaffold compatible
Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
genomic_signature = GenomicSignature(0)
        
        # make sure distributions have been loaded
        self.read_distributions()
        
        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
        sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
        d = self.gc_dist[closest_gc][sample_seq_len]
        gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

        td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
        
        outlying_stats = {}
        outlying_dists = defaultdict(list)
        for scaffold_id in scaffold_ids:
            base_scaffold_id = scaffold_id
            if '-#' in scaffold_id:
                base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
            stats = scaffold_stats.stats[base_scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
            gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

            closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
            td_bound = self.td_dist[closest_seq_len][td_bound_key]
Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
processed_scaffolds += 1
            if not self.logger.is_silent:
                sys.stdout.write('  Processed {:,} of {:,} ({:.1f}%) scaffolds.\r'.format(
                                    processed_scaffolds,
                                    len(scaffold_stats.stats),
                                    processed_scaffolds * 100.0 / len(scaffold_stats.stats)))
                sys.stdout.flush()

            if scaffold_id not in scaffolds_of_interest:
                continue

            for genome_id, gs in genome_stats.items():
                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                # find GC and TD bounds
                closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
Example from dparks1134/RefineM, file refinem/outliers.py (view on GitHub):
cov_perc):

        genomic_signature = GenomicSignature(0)
        
        # make sure distributions have been loaded
        self.read_distributions()
        
        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
        sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
        d = self.gc_dist[closest_gc][sample_seq_len]
        gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

        td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
        
        outlying_stats = {}
        outlying_dists = defaultdict(list)
        for scaffold_id in scaffold_ids:
            base_scaffold_id = scaffold_id
            if '-#' in scaffold_id:
                base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
            stats = scaffold_stats.stats[base_scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
            gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]
Example from dparks1134/RefineM, file refinem/plots/td_plots.py (view on GitHub):
pts = self.data_pts(genome_scaffold_stats, mean_signature)
            
        scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
                                                             pts,
                                                             highlight_scaffold_ids,
                                                             link_scaffold_ids,
                                                             xlabel, ylabel)

        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # plot reference distributions
        for percentile in percentiles_to_plot:
            # find closest distribution values
            first_key = list(td_dist.keys())[0]
            td_bound_key = find_nearest(list(td_dist[first_key].keys()), percentile)

            x = []
            y = []
            for window_size in td_dist:
                x.append(td_dist[window_size][td_bound_key])
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            x = np.array(x)[sort_indexY]
            y = np.array(y)[sort_indexY]

            # make sure x-values are strictly decreasing as y increases
            # as this is conservative and visually satisfying
            for i in range(0, len(x) - 1):
                for j in range(i + 1, len(x)):