How to use the medaka.common.Region function in medaka

To help you get started, we’ve selected a few medaka examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nanoporetech / medaka / medaka / features.py View on Github external
def _quarantine_sample(self, sample):
        """Add sample name and pileup width to a list."""
        # Note: the below assumes we haven't split a pileup on minor positions.
        # This should be the case: chunking on minor positions only occurs for
        # larger regions.
        start, _ = sample.first_pos
        end, _ = sample.last_pos
        end += 1  # end exclusive
        self._quarantined.append((
            medaka.common.Region(sample.ref_name, start, end), sample.size
        ))
github nanoporetech / medaka / medaka / datastore.py View on Github external
if samples is not None:
            # yield samples in the order they are asked for
            for sample, fname in samples:
                yield DataStore(fname).load_sample(sample)
        else:
            all_samples = self.index
            if regions is None:
                regions = [
                    medaka.common.Region.from_string(x)
                    for x in sorted(all_samples)]
            for reg in regions:
                if reg.ref_name not in self.index:
                    continue
                for sample in self.index[reg.ref_name]:
                    # samples can have major.minor coords, round to end excl.
                    sam_reg = medaka.common.Region(
                        sample['ref_name'],
                        int(float(sample['start'])),
                        int(float(sample['end'])) + 1)
                    if sam_reg.overlaps(reg):
                        with DataStore(sample['filename']) as store:
                            yield store.load_sample(sample['sample_key'])
github nanoporetech / medaka / medaka / compress.py View on Github external
:returns: list of `Region` objects.
    """
    with pysam.AlignmentFile(bam) as bam_fh:
        ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
    if region_strs is not None:
        if os.path.isfile(region_strs[0]):
            with open(region_strs[0]) as fh:
                region_strs = [ l.strip() for l in fh.readlines()]

        regions = []
        for r in parse_regions(region_strs):
            start = r.start if r.start is not None else 0
            end = r.end if r.end is not None else ref_lengths[r.ref_name]
            regions.append(Region(r.ref_name, start, end))
    else:
        regions = [Region(ref_name, 0, end) for ref_name, end in ref_lengths.items()]

    return regions
github nanoporetech / medaka / medaka / compress.py View on Github external
"""

        # filter truth alignments to restrict ourselves to regions of the ref where the truth
        # in unambiguous
        alignments = TruthAlignment.bam_to_alignments(truth_bam, region.ref_name, start=region.start, end=region.end)
        filtered_alignments = TruthAlignment.filter_alignments(alignments, start=region.start, end=region.end)
        if len(filtered_alignments) == 0:
            logging.info("Filtering removed all alignments of truth to ref from {}.".format(region))

        samples = []
        for aln in filtered_alignments:
            mock_compr = self.max_hp_len > 1 and not self.is_compressed
            truth_chunk = aln.get_positions_and_labels(ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
                                                       is_compressed=self.is_compressed, rle_dtype=True)
            sample = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle_fq, read_fraction=read_fraction)
            # Create labels according to positions in pileup
            pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 else encoding[_gap_]
            padder = itertools.repeat(pad)
            position_to_label = defaultdict(padder.__next__,
                                            zip([tuple(p) for p in truth_chunk.positions],
                                                [a for a in truth_chunk.reads[0]]))
            padded_labels = np.fromiter((position_to_label[tuple(p)] for p in sample.positions),
                                            dtype=truth_chunk.reads[0].dtype, count=len(sample.positions))

            sample = sample._asdict()
            sample['labels'] = padded_labels
            samples.append(Sample(**sample))
        return tuple(samples)
github nanoporetech / medaka / medaka / common.py View on Github external
"""
    with pysam.AlignmentFile(bam) as bam_fh:
        ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
    if region_strs is not None:
        if os.path.isfile(region_strs[0]):
            with open(region_strs[0]) as fh:
                region_strs = [line.strip() for line in fh.readlines()]

        regions = []
        for r in (Region.from_string(x) for x in region_strs):
            start = r.start if r.start is not None else 0
            end = r.end if r.end is not None else ref_lengths[r.ref_name]
            regions.append(Region(r.ref_name, start, end))
    else:
        regions = [
            Region(ref_name, 0, end)
            for ref_name, end in ref_lengths.items()]

    return regions
github nanoporetech / medaka / medaka / vcf.py View on Github external
def get_homo_regions(ref_name, pos, min_len=1000):
        regions = []
        hetero_gaps = np.ediff1d(pos)
        sort_inds = np.argsort(hetero_gaps)[::-1]
        homo_len = 0
        for i in sort_inds:
            if hetero_gaps[i] < min_len:
                break
            homo_len += hetero_gaps[i]
            start = pos[i]
            end = start + hetero_gaps[i]
            regions.append(medaka.common.Region(ref_name, start, end))
        return regions, homo_len
github nanoporetech / medaka / medaka / compress.py View on Github external
if args.model is not None:
        fe_kwargs = load_yaml_data(args.model, _feature_opt_path_)
        print(fe_kwargs)
    else:
        fe_kwargs = { k:v.default for (k,v) in inspect.signature(FeatureEncoder.__init__).parameters.items()
                      if v.default is not inspect.Parameter.empty}
    opt_str = '\n'.join(['{}: {}'.format(k,v) for k, v in fe_kwargs.items()])
    logging.info('FeatureEncoder options: \n{}'.format(opt_str))

    fe = FeatureEncoder(**fe_kwargs)

    reg_str = '\n'.join(['\t\t\t{ref_name}:{start}-{end}'.format(**r._asdict()) for r in regions])
    logging.info('Got regions:\n{}'.format(reg_str))

    chunked_regions = [Region(region.ref_name, start, end)
                       for region in regions
                       for start, end in segment_limits(region.start, region.end,
                                                        segment_len=10*args.chunk_len,
                                                        overlap_len=5*args.chunk_ovlp)]

    unique_refs = set((region.ref_name for region in regions))
    ref_rles = {r: fe.process_ref_seq(r, args.ref_fastq) for r in unique_refs}

    rg = partial(fe.bams_to_training_samples, args.truth, args.bam, read_fraction=args.read_fraction)
    chunker = partial(chunk_samples, chunk_len=args.chunk_len, overlap=args.chunk_ovlp)

    label_encoding, label_decoding = get_label_encoding(args.max_label_len)
    s2xy = partial(sample_to_x_y, encoding=label_encoding, max_label_len=args.max_label_len)

    f = partial(_process_region, rg, alphabet_filter, chunker, s2xy)