How to use the medaka.common.Region.from_string function in medaka

To help you get started, we’ve selected a few medaka examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nanoporetech / medaka / medaka / stitch.py View on Github external
def stitch(args):
    """Entry point for stitching program."""
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = sorted(index.index)
    # batch size is a simple empirical heuristic
    regions = medaka.common.grouper(
        (common.Region.from_string(r) for r in args.regions),
        batch_size=max(1, len(args.regions) // (2 * args.jobs)))

    with open(args.output, 'w') as fasta:
        Executor = concurrent.futures.ProcessPoolExecutor
        with Executor(max_workers=args.jobs) as executor:
            worker = functools.partial(stitch_from_probs, args.inputs)
            for contigs in executor.map(worker, regions):
                for name, info, seq in contigs:
                    fasta.write('>{} {}\n{}\n'.format(name, info, seq))
github nanoporetech / medaka / medaka / datastore.py View on Github external
:regions: list of `medaka.common.Region` s for which to yield samples.
        :samples: iterable of sample names to yield (in order in which they
            are supplied).

        :yields: `medaka.common.Sample` objects.

        """
        if samples is not None:
            # yield samples in the order they are asked for
            for sample, fname in samples:
                yield DataStore(fname).load_sample(sample)
        else:
            all_samples = self.index
            if regions is None:
                regions = [
                    medaka.common.Region.from_string(x)
                    for x in sorted(all_samples)]
            for reg in regions:
                if reg.ref_name not in self.index:
                    continue
                for sample in self.index[reg.ref_name]:
                    # samples can have major.minor coords, round to end excl.
                    sam_reg = medaka.common.Region(
                        sample['ref_name'],
                        int(float(sample['start'])),
                        int(float(sample['end'])) + 1)
                    if sam_reg.overlaps(reg):
                        with DataStore(sample['filename']) as store:
                            yield store.load_sample(sample['sample_key'])
github nanoporetech / medaka / medaka / variant.py View on Github external
A `LabelScheme` read from file is used to decode SNPs. All `LabelScheme` s
    define a `decode_snps` public method. We do not need to use `join_samples`
    to look for variants overlapping sample slice boundaries because we only
    analyse a single locus at a time. This means that `LabelScheme` s that do
    not define the `decode_consensus` method (called within `join_samples`) can
    be used.

    """
    logger = medaka.common.get_named_logger('SNPs')

    index = medaka.datastore.DataIndex(args.inputs)

    if args.regions is None:
        args.regions = sorted(index.index)
    regions = [
        medaka.common.Region.from_string(r)
        for r in args.regions]

    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug(
            "Could not find `label_scheme` metadata in input file, "
            "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()

    logger.debug("Label decoding is:\n{}".format(
        '\n'.join('{}: {}'.format(k, v)
                  for k, v in label_scheme._decoding.items())))

    meta_info = label_scheme.snp_metainfo
github nanoporetech / medaka / medaka / medaka_counts.py View on Github external
def main():
    # Entry point for testing/checking
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
    np.set_printoptions(precision=4, linewidth=100)


    parser = argparse.ArgumentParser('medaka', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='alignment file.')
    parser.add_argument('region', help='alignment region to sample.')
    parser.add_argument('--print', action='store_true', help='print counts.')
    parser.add_argument('--dtypes', nargs='+', help='perform a multi-datatype tests.')
    parser.add_argument('--norm', nargs='+', help='additional normalisation tests. (total, fwd_rev)')
    args = parser.parse_args()
    region = medaka.common.Region.from_string(args.region)
    kwargs={}

    def _print(samples):
       if args.print:
           for p, f in zip(samples.positions, samples.features):
               print('{}\t{}\t0\t{}\t{}'.format(p[0], p[1], '\t'.join('{:.3f}'.format(x) if x>0.0 else '-' for x in f), sum(f)))

    dtype_options = [('',)]
    if args.dtypes is not None:
        dtype_options.append(args.dtypes)
    norm_options = [None, ]
    if args.norm is not None:
        norm_options.extend(args.norm)

    for dtypes in dtype_options:
        kwargs['dtypes'] = dtypes
github nanoporetech / medaka / medaka / common.py View on Github external
:param bam: `.bam` file.
    :param region_strs: iterable of str in zero-based (samtools-like)
        region format e.g. ref:start-end or filepath containing a
        region string per line.

    :returns: list of `Region` objects.
    """
    with pysam.AlignmentFile(bam) as bam_fh:
        ref_lengths = dict(zip(bam_fh.references, bam_fh.lengths))
    if region_strs is not None:
        if os.path.isfile(region_strs[0]):
            with open(region_strs[0]) as fh:
                region_strs = [line.strip() for line in fh.readlines()]

        regions = []
        for r in (Region.from_string(x) for x in region_strs):
            start = r.start if r.start is not None else 0
            end = r.end if r.end is not None else ref_lengths[r.ref_name]
            regions.append(Region(r.ref_name, start, end))
    else:
        regions = [
            Region(ref_name, 0, end)
            for ref_name, end in ref_lengths.items()]

    return regions
github nanoporetech / medaka / medaka / variant.py View on Github external
def variants_from_hdf(args):
    """Entry point for variant calling from HDF5 files.

    A `LabelScheme` read from HDF must define both a `decode_variants`
    and `decode_consnesus` method. The latter is used with `join_samples`
    to detect multi-locus variants spanning `Sample` slice boundaries.

    """
    logger = medaka.common.get_named_logger('Variants')

    index = medaka.datastore.DataIndex(args.inputs)

    if args.regions is None:
        args.regions = sorted(index.index)
    regions = [
        medaka.common.Region.from_string(r)
        for r in args.regions]

    # lookup LabelScheme stored in HDF5
    try:
        label_scheme = index.metadata['label_scheme']
    except KeyError:
        logger.debug(
            "Could not find `label_scheme` metadata in input file, "
            "assuming HaploidLabelScheme.")
        label_scheme = medaka.labels.HaploidLabelScheme()

    logger.debug("Label decoding is:\n{}".format(
        '\n'.join('{}: {}'.format(k, v)
                  for k, v in label_scheme._decoding.items())))

    if not hasattr(label_scheme, 'decode_variants'):
github nanoporetech / medaka / medaka / vcf.py View on Github external
def get_homozygous_regions(args):
    """Entry point to find homozygous regions from an input diploid VCF."""
    vcf = VCFReader(args.vcf, cache=False)
    reg = medaka.common.Region.from_string(args.region)
    if reg.start is None or reg.end is None:
        raise ValueError('Region start and end must be specified')

    def get_hetero_pos(v):
        gt = v.to_dict()['GT']
        pos = list()
        if gt[0] != gt[-1]:  # heterozygous
            pos = [p for p in range(v.pos, v.pos + len(v.ref))]
        return pos

    def get_homo_regions(ref_name, pos, min_len=1000):
        regions = []
        hetero_gaps = np.ediff1d(pos)
        sort_inds = np.argsort(hetero_gaps)[::-1]
        homo_len = 0
        for i in sort_inds: