How to use medaka - 8 common examples

To help you get started, we've selected a few medaka examples based on popular ways it is used in public projects.


Example 1: medaka/keras_ext.py (nanoporetech/medaka)

if dataset == 'train':
    self.data = batcher.train_samples
elif dataset == 'validation':
    self.data = batcher.valid_samples
    if mini_epochs != 1:
        raise ValueError(
            "'mini_epochs' must be equal to 1 for validation data.")
else:
    raise ValueError("'dataset' should be 'train' or 'validation'.")

original_size = len(self.data)
self.n_batches = len(self.data) // self.batch_size
self.data = self.data[:self.n_batches * self.batch_size]
np.random.shuffle(self.data)
self.logger = medaka.common.get_named_logger(
    '{}Batcher'.format(dataset.capitalize()))
self.logger.info(
    '{} batches of {} samples ({}), from {} original.'.format(
        self.n_batches, self.batch_size, len(self.data),
        original_size))
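
The batcher truncates its sample list to a whole number of batches before shuffling, so every batch served during training is full. Below is a standalone sketch of that pattern; the names are illustrative, not medaka's API:

import numpy as np

samples = list(range(103))
batch_size = 10

n_batches = len(samples) // batch_size        # 10 full batches, 3 leftover
data = samples[:n_batches * batch_size]       # keep whole batches only
np.random.shuffle(data)                       # shuffle in place
batches = [data[i * batch_size:(i + 1) * batch_size]
           for i in range(n_batches)]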

Example 2: medaka/stitch.py (nanoporetech/medaka)

def stitch(args):
    """Entry point for stitching program."""
    index = medaka.datastore.DataIndex(args.inputs)
    if args.regions is None:
        args.regions = sorted(index.index)
    # batch size is a simple empirical heuristic
    regions = medaka.common.grouper(
        (medaka.common.Region.from_string(r) for r in args.regions),
        batch_size=max(1, len(args.regions) // (2 * args.jobs)))

    with open(args.output, 'w') as fasta:
        Executor = concurrent.futures.ProcessPoolExecutor
        with Executor(max_workers=args.jobs) as executor:
            worker = functools.partial(stitch_from_probs, args.inputs)
            for contigs in executor.map(worker, regions):
                for name, info, seq in contigs:
                    fasta.write('>{} {}\n{}\n'.format(name, info, seq))
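
stitch binds the shared input path with functools.partial and fans region batches out over a process pool; executor.map preserves input order, so contigs are written in the order the batches were produced. A runnable sketch of the same shape, with hypothetical stand-ins for medaka's functions:

import concurrent.futures
import functools

def fake_stitch(inputs, batch):
    # stand-in for stitch_from_probs: returns (name, info, seq) tuples
    return [('contig{}'.format(i), inputs, 'ACGT') for i in batch]

if __name__ == '__main__':
    batches = [[0, 1], [2, 3]]
    with open('out.fasta', 'w') as fasta:
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            worker = functools.partial(fake_stitch, 'probs.hdf')
            for contigs in executor.map(worker, batches):
                for name, info, seq in contigs:
                    fasta.write('>{} {}\n{}\n'.format(name, info, seq))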

Example 3: medaka/inference.py (nanoporetech/medaka)

test_sample, test_fname = self.samples[0]
with medaka.datastore.DataStore(test_fname) as ds:
    self.feature_shape = ds.load_sample(test_sample).features.shape
self.logger.info("Sample features have shape {}".format(self.feature_shape))

if isinstance(self.validation, float):
    np.random.seed(self.seed)
    np.random.shuffle(self.samples)
    n_sample_train = int((1 - self.validation) * len(self.samples))
    self.train_samples = self.samples[:n_sample_train]
    self.valid_samples = self.samples[n_sample_train:]
    msg = 'Randomly selected {} ({:3.2%}) of features for validation (seed {})'
    self.logger.info(msg.format(len(self.valid_samples), self.validation, self.seed))
else:
    self.train_samples = self.samples
    self.valid_samples = medaka.datastore.DataIndex(self.validation).samples.copy()
    msg = 'Found {} validation samples equivalent to {:3.2%} of all the data'
    fraction = len(self.valid_samples) / (len(self.valid_samples) + len(self.train_samples))
    self.logger.info(msg.format(len(self.valid_samples), fraction))

msg = 'Got {} samples in {} batches ({} labels) for {}'
self.logger.info(msg.format(len(self.train_samples),
                            len(self.train_samples) // batch_size,
                            len(self.train_samples) * self.feature_shape[0],
                            'training'))
self.logger.info(msg.format(len(self.valid_samples),
                            len(self.valid_samples) // batch_size,
                            len(self.valid_samples) * self.feature_shape[0],
                            'validation'))
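
When validation is a float, the batcher shuffles with a fixed seed and slices off the tail as the hold-out set; note that when computing the hold-out fraction afterwards, the whole denominator must be parenthesised. A small sketch of the split, with illustrative values:

import numpy as np

samples = list(range(100))
validation = 0.2          # hold out 20% of samples
seed = 42

np.random.seed(seed)      # fixed seed makes the split reproducible
np.random.shuffle(samples)
n_train = int((1 - validation) * len(samples))
train_samples, valid_samples = samples[:n_train], samples[n_train:]

# note the parentheses: a / (b + c), not (a / b) + c
fraction = len(valid_samples) / (len(valid_samples) + len(train_samples))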

Example 4: medaka/training.py (nanoporetech/medaka)

self.logger.info(
    "Sample features have shape {}".format(self.feature_shape))

if isinstance(self.validation, float):
    np.random.seed(self.seed)
    np.random.shuffle(self.samples)
    n_sample_train = int((1 - self.validation) * len(self.samples))
    self.train_samples = self.samples[:n_sample_train]
    self.valid_samples = self.samples[n_sample_train:]
    self.logger.info(
        'Randomly selected {} ({:3.2%}) of features for '
        'validation (seed {})'.format(
            len(self.valid_samples), self.validation, self.seed))
else:
    self.train_samples = self.samples
    self.valid_samples = medaka.datastore.DataIndex(
        self.validation).samples.copy()
    msg = 'Found {} validation samples, equivalent to {:3.2%} of all the data'
    fraction = len(self.valid_samples) / \
        (len(self.valid_samples) + len(self.train_samples))
    self.logger.info(msg.format(len(self.valid_samples), fraction))

msg = 'Got {} samples in {} batches ({} labels) for {}'
self.logger.info(msg.format(
    len(self.train_samples),
    len(self.train_samples) // batch_size,
    len(self.train_samples) * self.feature_shape[0],
    'training'))
self.logger.info(msg.format(
    len(self.valid_samples),
    len(self.valid_samples) // batch_size,
    len(self.valid_samples) * self.feature_shape[0],
    'validation'))
def rlebam(args):
    """Entry point for merging run length information for fast5s to bam."""
    logger = medaka.common.get_named_logger('BAMDecor')
    read_index = medaka.common.read_key_value_tsv(args.read_index)
    logger.info("Found {} read in index\n".format(len(read_index)))

    def _ingress():
        for line in sys.stdin:
            if line[0] == '@':
                yield line.rstrip(), None, None, None
            else:
                read_id, flag, _ = line.split('\t', 2)
                is_rev = bool(int(flag) & 16)
                fname = read_index[read_id]
                yield line.rstrip(), read_id, is_rev, fname

    with concurrent.futures.ProcessPoolExecutor(
            max_workers=args.workers) as executor:
        for results in executor.map(
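
rlebam streams SAM text on stdin, passing header lines (which start with '@') through untouched and testing bit 16 of the FLAG field to detect reverse-strand alignments. The flag arithmetic in isolation:

# FLAG is a bitmask defined by the SAM specification; bit 0x10 (16) marks a
# reverse-strand alignment and bit 0x800 (2048) a supplementary alignment.
assert bool(0 & 16) is False      # flag 0: forward-strand primary alignment
assert bool(16 & 16) is True      # flag 16: reverse-strand primary alignment
assert bool(2064 & 16) is True    # flag 2064 = 2048 + 16: supplementary, reverse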

Example 6: medaka/features.py (nanoporetech/medaka)

elif max_run != num_qstrat:
    raise ValueError(
        'num_qstrat in feature_encoder_args must agree '
        'with max_run in feature_encoder_args')

# Create model ancillaries and serialise them to file
feature_encoder = feature_encoders[args.feature_encoder](
    **args.feature_encoder_args)
ds.set_meta(feature_encoder, 'feature_encoder')

label_scheme = medaka.labels.label_schemes[args.label_scheme](
    **args.label_scheme_args)
ds.set_meta(label_scheme, 'label_scheme')

model_function = functools.partial(
    medaka.models.build_model,
    feature_encoder.feature_vector_length,
    len(label_scheme._decoding))
ds.set_meta(model_function, 'model_function')

# TODO: this parallelism would be better in
# `SampleGenerator.bams_to_training_samples` since training
# alignments are usually chunked.
with concurrent.futures.ProcessPoolExecutor(max_workers=args.threads) \
        as executor:
    # break up overly long chunks
    MAX_SIZE = int(1e6)
    regions = itertools.chain(*(r.split(MAX_SIZE) for r in regions))
    futures = [executor.submit(
        _samples_worker, args, reg,
        feature_encoder, label_scheme) for reg in regions]
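
Unlike executor.map in the stitch example, this code submits each region individually and keeps the resulting Future objects, which allows results to be consumed as they complete rather than in submission order. A self-contained sketch of that pattern; the worker and inputs are illustrative:

import concurrent.futures

def work(region):
    # stand-in for _samples_worker
    return region * 2

if __name__ == '__main__':
    regions = [1, 2, 3, 4]
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(work, reg) for reg in regions]
        # as_completed yields futures as they finish, in completion order
        for future in concurrent.futures.as_completed(futures):
            print(future.result())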

Example 7: medaka/inference.py (nanoporetech/medaka)

inter_op_parallelism_threads=args.threads)
    ))

    # Split overly long regions to maximum size so as to not create
    #   massive feature matrices
    MAX_REGION_SIZE = int(1e6)  # 1Mb
    regions = []
    for region in args.regions:
        if region.size > MAX_REGION_SIZE:
            regs = region.split(MAX_REGION_SIZE, args.chunk_ovlp)
        else:
            regs = [region]
        regions.extend(regs)

    logger.info("Processing {} long region(s) with batching.".format(len(regions)))
    model = medaka.models.load_model(args.model, time_steps=args.chunk_len)
    # the returned regions are those where the pileup width is smaller than chunk_len
    remainder_regions = run_prediction(
        args.output, args.bam, regions, model, args.model, args.rle_ref, args.read_fraction,
        args.chunk_len, args.chunk_ovlp,
        batch_size=args.batch_size, save_features=args.save_features,
        tag_name=args.tag_name, tag_value=args.tag_value, tag_keep_missing=args.tag_keep_missing
    )

    # short/remainder regions: just do things without chunking. We can do this
    # here because we now have the size of all pileups (and know they are small).
    # TODO: can we avoid calculating pileups twice whilst controlling memory?
    if len(remainder_regions) > 0:
        logger.info("Processing {} short region(s).".format(len(remainder_regions)))
        model = medaka.models.load_model(args.model, time_steps=None)
        for region in remainder_regions:
            new_remainders = run_prediction(
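
Regions larger than MAX_REGION_SIZE are split into overlapping chunks so no single feature matrix grows unbounded; the overlap lets predictions from adjacent chunks be reconciled later. A hypothetical splitter showing the arithmetic (medaka's own Region.split handles the real cases):

def split_region(start, end, max_size, overlap):
    # cut [start, end) into pieces of at most max_size, each overlapping
    # its predecessor by `overlap` positions
    step = max_size - overlap
    chunks, pos = [], start
    while pos < end:
        chunks.append((pos, min(pos + max_size, end)))
        if pos + max_size >= end:
            break
        pos += step
    return chunks

print(split_region(0, 2500000, 1000000, 100000))
# [(0, 1000000), (900000, 1900000), (1800000, 2500000)]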

Example 8: medaka/prediction.py (nanoporetech/medaka)

for region in args.regions:
    if region.size > MAX_REGION_SIZE:
        # chunk_ovlp is mostly used in overlapping pileups (which generally
        # end up being expanded compared to the draft coordinate system)
        regs = region.split(
            MAX_REGION_SIZE, overlap=args.chunk_ovlp, fixed_size=False)
    else:
        regs = [region]
    regions.extend(regs)

logger.info("Processing {} long region(s) with batching.".format(
    len(regions)))

logger.info("Using model: {}.".format(args.model))

model = medaka.models.load_model(args.model, time_steps=args.chunk_len,
                                 allow_cudnn=args.allow_cudnn)

# the returned regions are those where the pileup width is smaller than
# chunk_len
remainder_regions = run_prediction(
    args.output, args.bam, regions, model, feature_encoder,
    args.chunk_len, args.chunk_ovlp,
    batch_size=args.batch_size, save_features=args.save_features
)

# short/remainder regions: just do things without chunking. We can do this
# here because we now have the size of all pileups (and know they are
# small).
# TODO: can we avoid calculating pileups twice whilst controlling memory?
if len(remainder_regions) > 0:
    logger.info("Processing {} short region(s).".format(
        len(remainder_regions)))