How to use the medaka.common.get_regions function in medaka

To help you get started, we’ve selected a few medaka examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nanoporetech / medaka / medaka / rle.py View on Github external
fast5_dir=None, summary=None, regions=None, threads=1):
    """Compress a bam into run length encoding (RLE).

    :param bam_input: str, name of the bam file to be compressed
    :param bam_output: str, name of the bam to be produced
    :param ref_fname: str, reference filename, used to produce bam_input
    :param fast5_dir: str, root directory to find the fast5 files
    :param summary: str, filename of a summary name, with the columns
        to link the read id to the fast5 file containing it. Must contain
        columns 'read_id' and 'filename'
    :param regions: list, genomic regions to be extracted
    :param threads: int, number of workers to be used

    :returns: None
    """
    regions = medaka.common.get_regions(bam_input, regions)
    ref_fasta = pysam.FastaFile(ref_fname)

    # If fast_dir is passed, create an index
    if fast5_dir:
        with open(summary) as input_file:
            # Summary files can be huge, avoid loading to memory
            col_names = input_file.readline().replace('#', '').split()
            col_filename = col_names.index('filename')
            col_readid = col_names.index('read_id')

            file_index = dict(
                (line.split()[col_readid], line.split()[col_filename])
                for line in input_file)
    else:
        file_index = None
github nanoporetech / medaka / medaka / methdaka.py View on Github external
def call_methylation(args):
    """Entry point for calling methylation from bam file."""
    logger = medaka.common.get_named_logger('Mextract')
    logger.info("Processing {}.".format(args.bam))
    region = medaka.common.get_regions(args.bam, [args.region])[0]
    ref = pysam.FastaFile(args.reference)
    ref = ref.fetch(region.ref_name)
    motifs = MOTIFS[args.meth]

    additional_text = ''
    if args.meth == 'all':
        additional_text = (
            " Note motifs are stranded, such that both 'C' and 'G' (and 'A' "
            "and 'T') bases will be seen in the output for 5mC (6mA). "
            "Output with a 'G' ('T' for 6mA) correspond to the reverse "
            "strand. Users should post process the output to join/sum lines "
            "belonging to the same genomic loci.")
    elif args.meth == 'dcm':
        additional_text = (
            " The output is sorted by motif, users may wish to subsequently "
            "sorted by position.")
github nanoporetech / medaka / medaka / prediction.py View on Github external
def predict(args):
    """Inference program."""
    logger_level = logging.getLogger(__package__).level
    if logger_level > logging.DEBUG:
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

    import tensorflow as tf
    from tensorflow.keras import backend as K

    args.regions = medaka.common.get_regions(
        args.bam, region_strs=args.regions)
    logger = medaka.common.get_named_logger('Predict')
    logger.info('Processing region(s): {}'.format(
        ' '.join(str(r) for r in args.regions)))

    # create output and copy meta
    with medaka.datastore.DataStore(args.model) as ds:
        ds.copy_meta(args.output)
        feature_encoder = ds.get_meta('feature_encoder')

    feature_encoder.tag_name = args.tag_name
    feature_encoder.tag_value = args.tag_value
    feature_encoder.tag_keep_missing = args.tag_keep_missing
    feature_encoder.read_group = args.RG

    logger.info("Setting tensorflow threads to {}.".format(args.threads))
github nanoporetech / medaka / medaka / inference.py View on Github external
def predict(args):
    """Inference program."""
    logger_level = logging.getLogger(__package__).level
    if logger_level > logging.DEBUG:
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

    import tensorflow as tf
    from tensorflow.keras.models import load_model
    from tensorflow.keras import backend as K

    args.regions = medaka.common.get_regions(args.bam, region_strs=args.regions)
    logger = medaka.common.get_named_logger('Predict')
    logger.info('Processing region(s): {}'.format(' '.join(str(r) for r in args.regions)))

    # write class names to output
    with medaka.datastore.DataStore(args.model) as ds:
        meta = ds.meta
    with medaka.datastore.DataStore(args.output, 'w', verify_on_close=False) as ds:
        ds.update_meta(meta)

    logger.info("Setting tensorflow threads to {}.".format(args.threads))
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    K.set_session(tf.Session(
        config=tf.ConfigProto(
            intra_op_parallelism_threads=args.threads,
            inter_op_parallelism_threads=args.threads)
    ))
github nanoporetech / medaka / medaka / features.py View on Github external
def create_samples(args):
    """Entry point for creation of feature .hdfs (labelled or unlabelled)."""
    logger = medaka.common.get_named_logger('Prepare')
    if args.chunk_ovlp >= args.chunk_len:
        raise ValueError(
            'chunk_ovlp {} is not smaller than chunk_len {}'.format(
                args.chunk_ovlp, args.chunk_len))
    regions = medaka.common.get_regions(args.bam, args.regions)
    reg_str = '\n'.join(['\t\t\t{}'.format(r) for r in regions])
    logger.info('Got regions:\n{}'.format(reg_str))
    if args.truth is None:
        logger.warn(
            'Running medaka features without a truth bam, '
            'unlabelled data will be produced. Is this intended?')
        time.sleep(3)

    no_data = False
    with medaka.datastore.DataStore(args.output, 'w') as ds:
        # write feature options to file
        logger.info("Writing meta data to file.")

        num_qstrat = args.feature_encoder_args.get('num_qstrat')
        max_run = args.label_scheme_args.get('max_run')
        # If one of them is set, set the other to agree.