How to use the medaka.common.Sample function in medaka

To help you get started, we’ve selected a few medaka examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github nanoporetech / medaka / medaka / datastore.py View on Github external
def load_sample(self, key):
        """Load `medaka.common.Sample` object from file.

        :param key: str, sample name.
        :returns: `medaka.common.Sample` object.
        """
        s = dict()
        for field in medaka.common.Sample._fields:
            pth = '{}/{}/{}'.format(self._sample_path_, key, field)
            if pth in self.fh:
                s[field] = self.fh[pth][()]
                # handle loading of bytestrings
                if isinstance(s[field], np.ndarray) and \
                        isinstance(s[field][0], type(b'')):
                    s[field] = np.char.decode(s[field])
            else:
                s[field] = None
        return medaka.common.Sample(**s)
github nanoporetech / medaka / medaka / common.py View on Github external
def amend(self, **kwargs):
        """Create new `Sample` with some attributes changed."""
        d = self._asdict()
        for k, v in kwargs.items():
            if k not in self._fields:
                raise KeyError('Invalid key for Sample: {}'.format(k))
            d[k] = v
        return Sample(**d)
github nanoporetech / medaka / medaka / features.py View on Github external
# max just to avoid div error
            feature_array = counts / np.maximum(1, depth).reshape((-1, 1))
        elif self.normalise == 'fwd_rev':
            # normalize forward and reverse and by dtype
            feature_array = np.empty_like(counts, dtype=self.feature_dtype)
            for (dt, is_rev), inds in self.feature_indices.items():
                dt_depth = np.sum(counts[:, inds], axis=1)
                dt_depth[minor_inds] = dt_depth[major_ind_at_minor_inds]
                # max just to avoid div err
                feature_array[:, inds] = \
                    counts[:, inds] / np.maximum(1, dt_depth).reshape((-1, 1))
        else:
            feature_array = counts
        feature_array = feature_array.astype(self.feature_dtype)

        sample = medaka.common.Sample(
            ref_name=region.ref_name, features=feature_array,
            labels=None, ref_seq=None,
            positions=positions, label_probs=None
        )
        self.logger.info('Processed {} (median depth {})'.format(
            sample.name, np.median(depth)))
        return sample
github nanoporetech / medaka / medaka / common.py View on Github external
# Relationship.forward_abutted guarantees all samples have the
        # same ref_name
        non_concat_fields = {'ref_name'}

        def concat_attr(attr):
            vals = [getattr(s, attr) for s in samples]
            if attr not in non_concat_fields:
                all_none = all([v is None for v in vals])
                c = np.concatenate(vals) if not all_none else None
            else:
                assert len(set(vals)) == 1
                c = vals[0]
            return c

        return Sample(**{attr:  concat_attr(attr) for attr in Sample._fields})
github nanoporetech / medaka / medaka / variant.py View on Github external
:param sample_gen: stream of (`medaka.common.Sample`,
        bool is_last_in_contig)
    :param ref_seq: str, reference sequence

    :yields: `medaka.common.Sample` s

    """
    queue = []
    for s, is_last_in_contig in sample_gen:
        if is_last_in_contig:
            # there are no further samples in this contig
            # all remaining variants must be in this contig
            # or in the queue, so process and empty queue
            queue.append(s)
            yield medaka.common.Sample.from_samples(queue)
            queue = []
            continue

        # find last non-variant call, i.e. a major position which
        # has no minor positions and is the same as the ref

        # call retaining gaps cast to array
        call_with_gaps = np.array(list(
            label_scheme.decode_consensus(s, with_gaps=True)), dtype='|U1')

        def get_symbol(p):
            return ref_seq[p['major']] if p['minor'] == 0 else '*'

        ref_seq_with_gaps = np.fromiter(
            (get_symbol(p) for p in s.positions),
            dtype='|U1',
github nanoporetech / medaka / medaka / compress.py View on Github external
# if self.log_min is 10, we can cope with depth up to 10**9
                    feature_array[i, :] = np.log10(feature_array[i, :],
                                                   out=feature_array[i, :])
                    feature_array[i, :] += self.log_min
                    feature_array[i, :] = np.nan_to_num(feature_array[i, :], copy=False)

                if self.ref_mode == 'onehot':
                    feature_array[i, self.encoding[('ref', str(ref_base), int(ref_len))]] = 1
                elif self.ref_mode == 'base_length':
                    feature_array[i, self.encoding['ref_base']] = self.ref_base_encoding[ref_base]
                    feature_array[i, self.encoding['ref_length']] = ref_len
                elif self.ref_mode == 'index':
                    # index of count which ref would contribute to were it a read
                    feature_array[i, self.encoding['ref_index']] = self.encoding[(False, min(ref_len, self.max_hp_len), ref_base)]

            sample = Sample(ref_name=region.ref_name, features=feature_array,
                            labels=None, ref_seq=ref_array,
                            positions=positions, label_probs=None)
            logging.info('Processed {} (median depth {})'.format(encode_sample_name(sample), np.median(depth_array)))

            return sample
github nanoporetech / medaka / medaka / compress.py View on Github external
mock_compr = self.max_hp_len > 1 and not self.is_compressed
            truth_chunk = aln.get_positions_and_labels(ref_compr_rle=ref_rle_fq, mock_compr=mock_compr,
                                                       is_compressed=self.is_compressed, rle_dtype=True)
            sample = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle_fq, read_fraction=read_fraction)
            # Create labels according to positions in pileup
            pad = (encoding[_gap_], 1) if len(truth_chunk.reads[0].dtype) > 0 else encoding[_gap_]
            padder = itertools.repeat(pad)
            position_to_label = defaultdict(padder.__next__,
                                            zip([tuple(p) for p in truth_chunk.positions],
                                                [a for a in truth_chunk.reads[0]]))
            padded_labels = np.fromiter((position_to_label[tuple(p)] for p in sample.positions),
                                            dtype=truth_chunk.reads[0].dtype, count=len(sample.positions))

            sample = sample._asdict()
            sample['labels'] = padded_labels
            samples.append(Sample(**sample))
        return tuple(samples)
github nanoporetech / medaka / medaka / common.py View on Github external
...     dtype=[('major', int), ('minor', int)])
        >>> feat = np.arange(len(pos))
        >>> s = Sample('contig1', feat , None, None, pos, None)
        >>> s.slice(2)  #doctest: +ELLIPSIS
        Sample(...features=2, ..., positions=(1, 1), label_probs=None)
        >>> s.slice(slice(1,3)) #doctest: +ELLIPSIS
        Sample(..., features=array([1, 2]),...)
        """
        non_slice_fields = {'ref_name'}

        def slice_attr(attr):
            a = getattr(self, attr)
            if attr not in non_slice_fields:
                a = a[key] if a is not None else None
            return a
        return Sample(**{attr: slice_attr(attr) for attr in self._fields})
github nanoporetech / medaka / medaka / stitch.py View on Github external
else:
                # s2 ends before s1
                if s2.last_pos <= s1.last_pos:
                    logger.info('{} ends before {}, skipping.'.format(
                        s2_name, s1_name
                    ))
                    continue
                # s1 and s2 overlap by only one position
                # or there is no overlap between s1 and s2
                elif s2.first_pos >= s1.last_pos:
                    # trigger a break
                    end_1, start_2 = None, None
                else:
                    try:
                        end_1, start_2, heuristic = \
                            common.Sample.overlap_indices(s1, s2)
                        if heuristic:
                            logger.debug(
                                "Used heuristic to stitch {} and {}.".format(
                                    s1.name, s2.name))
                            heuristic_use += 1
                    except common.OverlapException as e:
                        logger.info(
                            "Unhandled overlap type whilst stitching chunks.")
                        raise(e)

            new_seq = label_scheme.decode_consensus(
                s1.slice(slice(start_1, end_1)))

            seq_parts.append(new_seq)

            if end_1 is None: