How to use the kipoi.data.Dataset function in kipoi

To help you get started, we’ve selected a few kipoi examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kipoi / models / FactorNet / FOXA1 / multiTask_DGF / dataloader.py View on Github external
rf.get_file(output_file)


class BedToolLinecache(BedTool):
    """Faster BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to reach the line of
    interest, hence each access is O(n). This subclass fetches the
    requested line through ``linecache`` instead.

    Note: this might load the whole BED file into memory.
    """

    def __getitem__(self, idx):
        """Return the interval found on line ``idx`` (0-based) of the file.

        Negative indices are not supported.

        Raises:
            IndexError: if ``linecache`` has no line at that position.
        """
        # linecache numbers lines from 1. It never raises: a missing line
        # (or an unreadable file) comes back as the empty string.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "no line at index {0} in {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))


class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
    """

    SEQ_WIDTH = 1002

    def __init__(self,
                 intervals_file,
                 fasta_file,
                 dnase_file,
                 use_linecache=True):

        # intervals
github kipoi / models / Divergent430 / TransferLearning / dataloader.py View on Github external
import pybedtools
from pybedtools import BedTool
from genomelake.extractors import FastaExtractor
from kipoi.data import Dataset
from kipoi.metadata import GenomicRanges
import linecache


class BedToolLinecache(BedTool):
    """BedTool whose item access reads single lines through ``linecache``.

    ``bt[idx]`` parses only the requested line of ``self.fn`` instead of
    going through the base-class accessor.
    """

    def __getitem__(self, idx):
        """Return the interval found on line ``idx`` (0-based) of the file.

        Negative indices are not supported.

        Raises:
            IndexError: if ``linecache`` has no line at that position.
        """
        # linecache numbers lines from 1. It never raises: a missing line
        # (or an unreadable file) comes back as the empty string.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "no line at index {0} in {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))


class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3+1 file containing intervals+labels
        fasta_file: file path; Genome sequence
    """

    def __init__(self, intervals_file, fasta_file):
        """Open the intervals file and remember the FASTA path.

        Args:
            intervals_file: path handed to ``BedToolLinecache``.
            fasta_file: path stored as ``self.fasta_file``.
        """
        self.bt = BedToolLinecache(intervals_file)
        # No extractor is created here; __getitem__ checks this attribute
        # for None before using it.
        self.fasta_extractor = None
        self.fasta_file = fasta_file

    def __len__(self):
        """Return ``len(self.bt)``, the length reported by the intervals BedTool."""
        return len(self.bt)

    def __getitem__(self, idx):
        if self.fasta_extractor is None:
github kipoi / models / pwm_HOCOMOCO / human / template / dataloader.py View on Github external
# --------------------------------------------


class BedToolLinecache(BedTool):
    """Fast BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to reach the line of
    interest, hence each access is O(n). This subclass fetches the
    requested line through ``linecache`` instead.
    """

    def __getitem__(self, idx):
        """Return the interval found on line ``idx`` (0-based) of the file.

        Negative indices are not supported.

        Raises:
            IndexError: if ``linecache`` has no line at that position.
        """
        # linecache numbers lines from 1. It never raises: a missing line
        # (or an unreadable file) comes back as the empty string.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "no line at index {0} in {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))


class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
    """

    def __init__(self, intervals_file, fasta_file, target_file=None, use_linecache=False):

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
        self.fasta_file = fasta_file
        self.fasta_extractor = None  # to be initialized later
github kipoi / models / CpGenie / template / dataloader.py View on Github external
import numpy as np

# --------------------------------------------
class BedToolLinecache(BedTool):
    """Fast BedTool accessor by Ziga Avsec.

    A normal BedTool loops through the whole file to reach the line of
    interest, hence each access is O(n). This subclass fetches the
    requested line through ``linecache`` instead.
    """

    def __getitem__(self, idx):
        """Return the interval found on line ``idx`` (0-based) of the file.

        Negative indices are not supported.

        Raises:
            IndexError: if ``linecache`` has no line at that position.
        """
        # linecache numbers lines from 1. It never raises: a missing line
        # (or an unreadable file) comes back as the empty string.
        line = linecache.getline(self.fn, idx + 1)
        if not line:
            raise IndexError(
                "no line at index {0} in {1}".format(idx, self.fn))
        return pybedtools.create_interval_from_list(line.strip().split("\t"))


class SeqDataset(Dataset):
    """
    Args:
        intervals_file: bed3 file containing intervals
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in the csv format
    """

    SEQ_WIDTH = 1001

    def __init__(self, intervals_file, fasta_file, target_file=None, use_linecache=False):

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)
github kipoi / models / MaxEntScan / dataloader.py View on Github external
def seq(self):
        return self._seq

    @seq.setter
    def seq(self, value):
        """Store ``value`` in the private ``_seq`` attribute."""
        self._seq = value

    def get_seq(self, fasta):
        """Look up this object's sequence in ``fasta``.

        Calls ``fasta.get_seq`` with this object's ``chrom``, ``grange`` and
        ``strand`` (in that order) and returns the result unchanged.
        """
        return fasta.get_seq(self.chrom, self.grange, self.strand)


@kipoi_dataloader()
class SplicingMaxEntDataset(Dataset):
    """
    args:
      MISO_AS:
        doc: Whether the given annotation file is MISO alternative splicing annotation. default False.
      fasta_file:
        doc: Reference Genome sequence in fasta format
        example:
          md5: 936544855b253835442a0f253dd4b083
          url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.fa?download=1
        type: str
      gtf_file:
        doc: file path; Genome annotation GTF file
        example:
          md5: 174fd11303ae2c2369094bfcbe303c07
          url: https://zenodo.org/record/1466099/files/3prime-example_files-hg19.chr22.gtf?download=1
      label_col:
github kipoi / kipoiseq / kipoi_dataloaders / sequence_based.py View on Github external
'fasta_file' : 'example_files/chr21.fa',
'num_chr_fasta' : True,
}

def parse_dtype(dtype):
    """Resolve a label dtype specification to a Python type.

    Args:
        dtype: ``None``, one of the names ``'int'``, ``'string'``,
            ``'float'``, ``'bool'``, or one of the corresponding types
            (``int``, ``str``, ``float``, ``bool``) themselves.

    Returns:
        ``None`` if ``dtype`` is ``None``; ``dtype`` itself if it already
        equals one of the supported types; otherwise the type registered
        under the given name.

    Raises:
        ValueError: if ``dtype`` is neither a supported type nor a known
            name (this includes unhashable values such as lists).
    """
    dtypes = {'int': int, 'string': str, 'float': float, 'bool': bool}
    if dtype is None:
        return None
    # Equality scan over the values (no hashing involved), so anything that
    # compares equal to a supported type is passed through untouched.
    if dtype in dtypes.values():
        return dtype
    try:
        return dtypes[dtype]
    except (KeyError, TypeError):
        # KeyError: unknown name. TypeError: unhashable input, which can
        # never be a valid name either. Both get the same error below.
        pass
    raise ValueError("Datatype '{0}' not recognized. Allowed are: {1}".format(dtype, str(list(dtypes.keys()))))


class FastaBasedDataset(Dataset):
    """
    Args:
        intervals_file: bed3+ file containing intervals+labels
        fasta_file: file path; Genome sequence
        num_chr_fasta: if True, the tsv-loader will make sure that the chromosomes
          don't start with chr
        label_dtype: label data type
        seq_len: required sequence length
        use_strand: reverse-complement fasta sequence if bed file defines negative strand
        force_upper: Force uppercase output of sequences
    """
    output_schema = None
    type = 'Dataset'
    defined_as = 'kipoi_dataloaders.FastaBasedDataset'
    info = None
    args = OrderedDict()
github kipoi / models / rbp / dataloader.py View on Github external
return float(self.lines[idx].strip())


# Module-level relative file paths.
# NOTE(review): these look like example/test inputs for the dataloader
# defined below -- confirm whether anything still reads them.
intervals_file = "test_files/intervals.tsv"
target_file = "test_files/targets.tsv"
gtf_file = "test_files/gencode_v25_chr22.gtf.pkl.gz"
fasta_file = "test_files/hg38_chr22.fa"
preproc_transformer = "extractor_files/encodeSplines.pkl"
# Disabled snippet for loading the first few intervals by hand:
# bt = pybedtools.BedTool(intervals_file)
# intervals = [i for i in bt[:10]]

# --------------------------------------------


class SeqDistDataset(Dataset):
    """
    Args:
        intervals_file: file path; tsv file
            Assumes bed-like `chrom start end id score strand` format.
        fasta_file: file path; Genome sequence
        gtf_file: file path; Genome annotation GTF file pickled using pandas.
        preproc_transformer: file path; tranformer used for pre-processing.
        target_file: file path; path to the targets
        batch_size: int
    """

    def __init__(self, intervals_file, fasta_file, gtf_file, preproc_transformer, target_file=None):
        gtf = pd.read_pickle(gtf_file)
        self.gtf = gtf[gtf["info"].str.contains('gene_type "protein_coding"')]

        # distance transformer
github kipoi / models / rbp_eclip / template / dataloader.py View on Github external
if use_strand and interval.strand == "-":
                dist = - dist

            return dist[np.argmin(np.abs(dist))]

        out[:] = np.array([[find_closest(self.landmarks[ldm_name], interval, self.use_strand)
                            for ldm_name in self.columns]
                           for interval in intervals], dtype=float)

        return out

    def _get_output_shape(self, num_intervals, width):
        """Return the output shape ``(num_intervals, len(self.columns))``.

        ``width`` is accepted but does not influence the result.
        """
        n_columns = len(self.columns)
        return num_intervals, n_columns


class TxtDataset(Dataset):
    """Dataset that serves each line of a text file as an integer."""

    def __init__(self, path):
        """Read every line of ``path`` into ``self.lines``."""
        with open(path, "r") as handle:
            self.lines = list(handle)

    def __len__(self):
        """Return the number of lines that were read."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return line ``idx`` with surrounding whitespace removed, as ``int``."""
        raw = self.lines[idx]
        return int(raw.strip())


# --------------------------------------------
class SeqDistDataset(Dataset):
    """
    Args:
github kipoi / kipoiseq / kipoiseq / dataloaders / sequence.py View on Github external
if self.ignore_targets or self.n_tasks == 0:
            labels = {}
        else:
            labels = row.iloc[self.bed_columns:].values.astype(
                self.label_dtype)
        return interval, labels

    def __len__(self):
        """Return ``len(self.df)`` (the row count, assuming ``df`` is a pandas DataFrame)."""
        return len(self.df)

    def get_targets(self):
        """Return the label columns of ``self.df`` as an array.

        Takes every column from position ``self.bed_columns`` onwards and
        casts the values to ``self.label_dtype``.
        """
        label_frame = self.df.iloc[:, self.bed_columns:]
        return label_frame.values.astype(self.label_dtype)


@kipoi_dataloader(override={"dependencies": deps, 'info.authors': package_authors})
class StringSeqIntervalDl(Dataset):
    """
    info:
        doc: >
           Dataloader for a combination of fasta and tab-delimited input files such as bed files. The dataloader extracts
           regions from the fasta file as defined in the tab-delimited `intervals_file`. Returned sequences are of the type
           np.array([str]).
    args:
        intervals_file:
            doc: bed3+ file path containing intervals + (optionally) labels
            example:
              url: https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/intervals_51bp.tsv
              md5: a76e47b3df87fd514860cf27fdc10eb4
        fasta_file:
            doc: Reference genome FASTA file path.
            example:
              url: https://raw.githubusercontent.com/kipoi/kipoiseq/master/tests/data/hg38_chr22_32000000_32300000.fa
github kipoi / models / HAL / dataloader.py View on Github external
    @property
    def seq(self):
        """Value currently held in the private ``_seq`` attribute."""
        return self._seq

    @seq.setter
    def seq(self, value):
        """Store ``value`` in the private ``_seq`` attribute."""
        self._seq = value

    def get_seq(self, fasta):
        """Look up this object's sequence in ``fasta``.

        Calls ``fasta.get_seq`` with this object's ``chrom``, ``grange`` and
        ``strand`` (in that order) and returns the result unchanged.
        """
        return fasta.get_seq(self.chrom, self.grange, self.strand)


class SplicingKmerDataset(Dataset):
    """
    Args:
        gtf_file: gtf file. Can be dowloaded from MISO or ensembl.
        fasta_file: file path; Genome sequence
        target_file: file path; path to the targets in MISO summary format.
        overhang: length of overhang.
        MISO_AS: whether the used annotation file is from MISO alternative splicing annotation.
    """

    def __init__(self,
                 gtf_file,
                 fasta_file,
                 overhang=80,
                 MISO_AS=False):  # intron + ~ bp exon from both side
        self.genes = loadgene(gtf_file)
        self.fasta_file = fasta_file