How to use the snps.utils.get_empty_snps_dataframe function in snps

To help you get started, we’ve selected a few snps examples based on popular ways the snps.utils.get_empty_snps_dataframe function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github apriha / snps / src / snps / snps.py View on Github external
resources_dir : str
            name / path of resources directory
        deduplicate : bool
            deduplicate RSIDs and make SNPs available as `duplicate_snps`
        deduplicate_XY_chrom : bool
            deduplicate alleles in the non-PAR regions of X and Y for males; see `discrepant_XY_snps`
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing
        rsids : tuple, optional
            rsids to extract if loading a VCF file
        """
        self._file = file
        self._only_detect_source = only_detect_source
        self._snps = get_empty_snps_dataframe()
        self._duplicate_snps = pd.DataFrame()
        self._discrepant_XY_snps = pd.DataFrame()
        self._source = ""
        self._phased = False
        self._build = 0
        self._build_detected = False
        self._output_dir = output_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)

        if file:

            d = self._read_raw_data(file, only_detect_source, rsids)

            # Replace multiple rsids separated by commas in index with the first rsid. E.g. rs1,rs2 -> rs1
            multi_rsids = {
github apriha / snps / src / snps / io / reader.py View on Github external
snps (pandas.DataFrame)
                dataframe of parsed SNPs
            source (str)
                detected source of SNPs
            phased (bool)
                flag indicating if SNPs are phased

        References
        ----------
        1. Fluent Python by Luciano Ramalho (O'Reilly). Copyright 2015 Luciano Ramalho,
           978-1-491-94600-8.
        """
        phased = False

        if self._only_detect_source:
            df = get_empty_snps_dataframe()
        else:
            df, *extra = parser()

            if len(extra) == 1:
                phased = extra[0]

        return {"snps": df, "source": source, "phased": phased}
github apriha / snps / src / snps / io / reader.py View on Github external
Returns
        -------
        dict
            dict with the following items:

            snps (pandas.DataFrame)
                dataframe of parsed SNPs
            source (str)
                detected source of SNPs
            phased (bool)
                flag indicating if SNPs are phased
        """
        file = self._file
        compression = "infer"
        d = {
            "snps": get_empty_snps_dataframe(),
            "source": "",
            "phased": False,
            "build": 0,
        }

        # peek into files to determine the data format
        if isinstance(file, str) and os.path.exists(file):

            if ".zip" in file:
                with zipfile.ZipFile(file) as z:
                    with z.open(z.namelist()[0], "r") as f:
                        first_line, comments, data = self._extract_comments(
                            f, decode=True
                        )
            elif ".gz" in file:
                with gzip.open(file, "rt") as f:
github apriha / snps / src / snps / __init__.py View on Github external
resources_dir : str
            name / path of resources directory
        deduplicate : bool
            deduplicate RSIDs and make SNPs available as `duplicate_snps`
        deduplicate_XY_chrom : bool
            deduplicate alleles in the non-PAR regions of X and Y for males; see `discrepant_XY_snps`
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing
        rsids : tuple, optional
            rsids to extract if loading a VCF file
        """
        self._file = file
        self._only_detect_source = only_detect_source
        self._snps = get_empty_snps_dataframe()
        self._duplicate_snps = pd.DataFrame()
        self._discrepant_XY_snps = pd.DataFrame()
        self._source = ""
        self._phased = False
        self._build = 0
        self._build_detected = False
        self._output_dir = output_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)

        if file:

            d = self._read_raw_data(file, only_detect_source, rsids)

            self._snps = d["snps"]
            self._source = d["source"]