How to use the snps.utils.Parallelizer class in snps

To help you get started, we’ve selected a few snps examples based on popular ways the library is used in public projects.
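All of the examples below share the same pattern: construct a Parallelizer (optionally passing parallelize and processes), then call it with a worker function and a list of task dicts. A minimal sketch of that pattern, using a hypothetical worker named double (the __main__ guard mirrors the script further down, since multiprocessing workers must be importable):

from snps.utils import Parallelizer

def double(task):
    # each call receives one dict from the tasks list
    return task["x"] * 2

if __name__ == "__main__":
    p = Parallelizer(parallelize=True)
    tasks = [{"x": i} for i in range(10)]
    results = list(p(double, tasks))  # worker return values, one per task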


github apriha / snps / src / snps / snps.py (View on GitHub)
processes to launch if multiprocessing
        rsids : tuple, optional
            rsids to extract if loading a VCF file
        """
        self._file = file
        self._only_detect_source = only_detect_source
        self._snps = get_empty_snps_dataframe()
        self._duplicate_snps = pd.DataFrame()
        self._discrepant_XY_snps = pd.DataFrame()
        self._source = ""
        self._phased = False
        self._build = 0
        self._build_detected = False
        self._output_dir = output_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)

        if file:

            d = self._read_raw_data(file, only_detect_source, rsids)

            # Replace multiple rsids separated by commas in index with the first rsid. E.g. rs1,rs2 -> rs1
            multi_rsids = {
                multi_rsid: multi_rsid.split(",")[0]
                for multi_rsid in list(
                    filter(lambda x: len(x.split(",")) > 1, d["snps"].index)
                )
            }
            d["snps"].rename(index=multi_rsids, inplace=True)

            self._snps = d["snps"]
            self._source = d["source"]
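The dict comprehension above maps every comma-separated index entry to its first rsid before renaming. A standalone sketch of the same transformation on a toy DataFrame (index values here are illustrative):

import pandas as pd

# toy frame whose index mixes single and comma-separated rsids
snps = pd.DataFrame({"genotype": ["AA", "CT", "GG"]}, index=["rs1", "rs2,rs3", "rs4"])

multi_rsids = {
    multi_rsid: multi_rsid.split(",")[0]
    for multi_rsid in snps.index
    if len(multi_rsid.split(",")) > 1
}
snps.rename(index=multi_rsids, inplace=True)

print(snps.index.tolist())  # ['rs1', 'rs2', 'rs4']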
github apriha / snps / analysis / xy-chrom-snp-ratios / xy-chrom-snp-ratios.py (View on GitHub)
if __name__ == "__main__":
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # set up tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # results are a list of lists
    rows = p(get_xy_chrom_snp_ratios, tasks)

    # remove None results
    rows = [row for row in rows if row]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "x_snps",
github apriha / snps / src / snps / __init__.py (View on GitHub)
resources_dir : str
            name / path of resources directory
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing
        """
        self._file = file
        self._only_detect_source = only_detect_source
        self._snps = pd.DataFrame()
        self._source = ""
        self._build = 0
        self._build_detected = False
        self._output_dir = output_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)

        if file:

            self._snps, self._source = self._read_raw_data(file, only_detect_source)

            if not self._snps.empty:
                self.sort_snps()

                self._build = self.detect_build()

                if not self._build:
                    self._build = 37  # assume Build 37 / GRCh37 if not detected
                else:
                    self._build_detected = True

                if assign_par_snps:
github apriha / lineage / src / lineage / __init__.py (View on GitHub)
""" Initialize a ``Lineage`` object.

        Parameters
        ----------
        output_dir : str
            name / path of output directory
        resources_dir : str
            name / path of resources directory
        parallelize : bool
            utilize multiprocessing to speedup calculations
        processes : int
            processes to launch if multiprocessing
        """
        self._output_dir = os.path.abspath(output_dir)
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
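lineage wires the same Parallelizer into its constructor and simply forwards the parallelize and processes arguments. The flag also makes debugging straightforward: a sketch assuming parallelize=False falls back to serial execution with the same call signature:

from snps.utils import Parallelizer

def increment(task):
    return task["n"] + 1

tasks = [{"n": i} for i in range(3)]
p = Parallelizer(parallelize=False)  # same interface, serial execution
print(list(p(increment, tasks)))  # [1, 2, 3]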
github apriha / snps / src / snps / __init__.py (View on GitHub)
processes to launch if multiprocessing
        rsids : tuple, optional
            rsids to extract if loading a VCF file
        """
        self._file = file
        self._only_detect_source = only_detect_source
        self._snps = get_empty_snps_dataframe()
        self._duplicate_snps = pd.DataFrame()
        self._discrepant_XY_snps = pd.DataFrame()
        self._source = ""
        self._phased = False
        self._build = 0
        self._build_detected = False
        self._output_dir = output_dir
        self._resources = Resources(resources_dir=resources_dir)
        self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)

        if file:

            d = self._read_raw_data(file, only_detect_source, rsids)

            self._snps = d["snps"]
            self._source = d["source"]
            self._phased = d["phased"]

            if not self._snps.empty:
                self.sort_snps()

                if deduplicate:
                    self._deduplicate_rsids()

                self._build = self.detect_build()