# NOTE(review): extraction artifact — this line is a scraper banner ("Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.") that was fused into the source; commented out to keep it from being parsed as code.
processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
# Replace multiple rsids separated by commas in index with the first rsid. E.g. rs1,rs2 -> rs1
multi_rsids = {
multi_rsid: multi_rsid.split(",")[0]
for multi_rsid in list(
filter(lambda x: len(x.split(",")) > 1, d["snps"].index)
)
}
d["snps"].rename(index=multi_rsids, inplace=True)
self._snps = d["snps"]
self._source = d["source"]
if __name__ == "__main__":
logger.info("start")
# get filenames from openSNP data dump
filenames = r.get_opensnp_datadump_filenames()
# draw a sample from the observations
random.seed(1)
SAMPLE_SIZE = len(filenames)
# SAMPLE_SIZE = 10
samples = random.sample(range(len(filenames)), SAMPLE_SIZE)
# setup tasks for parallelizing / execution on multiple cores
p = Parallelizer(parallelize=True)
tasks = [{"file": filenames[i]} for i in samples]
# results are a list of lists
rows = p(get_xy_chrom_snp_ratios, tasks)
# remove None results
rows = [row for row in rows if row]
df = pd.DataFrame(
rows,
columns=[
"file",
"source",
"build",
"build_detected",
"x_snps",
resources_dir : str
name / path of resources directory
parallelize : bool
utilize multiprocessing to speedup calculations
processes : int
processes to launch if multiprocessing
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = pd.DataFrame()
self._source = ""
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
self._snps, self._source = self._read_raw_data(file, only_detect_source)
if not self._snps.empty:
self.sort_snps()
self._build = self.detect_build()
if not self._build:
self._build = 37 # assume Build 37 / GRCh37 if not detected
else:
self._build_detected = True
if assign_par_snps:
""" Initialize a ``Lineage`` object.
Parameters
----------
output_dir : str
name / path of output directory
resources_dir : str
name / path of resources directory
parallelize : bool
utilize multiprocessing to speedup calculations
processes : int
processes to launch if multiprocessing
"""
self._output_dir = os.path.abspath(output_dir)
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
self._snps = d["snps"]
self._source = d["source"]
self._phased = d["phased"]
if not self._snps.empty:
self.sort_snps()
if deduplicate:
self._deduplicate_rsids()
self._build = self.detect_build()