Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _setup_gsa_test(resources_dir):
    """Prepare GSA resource fixtures under `resources_dir`.

    Instantiates ``Resources``, overwrites its ``_resources_dir`` with
    `resources_dir`, clears its ``_gsa_resources`` mapping, and then calls
    ``gzip_file`` once per GSA map file with the test source path and a
    ``.gz`` destination path inside `resources_dir`.

    Parameters
    ----------
    resources_dir : str
        directory that receives the ``.gz`` destination paths
    """
    # reset resource if already loaded
    resources = Resources()
    resources._resources_dir = resources_dir
    resources._gsa_resources = {}

    # (source path, destination file name) for each GSA map used by the tests
    gsa_maps = (
        ("tests/resources/gsa_rsid_map.txt", "gsa_rsid_map.txt.gz"),
        ("tests/resources/gsa_chrpos_map.txt", "gsa_chrpos_map.txt.gz"),
    )
    for source_path, gz_name in gsa_maps:
        gzip_file(source_path, os.path.join(resources_dir, gz_name))
def _teardown_gsa_test():
    """Undo the GSA test setup on the ``Resources`` object.

    Sets ``_resources_dir`` back to ``"resources"`` and replaces
    ``_gsa_resources`` with an empty dict.
    """
    resources = Resources()
    resources._resources_dir = "resources"
    resources._gsa_resources = {}
def test_save_snps_vcf(self):
    """Save SNPs read from a VCF back to VCF and run the VCF parsing checks.

    The GRCh37 reference sequences held by ``Resources`` are replaced with
    a single chromosome "1" entry backed by a gzipped copy of
    ``tests/input/generic.fa`` in a temporary directory.
    """
    snps = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as tmpdir:
        fasta_gz = os.path.join(tmpdir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        # save while the temporary reference file still exists
        saved_path = snps.save_snps(vcf=True)
        self.assertEqual(os.path.relpath(saved_path), "output/vcf_GRCh37.vcf")

    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf")
def test_save_snps_vcf_phased(self):
    """Load phased VCF data and save it back out as a VCF.

    The GRCh37 reference sequences held by ``Resources`` are replaced with
    a single chromosome "1" entry backed by a gzipped copy of
    ``tests/input/generic.fa`` in a temporary directory.
    """
    # read phased data
    phased_snps = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as tmpdir:
        fasta_gz = os.path.join(tmpdir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        # save phased data to VCF
        saved_path = phased_snps.save_snps(vcf=True)
        self.assertEqual(os.path.relpath(saved_path), "output/vcf_GRCh37.vcf")

    # read saved VCF
processes : int
processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
# Replace multiple rsids separated by commas in index with the first rsid. E.g. rs1,rs2 -> rs1
multi_rsids = {
multi_rsid: multi_rsid.split(",")[0]
for multi_rsid in list(
filter(lambda x: len(x.split(",")) > 1, d["snps"].index)
)
}
d["snps"].rename(index=multi_rsids, inplace=True)
self._snps = d["snps"]
path to output directory
resources_dir : str
name / path of resources directory
parallelize : bool
utilize multiprocessing to speed up calculations
processes : int
processes to launch if multiprocessing
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = pd.DataFrame()
self._source = ""
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
self._snps, self._source = self._read_raw_data(file, only_detect_source)
if not self._snps.empty:
self.sort_snps()
self._build = self.detect_build()
if not self._build:
self._build = 37 # assume Build 37 / GRCh37 if not detected
else:
self._build_detected = True
processes : int
processes to launch if multiprocessing
rsids : tuple, optional
rsids to extract if loading a VCF file
"""
self._file = file
self._only_detect_source = only_detect_source
self._snps = get_empty_snps_dataframe()
self._duplicate_snps = pd.DataFrame()
self._discrepant_XY_snps = pd.DataFrame()
self._source = ""
self._phased = False
self._build = 0
self._build_detected = False
self._output_dir = output_dir
self._resources = Resources(resources_dir=resources_dir)
self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
if file:
d = self._read_raw_data(file, only_detect_source, rsids)
self._snps = d["snps"]
self._source = d["source"]
self._phased = d["phased"]
if not self._snps.empty:
self.sort_snps()
if deduplicate:
self._deduplicate_rsids()