Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# assert gps.transcripts.isin(dfp.index).all()
# Cross-check every transcript: translate its DNA sequence and compare the
# result against the reference protein sequence stored in `dfp`.
# NOTE(review): the nesting below is reconstructed from the statement order
# (the original indentation was lost) - confirm against the upstream file.
transcript_id = 'ENST00000485079'  # rebound by the loop below
div3_error = 0  # transcripts whose DNA length is not a multiple of 3
seq_mismatch_err = 0  # transcripts whose translation differs from `dfp`
err_transcripts = []  # one record per failing transcript
for transcript_id in tqdm(gps.transcripts):
    # make sure all ids can be found in the proteome
    dna_seq = gps.get_seq(transcript_id)
    # dna_seq = dna_seq[:(len(dna_seq) // 3) * 3]
    if len(dna_seq) % 3 != 0:
        div3_error += 1
        print("len(dna_seq) % 3 != 0: {}".format(transcript_id))
        err_transcripts.append({"transcript_id": transcript_id, "div3_err": True})
        continue

    prot_seq = translate(dna_seq)
    # Look the reference up once per transcript instead of once per residue.
    ref_seq = dfp.loc[transcript_id].seq
    if ref_seq != prot_seq:
        seq_mismatch_err += 1
        print("seq.mismatch: {}".format(transcript_id))
        n_mismatch = 0
        # zip() stops at the shorter sequence, so a translation that is longer
        # than the reference no longer raises IndexError. As before, residues
        # beyond the common prefix are not counted in n_mismatch.
        for i, (a, b) in enumerate(zip(ref_seq, prot_seq)):
            if a != b:
                n_mismatch += 1
                print("{} {} {}/{}".format(a, b, i, len(prot_seq)))
        err_transcripts.append({"transcript_id": transcript_id, "div3_err": False,
                                "n-seq-mismatch": n_mismatch})
        # print("prot:", ref_seq)
        # print("seq: ", prot_seq)
err_transcripts = pd.DataFrame(err_transcripts)
# err_cds.to_csv("data/protein/err_cds.csv")
# Each case is (start, end, anchor, expected sequence) on chr1. For every
# case the extracted sequence must be exactly as long as the requested
# interval and equal to the expected string.
extract_cases = (
    (4, 14, 4, 'GAACGTAACG'),
    (2, 5, 3, 'GCG'),
    (24, 34, 27, 'TGATAACGTA'),
    (25, 35, 34, 'TGATAACGTA'),
    (34, 44, 37, 'AACGTAACGT'),
    (34, 44, 100, 'AACGTAACGT'),
)
for case_start, case_end, case_anchor, case_expected in extract_cases:
    interval = Interval('chr1', case_start, case_end)
    seq = variant_seq_extractor.extract(interval, variants, anchor=case_anchor)
    assert len(seq) == interval.end - interval.start
    assert seq == case_expected
interval = Interval('chr1', 5, 11, strand='+')
seq = variant_seq_extractor.extract(
import pytest
from conftest import vcf_file, sample_5kb_fasta_file
from kipoiseq.dataclasses import Variant, Interval
from kipoiseq.extractors.vcf_query import NumberVariantQuery
from kipoiseq.extractors.vcf import MultiSampleVCF
# Alias for the sample FASTA fixture imported from conftest.
fasta_file = sample_5kb_fasta_file

# Query regions on chr1, built from (start, end) pairs.
intervals = [
    Interval('chr1', start, end)
    for start, end in ((3, 10), (4, 30), (19, 30))
]
@pytest.fixture
def multi_sample_vcf():
    """Provide a ``MultiSampleVCF`` constructed from the conftest ``vcf_file``."""
    reader = MultiSampleVCF(vcf_file)
    return reader
def test_MultiSampleVCF__next__(multi_sample_vcf):
    """The first record yielded by ``next()`` is the chr1:4 T>C variant."""
    first = next(multi_sample_vcf)
    expected_fields = (
        ('chrom', 'chr1'),
        ('pos', 4),
        ('ref', 'T'),
        ('alt', 'C'),
    )
    # Checked one field at a time, in the same order as before.
    for field, expected in expected_fields:
        assert getattr(first, field) == expected
def test_single_seq_vcf_seq_extract(single_seq_vcf_seq_extractor):
    """Extracting chr1:2-9 with anchor 3 yields the expected 7-base sequence."""
    region = Interval('chr1', 2, 9)
    extracted = single_seq_vcf_seq_extractor.extract(region, anchor=3)
    assert extracted == 'GCGAACG'
# NOTE(review): the statements below expect an `interval` with strand '-' and
# start/end 10/20, which does not match the Interval('chr1', 2, 9) created in
# test_single_seq_vcf_seq_extract just above. They look like the body of a
# separate Interval test whose `def` line (and the `with` block owning the
# first statement) is not visible here - confirm before relying on this part.
interval.chrom = 'asd'
# Assigning to start, end or strand is expected to raise AttributeError.
with pytest.raises(AttributeError):
interval.start = 10
with pytest.raises(AttributeError):
interval.end = 300
with pytest.raises(AttributeError):
interval.strand = '+'
# The rejected assignment above must leave the strand untouched.
assert interval.strand == '-'
# non-fixed arguments
interval.name = 'asd'
interval.score = 10
# unstrand() is expected to report '.' as the strand.
assert interval.unstrand().strand == '.'
# Round trip through pybedtools must compare equal to the original interval.
# NOTE(review): `pybedtools` is not among the imports visible in this file.
assert interval == Interval.from_pybedtools(interval.to_pybedtools())
assert isinstance(interval.to_pybedtools(), pybedtools.Interval)
# With use_strand=False the asserts expect both coordinates to grow by 10.
i2 = interval.shift(10, use_strand=False)
# original unchanged
assert interval.start == 10
assert interval.end == 20
assert i2.start == 20
assert i2.end == 30
# With the default use_strand the asserts expect this '-' interval to move
# toward lower coordinates instead (10-20 becomes 0-10).
i2 = interval.shift(10) # use_strand = True by default
assert i2.start == 0
assert i2.end == 10
# Presumably a strand-aware shift of 20 gives a negative start, which
# is_valid() rejects - TODO confirm against the Interval implementation.
assert not interval.shift(20, use_strand=True).is_valid()
def test_MultiSampleVCF_fetch_variant(multi_sample_vcf):
    """Check how many variants ``fetch_variants`` yields per interval and sample.

    This test used to be defined twice with identical bodies; the second
    definition silently replaced the first, so a single copy is kept.
    """

    def n_variants(interval, *sample):
        # Exhaust the iterable returned by fetch_variants and count its items.
        return len(list(multi_sample_vcf.fetch_variants(interval, *sample)))

    interval = Interval('chr1', 3, 5)
    assert n_variants(interval) == 2
    assert n_variants(interval, 'NA00003') == 1
    assert n_variants(interval, 'NA00001') == 0

    interval = Interval('chr1', 7, 12)
    assert n_variants(interval) == 0
    assert n_variants(interval, 'NA00003') == 0
def variant_queryable():
# Builds a VariantIntervalQueryable from a MultiSampleVCF on `vcf_file` and a
# list of (variants, interval) tuples: two chr1 variants paired with
# chr1:10-20, and one chr2 variant paired with chr2:110-200.
# NOTE(review): no decorator is visible on this function and
# `VariantIntervalQueryable` is not among the imports visible in this file -
# confirm both against the full module.
vcf = MultiSampleVCF(vcf_file)
return VariantIntervalQueryable(vcf, [
(
[
Variant('chr1', 12, 'A', 'T'),
Variant('chr1', 18, 'A', 'C', filter='q10'),
],
Interval('chr1', 10, 20)
),
(
[
Variant('chr2', 120, 'AT', 'AAAT'),
],
Interval('chr2', 110, 200)
)
# NOTE(review): the list literal and the VariantIntervalQueryable(...) call
# opened above are not closed within the visible lines - confirm that the
# closing brackets were not lost.
def test_variant():
    """Exercise ``Variant``: field values, read-only and writable attributes,
    position coercion, and construction from a cyvcf2 record."""
    v = Variant("chr1", 10, 'C', 'T')

    # Values derived from the constructor arguments (start is 9 for pos 10).
    constructed = (
        ('start', 9),
        ('chrom', 'chr1'),
        ('pos', 10),
        ('ref', 'C'),
        ('alt', 'T'),
    )
    for field, expected in constructed:
        assert getattr(v, field) == expected

    # Values of the fields that were not passed to the constructor.
    assert isinstance(v.info, dict)
    assert len(v.info) == 0
    assert v.qual == 0
    assert v.filter == 'PASS'

    # `info` can be written to in place.
    v.info['test'] = 10
    assert v.info['test'] == 10
    assert isinstance(str(v), str)

    # make sure the original got unchanged
    # NOTE(review): the copy is created but never compared with `v` below.
    v2 = v.copy()

    # Assigning to any of the four core fields must raise AttributeError.
    rejected = (
        ('chrom', 'asd'),
        ('pos', 10),
        ('ref', 'asd'),
        ('alt', 'asd'),
    )
    for field, value in rejected:
        with pytest.raises(AttributeError):
            setattr(v, field, value)

    # non-fixed arguments
    v.id = 'asd'
    v.qual = 10
    v.filter = 'asd'
    v.source = 2

    # A position given as a string ends up as an int.
    string_pos_variant = Variant("chr1", '10', 'C', 'T')
    assert isinstance(string_pos_variant.pos, int)

    # from cyvcf2
    vcf = cyvcf2.VCF('tests/data/test.vcf.gz')
    records = list(vcf)
    cv = records[0]
    v2 = Variant.from_cyvcf(cv)
    assert isinstance(v2.source, cyvcf2.Variant)