How to use the biothings.utils.dataload.tab2dict function in biothings

To help you get started, we’ve selected a few biothings examples based on popular ways the tab2dict function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / mygene.info / src / hub / dataload / sources / ucsc / parser.py View on Github external
[int(x) for x in ld[10].split(',') if x]))
        assert len(exons) == int(ld[8]), (len(exons), int(ld[8]))
        ref2exons.setdefault(refseq,[]).append({
            'transcript' : refseq,
            'chr': chr,
            'strand': -1 if ld[3] == '-' else 1,
            'txstart': int(ld[4]),
            'txend': int(ld[5]),
            'cdsstart': int(ld[6]),
            'cdsend': int(ld[7]),
            'position': exons
        })

    gene2exons = {}
    reflink_file = os.path.join(data_folder, '../hgFixed/database/refLink.txt.gz')
    refseq2gene = tab2dict(reflink_file, (2, 6), 0, alwayslist=False)
    for refseq in sorted(ref2exons.keys()):
        geneid = refseq2gene.get(refseq, None)
        if geneid and geneid != '0':
            if geneid not in gene2exons:
                gene2exons[geneid] = {exons_key: ref2exons[refseq]}
            else:
                gene2exons[geneid][exons_key].extend(ref2exons[refseq])

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
github biothings / mygene.info / src / hub / dataload / sources / exac / parser.py View on Github external
def load_broadinstitute_exac_any(one_file,key):
    logging.info("Loading file %s (%s)" % (one_file,key))
    data = tab2dict(one_file, (0,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21), 0)
    exacs = {}
    for transcript in data:
        tupleexac = data[transcript]
        # remove version in key so we can search the dict easily later
        exacs[transcript.split(".")[0]] = {"exac" : 
                {
                    "transcript" : transcript,  # but keep version here
                    "n_exons" : int(tupleexac[0]),
                    "cds_start" : int(tupleexac[1]),
                    "cds_end" : int(tupleexac[2]),
                    "bp" : int(tupleexac[3]),
                    key : {
                        "mu_syn" : float(tupleexac[4]),
                        "mu_mis" : float(tupleexac[5]),
                        "mu_lof" : float(tupleexac[6]),
                        "n_syn" : float(tupleexac[7]),
github biothings / mygene.info / src / hub / dataload / sources / homologene / parser.py View on Github external
def load(self, aslist=False):
        '''
        loading ncbi "homologene.data" file
        adding "homologene" field in gene doc
        '''
        from biothings.utils.hub_db import get_src_dump
        homo_d = tab2dict(self.datafile,(2,1),0,header=0)
        entrez_doc = get_src_dump().find_one({"_id":"entrez"}) or {}
        entrez_dir = entrez_doc.get("download",{}).get("data_folder")
        assert entrez_dir, "Can't find Entrez data directory"
        DATAFILE = os.path.join(entrez_dir, 'gene_history.gz')
        assert os.path.exists(DATAFILE), "gene_history.gz is missing (entrez_dir: %s)" % entrez_dir
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,includefn=lambda ld: ld[1] != '-')
        for id in list(homo_d.keys()):
            homo_d[retired2gene.get(id,id)] = homo_d[id]

        with open(self.datafile) as df:
            homologene_d = {}
            doc_li = []
            print()
            geneid_d = get_geneid_d(entrez_dir, self.species_li,load_cache=False,save_cache=False,only_for=homo_d)

            for line in df:
                ld = line.strip().split('\t')
                hm_id, tax_id, geneid = [int(x) for x in ld[:3]]
                if (self.taxid_set is None or tax_id in self.taxid_set) and geneid in geneid_d:
                    # for selected species only
                    # and also ignore those geneid does not match any
                    # existing gene doc
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
"""gene_ensembl__xref_entrezgene__dm"""
        CUSTOM_MAPPING_FILE = os.path.join(
            self.data_folder, 'gene_ensembl__gene__extra.txt')
        global extra_mapping_lock
        try:
            print("Trying to acquire extra mapping lock")
            extra_mapping_lock.acquire()
            print("Lock acquired")
            if not os.path.exists(CUSTOM_MAPPING_FILE) or os.stat(CUSTOM_MAPPING_FILE).st_size == 0:
                print("Missing extra mapping file, now generating")
                from . import ensembl_ncbi_mapping
                ensembl_ncbi_mapping.main(src_name, confirm=False)
        finally:
            print("Releasing lock")
            extra_mapping_lock.release()
        extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__xref_entrezgene__dm.txt')
        # [(ensembl_gid, entrez_gid),...]
        ensembl2entrez = tab2dict(
            datafile, (1, 2), 0, includefn=_not_LRG, alwayslist=True)
        # replace with our custom mapping
        ##adjusted = {}
        for k in extra:
            # if k in ensembl2entrez:
            ##    adjusted[k] = {"ensembl2entrez":ensembl2entrez[k],"extra":extra[k]}
            ensembl2entrez[k] = extra[k]
        ##import pickle
        # pickle.dump(adjusted,open("/tmp/adjusted","wb"))
        # back to list of tuples
        ensembl2entrez_li = []
        for ensembl_id, entrez_ids in ensembl2entrez.items():
github biothings / mygene.info / src / hub / dataload / sources / reporter / gnf_reporter.py View on Github external
def loaddata(data_folder):
    """Load the GNF1H and GNF1m reporter annotation files under *data_folder*.

    Both files live in the ``gnf`` subdirectory and are parsed with
    identical ``tab2dict`` parameters, so the per-file logic is factored
    into a single local helper instead of being duplicated.

    :param data_folder: base directory containing the ``gnf`` subdirectory.
    :return: dict with two keys, ``'GNF1H'`` and ``'GNF1M'``, each mapping
             column 0 of the corresponding file to column 5.
    """
    def _load_gnf(filename):
        # Key on column 0, value from column 5; skip rows that have no
        # sixth column or an empty one.
        datafile = os.path.join(data_folder, 'gnf', filename)
        return tab2dict(datafile, (0, 5), 1, header=0,
                        includefn=lambda ld: len(ld) > 5 and ld[5] != '')

    return {'GNF1H': _load_gnf('GNF1H.ANNO7.LOAD_20130402.tab'),
            'GNF1M': _load_gnf('gnf1m.NEW_ANNO6.LOAD_20130402.tab')}
github biothings / mygene.info / src / hub / dataload / sources / entrez / parser.py View on Github external
def load(self, aslist=False):
        """Yield formatted gene->unigene documents from ``self.datafile``.

        Generator method: each yielded item is ``self.format(doc)`` for a
        (geneid, unigene) pair read from the data file.
        NOTE(review): `aslist` is unused in the visible portion of this
        method — presumably consumed further down; confirm against the
        full source.
        """
        # Full geneid -> unigene mapping (columns 0 and 1 of the datafile).
        uni_d = tab2dict(self.datafile, (0, 1), 0, alwayslist=0)
        DATAFILE = os.path.join(self.data_folder, 'gene_history.gz')
        # retired geneid -> current geneid; rows with '-' in column 1
        # (no replacement id) are excluded.
        retired2gene = tab2dict(DATAFILE, (1, 2), 1, alwayslist=0,includefn=lambda ld: ld[1] != '-')
        # Re-key entries under their current geneid when the key is a
        # retired id (ids with no retirement record map to themselves).
        # NOTE(review): `id` shadows the builtin; left as-is here.
        for id in list(uni_d.keys()):
            uni_d[retired2gene.get(id,id)] = uni_d[id]
        geneid_d = get_geneid_d(self.data_folder, self.species_li,load_cache=False,save_cache=False,only_for=uni_d)
        # Stream the datafile again, keeping only rows whose geneid is in
        # geneid_d (i.e. known genes for the selected species).
        gene2unigene = tab2dict_iter(self.datafile, (0, 1), 0, alwayslist=0,
                                 includefn=lambda ld: int(ld[0]) in geneid_d)
        cnt = 0  # count of yielded docs — presumably reported after the loop
        for doc in gene2unigene:
            yield self.format(doc)
            cnt += 1
github biothings / mygene.info / src / hub / dataload / sources / pharmgkb / parser.py View on Github external
def load_pharmgkb(data_folder):
    """Parse PharmGKB gene annotations from ``genes.zip``.

    Reads ``genes.tsv`` inside the archive (one header row), keyed on
    column 1 and skipping rows whose column 1 is empty, then wraps every
    value in a ``{'pharmgkb': value}`` dict.

    :param data_folder: directory containing ``genes.zip``.
    :return: dict keyed on column 1 of genes.tsv.
    """
    archive = os.path.join(data_folder, 'genes.zip')
    raw = tab2dict((archive, 'genes.tsv'), (0, 1), 1, header=1,
                   includefn=lambda ld: ld[1] != '')
    return value_convert(raw, lambda value: {'pharmgkb': value},
                         traverse_list=False)
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
try:
            print("Trying to acquire extra mapping lock")
            extra_mapping_lock.acquire()
            print("Lock acquired")
            if not os.path.exists(CUSTOM_MAPPING_FILE) or os.stat(CUSTOM_MAPPING_FILE).st_size == 0:
                print("Missing extra mapping file, now generating")
                from . import ensembl_ncbi_mapping
                ensembl_ncbi_mapping.main(src_name, confirm=False)
        finally:
            print("Releasing lock")
            extra_mapping_lock.release()
        extra = tab2dict(CUSTOM_MAPPING_FILE, (0, 1), 0, alwayslist=True)
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__xref_entrezgene__dm.txt')
        # [(ensembl_gid, entrez_gid),...]
        ensembl2entrez = tab2dict(
            datafile, (1, 2), 0, includefn=_not_LRG, alwayslist=True)
        # replace with our custom mapping
        ##adjusted = {}
        for k in extra:
            # if k in ensembl2entrez:
            ##    adjusted[k] = {"ensembl2entrez":ensembl2entrez[k],"extra":extra[k]}
            ensembl2entrez[k] = extra[k]
        ##import pickle
        # pickle.dump(adjusted,open("/tmp/adjusted","wb"))
        # back to list of tuples
        ensembl2entrez_li = []
        for ensembl_id, entrez_ids in ensembl2entrez.items():
            for entrez_id in entrez_ids:
                ensembl2entrez_li.append((ensembl_id, entrez_id))
        self.ensembl2entrez_li = ensembl2entrez_li
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
def _load_ensembl_2taxid(self):
        """Build the ensembl -> taxid mapping.

        Parses columns 0 and 1 of ``gene_ensembl__translation__main.txt``
        (keyed on column 1, rows filtered through ``_not_LRG``), runs the
        result through ``dict_nodup``, and converts each taxid value from
        string to int.
        """
        src = os.path.join(
            self.data_folder, 'gene_ensembl__translation__main.txt')
        mapping = dict_nodup(tab2dict(src, (0, 1), 1, includefn=_not_LRG))
        # taxids are read as strings; downstream code expects integers
        return value_convert(mapping, int)