How to use the biothings.utils.dataload.value_convert function in biothings

To help you get started, we’ve selected a few biothings examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
def _fn(x):
            """Build a partial gene document from one positional record.

            ``x[0]`` is converted with ``int`` and stored under ``taxid``.
            ``x[1]`` and ``x[2]`` are optional text fields stored under
            ``symbol`` and ``name``; a field is left out when, after
            stripping whitespace, it is empty or equals the backslash-N
            placeholder.

            Returns:
                dict: always contains ``taxid``; ``symbol`` and ``name`` are
                present only when a usable value was found.
            """
            # Stripped values that mean "no data" for an optional field.
            missing = ('', '\\N')
            out = {'taxid': int(x[0])}
            symbol = x[1].strip()
            if symbol not in missing:
                out['symbol'] = symbol
            description = x[2].strip()
            if description not in missing:
                # Drop the trailing ' [Source:...' annotation.
                # NOTE(review): assumes SubStr returns the text located
                # between the two given delimiters -- confirm against its
                # definition.
                _name = SubStr(description, '', ' [Source:').strip()
                if _name:
                    out['name'] = _name
            return out

        skip_count = 0
        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        for datadict in tab2dict_iter(datafile, (0, 1, 2, 7, 8), 1, includefn=_not_LRG):
            datadict = value_convert(datadict, _fn)
            for id, doc in datadict.items():
                if id.isdigit():
                    if skip_count < ERR_THRESHOLD:
                        skip_count += 1
                    else:
                        raise ValueError('Too many ensembl ids are entirely numeric')
                    self.logger.warning(
                        "Document Skipped: All-digit id {}".format(id))
                    continue
                doc['_id'] = id
                yield doc
github biothings / mychem.info / src / dataload / contrib / chebi / chebi_parser.py View on Github external
def restructure_dict(dictionary):
    """Wrap one ChEBI record into a document keyed by its ChEBI ID.

    The value of ``dictionary['ChEBI ID']`` becomes ``_id`` and the record,
    after being passed through ``clean_up``, is stored under ``chebi``.
    The document is then run through ``dict_sweep`` with the placeholder
    values listed below, then ``unlist``, then ``value_convert`` with a
    list of skipped keys.

    Raises:
        KeyError: if ``dictionary`` has no ``'ChEBI ID'`` entry.
    """
    # Placeholder values handed to dict_sweep.
    placeholder_values = [None,".", "-", "", "NA", "none", " ", "Not Available", "unknown","null","None"]
    # Keys handed to value_convert as ``skipped_keys``.
    untouched_keys = [
        "beilstein_registry_numbers",
        "pubchem_database_links",
        "pubmed_citation_links",
        "sabio_rk_database_links",
        "gmelin_registry_numbers",
        "molbase_database_links",
    ]

    # Read the id before clean_up runs, in case clean_up alters the record.
    chebi_id = dictionary['ChEBI ID']
    document = {'_id': chebi_id, 'chebi': clean_up(dictionary)}
    swept = dict_sweep(document, vals=placeholder_values)
    return value_convert(unlist(swept), skipped_keys=untouched_keys)
github biothings / mychem.info / src / dataload / contrib / drugbank / drugbank_parser.py View on Github external
elif isinstance(value['atc-code'], dict) or isinstance(value['atc-code'], OrderedDict):                
                restr_atccode_dict(value['atc-code'])
                
       
    d1['atc_codes'] = atccode_list
    d1['targets'] = targets_list
    d1['carriers'] = carriers_list
    d1['enzymes'] = enzymes_list
    d1['transporters'] = transporters_list    
    d1['predicted_properties'] = pred_properties_dict  
    d1['products'] = products_list            
    restr_dict['drugbank'] = d1     
    restr_dict = unlist(restr_dict) 
    restr_dict = dict_sweep(restr_dict,vals=[None,".", "-", "", "NA", "none", " ", "Not Available", "unknown","null","None"])      
    restr_dict = boolean_convert(restr_dict,added_keys=["mddr_like_rule","bioavailability","ghose_filter","rule_of_five"])
    restr_dict = value_convert(restr_dict,skipped_keys=["dpd","chemspider","chebi","pubchem_compound","pubchem_substance","bindingdb"])    
    return restr_dict
github biothings / mychem.info / src / dataload / contrib / chembl / chembl_parser.py View on Github external
_flag=1
            for x,y in iter(dictionary['molecule_structures'].items()):
                if x == 'standard_inchi_key':
                    restr_dict['chembl'].update(dictionary)
                    restr_dict['chembl'].update({'inchi_key':y})
                if x == 'canonical_smiles':
                    restr_dict['chembl']['smiles'] = y
                if x == 'standard_inchi':
                    restr_dict['chembl']['inchi'] = y
            
    if _flag == 0:
        restr_dict['chembl'] = dictionary
    del restr_dict['chembl']['molecule_structures']           
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict, vals=[None,".", "-", "", "NA", "None","none", " ", "Not Available", "unknown","null"])        
    restr_dict = value_convert(restr_dict, skipped_keys=["chebi_par_id","first_approval"])
    restr_dict = boolean_convert(restr_dict, added_keys=["topical","oral","parenteral",
                              "dosed_ingredient","polymer_flag","therapeutic_flag","med_chem_friendly","ro3_pass"])     
    return restr_dict
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
def load_ensembl2prosite(self):
        """Yield the documents built from the protein-profile dump file.

        Iterates over the dicts produced by ``tab2dict_iter`` for
        ``gene_ensembl__prot_profile__dm.txt`` (located in
        ``self.data_folder``), de-duplicates each with ``dict_nodup``,
        nests every value under a ``prosite`` key and yields whatever
        ``map_id`` produces for it with ``self.ensembl2entrez``.
        """
        def _as_prosite(value):
            # Nest the raw value under the field name used in the document.
            return {'prosite': value}

        # Prosite
        prosite_file = os.path.join(
            self.data_folder, 'gene_ensembl__prot_profile__dm.txt')
        for chunk in tab2dict_iter(prosite_file, (1, 4), 0):
            wrapped = value_convert(
                dict_nodup(chunk), _as_prosite, traverse_list=False)
            for doc in map_id(wrapped, self.ensembl2entrez):
                yield doc
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
len(set(ensembl2x) | set(ensembl2entrez)))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(set(ensembl2x) & set(ensembl2entrez)))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(set(ensembl2x) - set(ensembl2entrez)))

        # all genes with matched entrez
        def _fn(eid, taxid=None):
            """Return a shallow copy of the ``ensembl2x`` entry for *eid*.

            An empty dict is returned when *eid* has no entry. ``taxid`` is
            accepted but not used.
            """
            # Need to hand back a copy of the value here; otherwise it will
            # cause issues when multiple entrezgene ids match the same
            # ensembl gene, for example,
            #      ENSMUSG00000027104 --> (11909, 100047997)
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # add those that have no matched entrez geneid, using ensembl id as the key
        for eid in (set(ensembl2x) - set(ensembl2entrez)):
            _g = ensembl2x[eid]
            #_g.update(self.ensembl_main.get(eid, {}))
            data[eid] = _g

        for id in data:
            if isinstance(data[id], dict):
                _doc = dict_nodup(data[id], sort=True)
            else:
                # if one entrez gene matches multiple ensembl genes
                _doc = dict_attrmerge(data[id], removedup=True, sort=True)
            data[id] = _doc

        return data
github biothings / mygene.info / src / hub / dataload / sources / uniprot / parser.py View on Github external
res = {keyname : uniq[0]}
        return res

    def normalize_pdb(value):
        """Return ``normalize(value, "pdb")``; used as the PDB converter."""
        return normalize(value, "pdb")

    def normalize_pir(value):
        """Return ``normalize(value, "pir")``; used as the PIR converter."""
        return normalize(value, "pir")

    # PDB
    gene2pdb = value_convert(gene2pdb, normalize_pdb, traverse_list=False)
    pdb_dumpfile = os.path.join(data_folder, 'gene2pdb.pyobj')
    dump(gene2pdb,pdb_dumpfile)

    # PIR
    gene2pir = value_convert(gene2pir, normalize_pir, traverse_list=False)
    pir_dumpfile = os.path.join(data_folder, 'gene2pir.pyobj')
    dump(gene2pir,pir_dumpfile)
github biothings / mygene.info / src / hub / dataload / sources / uniprot / parser.py View on Github external
def transform(xli2):
        """Turn a list of records into a list of ``_id``-keyed documents.

        The records are de-duplicated with ``list_nondup``, grouped with
        ``list2dict`` (key index 2, ``alwayslist=True``) and each grouped
        value is converted with ``_dict_convert``. Every resulting
        ``(gid, uniprot)`` pair becomes one document holding ``_id`` plus
        the converted fields.

        Returns:
            list: one dict per key; an empty list when there is nothing to
            convert.
        """
        gene2uniprot = list2dict(list_nondup(xli2), 2, alwayslist=True)
        gene2uniprot = value_convert(gene2uniprot, _dict_convert, traverse_list=False)
        # The former `list(gene2uniprot.items())[0]` unpacking was dropped:
        # its result was overwritten by the loop below and it raised
        # IndexError whenever the mapping was empty.
        docs = []
        for gid, uniprot in gene2uniprot.items():
            doc = {"_id": gid}
            doc.update(uniprot)
            docs.append(doc)
        return docs
github biothings / mygene.info / src / hub / dataload / sources / ensembl / parser.py View on Github external
def load_ensembl2pos(self):
        """Yield documents carrying a ``genomic_pos`` field.

        Iterates over the dicts produced by ``tab2dict_iter`` for
        ``gene_ensembl__gene__main.txt`` (located in ``self.data_folder``,
        filtered with ``_not_LRG``). Each value is turned into a position
        dict, nested under ``genomic_pos`` and passed to ``map_id`` together
        with ``self.ensembl2entrez``; the resulting documents are yielded.

        The previous version also loaded the whole file with ``tab2dict``
        and converted it into an ``ensembl2pos`` mapping that was never
        read. That extra full-file pass has been removed.

        Raises:
            ValueError: if a start, end or strand value cannot be converted
                with ``int``.
        """
        def _to_pos(x):
            # Positional record -> named position fields.
            return {'ensemblgene': x[0], 'chr': x[3], 'start': int(x[1]),
                    'end': int(x[2]), 'strand': int(x[4])}

        def _wrap_pos(x):
            # Nest the position and tag the field via '__aslistofdict__'.
            # NOTE(review): presumably a marker consumed downstream to keep
            # genomic_pos as a list of dicts -- confirm.
            return {'genomic_pos': x, '__aslistofdict__': 'genomic_pos'}

        datafile = os.path.join(
            self.data_folder, 'gene_ensembl__gene__main.txt')
        # Twice 1 because first is the dict key, the second because we need gene id within genomic_pos
        for datadict in tab2dict_iter(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG):
            datadict = dict_nodup(datadict)
            datadict = value_convert(datadict, _to_pos)
            datadict = value_convert(datadict, _wrap_pos, traverse_list=False)
            for doc in map_id(datadict, self.ensembl2entrez):
                yield doc