How to use the biothings.hub.dataload.uploader module in biothings

To help you get started, we’ve selected a few biothings examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / myvariant.info / src / hub / dataload / sources / dbsnp / dbsnp_upload.py View on Github external
import itertools, glob, os

from .dbsnp_json_parser import load_data_file
import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader


# Source metadata attached to the dbSNP uploaders: upstream project URL
# plus license links (license_url_short is a bit.ly alias for the NCBI
# policies page).
SRC_META = {
        "url" : "https://www.ncbi.nlm.nih.gov/projects/SNP/",
        "license_url" : "https://www.ncbi.nlm.nih.gov/home/about/policies/",
        "license_url_short": "http://bit.ly/2AqoLOc"
        }


class DBSNPBaseUploader(uploader.IgnoreDuplicatedSourceUploader,
                    uploader.ParallelizedSourceUploader,
                    SnpeffPostUpdateUploader):
    """Shared base for dbSNP uploaders: spawns one parallel job per
    per-chromosome JSON dump and indexes 'dbsnp.rsid' after upload.
    Subclasses supply ``__metadata__["assembly"]``.
    """

    def jobs(self):
        # One single-argument job tuple per per-chromosome dump file.
        pattern = os.path.join(self.data_folder, "refsnp-chr*.json.bz2")
        return [(path,) for path in glob.glob(pattern)]

    def load_data(self, input_file):
        self.logger.info("Load data from '%s'", input_file)
        assembly = self.__class__.__metadata__["assembly"]
        return load_data_file(input_file, assembly)

    def post_update_data(self, *args, **kwargs):
        super(DBSNPBaseUploader, self).post_update_data(*args, **kwargs)
        self.logger.info("Indexing 'rsid'")
        # background=true or it'll lock the whole database...
        self.collection.create_index("dbsnp.rsid", background=True)
github biothings / myvariant.info / src / hub / dataload / sources / exac / exac_upload.py View on Github external
def load_data(self, data_folder):
        """Locate the single ExAC VCF file in *data_folder* and parse it.

        Raises uploader.ResourceError when zero or several matching
        files are found.
        """
        matches = glob.glob(os.path.join(data_folder, "ExAC.r*.vcf"))
        if len(matches) != 1:
            raise uploader.ResourceError("Expecting one single vcf file, got: %s" % repr(matches))
        vcf_file = matches.pop()
        self.logger.info("Load data from file '%s'" % vcf_file)
        # inside this method, load_data resolves to the module-level parser
        # function (this method's own name shadows it only as an attribute)
        return load_data(self.__class__.name, vcf_file)
github biothings / myvariant.info / src / hub / dataload / sources / wellderly / wellderly_upload.py View on Github external
import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader

class WellderlyFactoryUploader(uploader.DummySourceUploader,SnpeffPostUpdateUploader):
    """Data originally coming from: http://www.stsiweb.org/wellderly"""

    name = "wellderly"
    __metadata__ = {
        "mapper" : 'observed',
        "assembly" : "hg19",
        "src_meta" : {
            "url" : "https://genomics.scripps.edu/browser/",
            "license_url" : "https://genomics.scripps.edu/browser/page-help.html",
            "license_url_short": "http://bit.ly/2VE6gj7"
        }
    }

    #split_collections = ["wellderly_cg1","wellderly_cg10","wellderly_cg11",
    #                     "wellderly_cg12","wellderly_cg13","wellderly_cg14",
    #                     "wellderly_cg15","wellderly_cg16","wellderly_cg17",
github biothings / mychem.info / src / hub / dataload / sources / umls / umls_upload.py View on Github external
import os.path
from .umls_parser import load_data
import biothings.hub.dataload.uploader as uploader


class UMLSUploader(uploader.BaseSourceUploader):

    name = "umls"

    def load_data(self, data_folder):
        """Delegate to the umls_parser module's load_data for *data_folder*."""
        return load_data(data_folder)

    @classmethod
    def get_mapping(klass):
        mapping = {
            "umls": {
                "properties": {
                    "cui": {
                        "type": "keyword",
                        "normalizer" : "keyword_lowercase_normalizer",
                        'copy_to': ['all'],
github biothings / mygene.info / src / hub / dataload / sources / entrez / genomic_pos_upload.py View on Github external
'''
Populates MICROBE gene entries with genomic position data
Currently updates the 120 microbial taxids that are NCBI Reference Sequences

run get_ref_microbe_taxids function to get an updated file for TAXIDS_FILE
when it's necessary.
'''

import os.path
from biothings.utils.common import (dump, loadobj, get_timestamp)
from biothings.utils.dataload import tab2list
import biothings.hub.dataload.uploader as uploader
from biothings.utils.hub_db import get_src_dump

class EntrezGenomicPosUploader(uploader.MergerSourceUploader):

    name = "entrez_genomic_pos"
    main_source = "entrez"

    def load_data(self, data_folder):
        """
        Loads gene data from NCBI's refseq2gene.gz file.
        Parses it based on genomic position data and refseq status provided by the
        list of taxids from get_ref_microbe_taxids() as lookup table
        :return:
        """

        refsrc = get_src_dump().find_one({"_id":"ref_microbe_taxids"})
        assert refsrc, "ref_microbe_taxids dump not found"
        taxids_file = os.path.join(refsrc["download"]["data_folder"], "ref_microbe_taxids.pyobj")
        datafile = os.path.join(data_folder, 'gene2refseq.gz')
github biothings / mygene.info / src / hub / dataload / sources / cpdb / upload.py View on Github external
from .parser import load_cpdb
import biothings.hub.dataload.uploader as uploader


class CPDBUploader(uploader.BaseSourceUploader):

    name = "cpdb"
    PATHWAYS = ['biocarta','humancyc','kegg','mousecyc',
                'netpath','pharmgkb','pid',
                'smpdb','wikipathways','yeastcyc']

    def load_data(self, data_folder):
        """Parse CPDB files in *data_folder* for every pathway source in PATHWAYS."""
        return load_cpdb(data_folder, type(self).PATHWAYS)

    @classmethod
    def get_mapping(klass):
        mapping = {
            "pathway": {
                "dynamic": False,
                "properties": {
                }
github biothings / myvariant.info / src / hub / dataload / sources / grasp / grasp_upload.py View on Github external
def load_data(self,data_folder):
        """Validate the GRASP download and parse the pre-sorted data file.

        Expects exactly one .zip archive in *data_folder*, itself containing
        exactly one file; raises uploader.ResourceError otherwise.  The
        actual parse then reads the file named 'sorted' in *data_folder*
        (the archive member name is deliberately ignored — see the inline
        comment left by the original author).
        """
        # there's one zip there, let's get the zipped filename
        zgrasp = glob.glob(os.path.join(data_folder,"*.zip"))
        if len(zgrasp) != 1:
            raise uploader.ResourceError("Expecting one zip only, got: %s" % repr(zgrasp))
        zgrasp = zgrasp.pop()
        # fix: close the archive deterministically instead of leaking the
        # open file handle (the original never closed the ZipFile)
        with zipfile.ZipFile(zgrasp) as zf:
            content = [e.filename for e in zf.filelist]
        if len(content) != 1:
            raise uploader.ResourceError("Expecting only one file in the archive, got: %s" % content)
        input_file = content.pop()
        input_file = os.path.join(data_folder,"sorted")#input_file)
        self.logger.info("Load data from file '%s'" % input_file)
        res = load_data(input_file)
        return res
github biothings / mygene.info / src / hub / dataload / sources / refseq / genesummary_upload.py View on Github external
from .parser import GeneSummaryParser
import biothings.hub.dataload.uploader as uploader

class EntrezGeneSummaryUploader(uploader.MergerSourceUploader):

    name = "entrez_genesummary"
    main_source = "refseq"

    def load_data(self, data_folder):
        """Build and return the gene-summary mapping via GeneSummaryParser."""
        parser = GeneSummaryParser(data_folder)
        return parser.load()

    @classmethod
    def get_mapping(klass):
        mapping = {
            "summary": {
                "type": "text",
                "boost": 0.5,      # downgrade summary field.
                'copy_to': ['all'],
            },
github biothings / myvariant.info / src / hub / dataload / sources / dbsnp / dbsnp_upload.py View on Github external
import itertools, glob, os

from .dbsnp_json_parser import load_data_file
import biothings.hub.dataload.uploader as uploader
from hub.dataload.uploader import SnpeffPostUpdateUploader


# Source metadata attached to the dbSNP uploaders: upstream project URL
# plus license links (license_url_short is a bit.ly alias for the NCBI
# policies page).
SRC_META = {
        "url" : "https://www.ncbi.nlm.nih.gov/projects/SNP/",
        "license_url" : "https://www.ncbi.nlm.nih.gov/home/about/policies/",
        "license_url_short": "http://bit.ly/2AqoLOc"
        }


class DBSNPBaseUploader(uploader.IgnoreDuplicatedSourceUploader,
                    uploader.ParallelizedSourceUploader,
                    SnpeffPostUpdateUploader):
    """Shared base for dbSNP uploaders: spawns one parallel job per
    per-chromosome JSON dump and indexes 'dbsnp.rsid' after upload.
    Subclasses supply ``__metadata__["assembly"]``.
    """

    def jobs(self):
        # One single-argument job tuple per per-chromosome dump file.
        pattern = os.path.join(self.data_folder, "refsnp-chr*.json.bz2")
        return [(path,) for path in glob.glob(pattern)]

    def load_data(self, input_file):
        self.logger.info("Load data from '%s'", input_file)
        assembly = self.__class__.__metadata__["assembly"]
        return load_data_file(input_file, assembly)

    def post_update_data(self, *args, **kwargs):
        super(DBSNPBaseUploader, self).post_update_data(*args, **kwargs)
        self.logger.info("Indexing 'rsid'")
        # background=true or it'll lock the whole database...
        self.collection.create_index("dbsnp.rsid", background=True)