How to use the biothings.utils.mongo.get_target_db function in biothings

To help you get started, we’ve selected a few biothings examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / myvariant.info / src / bin / hub.py View on Github external
"step" : "rebuild",
                    "description" : ""}
            self.logger.info("Rebuild cache for sources: %s, target: %s" % (srcs,tgt))
            for src in srcs:
                # src can be a full name (eg. clinvar.clinvar_hg38) but id_feeder knows only name (clinvar_hg38)
                if "." in src:
                    src = src.split(".")[1]
                self.logger.info("Rebuilding cache for source '%s'" % src)
                col = mongo.get_src_db()[src]
                pinfo["source"] = src
                job = yield from self.managers["job_manager"].defer_to_thread(pinfo, partial(rebuild,col))
                yield from job
                self.logger.info("Done rebuilding cache for source '%s'" % src)
            if tgt:
                self.logger.info("Rebuilding cache for target '%s'" % tgt)
                col = mongo.get_target_db()[tgt]
                pinfo["source"] = tgt
                job = self.managers["job_manager"].defer_to_thread(pinfo, partial(rebuild,col))
                yield from job
github biothings / mygene.info / src / hub / databuild / indexer.py View on Github external
def test():
    """Build a (MongoDB backend, Elasticsearch backend) pair for manual testing.

    The MongoDB side wraps one hard-coded genedoc collection of the target
    database; the Elasticsearch side wraps a freshly created ``ESIndexer``.

    Returns:
        A ``(sync_src, sync_target)`` tuple of backend objects.
    """
    collection = get_target_db()['genedoc_mygene_allspecies_20130402_uiu7bkyi']
    mongo_backend = backend.GeneDocMongoDBBackend(collection)
    es_backend = backend.GeneDocESBackend(ESIndexer())
    return mongo_backend, es_backend
github biothings / myvariant.info / src / hub / databuild / builder.py View on Github external
def chrom_worker(col_name, ids):
    """Queue ``chrom`` updates for the documents of one target collection.

    Looks up the documents of target collection ``col_name`` whose ``_id`` is
    in ``ids``, asks ``get_chrom`` for each document's chromosome, and queues
    a ``$set`` of the ``chrom`` field on an unordered bulk operation.

    Args:
        col_name: name of the collection inside the target database.
        ids: list of ``_id`` values to process (used in a ``$in`` query).

    NOTE(review): this excerpt ends inside the loop. In the visible part the
    bulk operation is never executed, nothing is returned, and ``root_keys``
    and ``at_least_one`` are never used; the remainder of the function is
    presumably where that happens -- confirm against the full source.
    """
    tgt = mongo.get_target_db()
    col = tgt[col_name]
    cur = col.find({'_id': {'$in': ids}})
    # unordered bulk op: updates are accumulated here rather than sent one by one
    bob = col.initialize_unordered_bulk_op()
    disagreed = []  # _ids for which get_chrom reported "agreed" as False
    missing = []  # _ids for which get_chrom found no chromosome at all
    root_keys = {}
    at_least_one = False
    for doc in cur:
        # get_chrom result is read through its "chrom" and "agreed" keys
        dchrom = get_chrom(doc)
        if dchrom["chrom"] is None:
            missing.append(doc["_id"])
        elif dchrom["agreed"] is False:
            disagreed.append(doc["_id"])
        chrom = dchrom["chrom"]
        # a "disagreed" document still gets updated as long as chrom is truthy
        if chrom:
            bob.find({"_id": doc["_id"]}).update({"$set": {"chrom" : chrom}})
github biothings / mygene.info / src / hub / databuild / builder.py View on Github external
def get_stats(self,sources,job_manager):
        """Compute build statistics, adding MyGene-specific gene counts.

        Starts from the statistics returned by the parent class and stores
        them in ``self.stats``, then adds:

        * ``total_genes``: number of documents in the target collection
          named ``self.target_name``;
        * ``total_entrez_genes``: number of documents that have an
          ``entrezgene`` field.

        Args:
            sources: forwarded unchanged to the parent ``get_stats``.
            job_manager: forwarded unchanged to the parent ``get_stats``.

        NOTE(review): this excerpt ends right after the debug message that
        announces the ensembl count; the ensembl counting itself and the
        return value are not visible here.
        NOTE(review): ``Collection.count()`` and ``Cursor.count()`` are
        deprecated in PyMongo 3.7 and removed in 4.0 -- confirm the PyMongo
        version this runs against.
        """
        self.stats = super(MyGeneDataBuilder,self).get_stats(sources,job_manager)
        # enrich with some specific mygene counts, especially regarding ensembl vs. entrez
        tgt = mongo.get_target_db()[self.target_name]
        self.stats["total_genes"] = tgt.count()
        # entrez genes are digits only (also, don't count entrez_gene collection,
        # because tgt can be a subset, we have to work with the merged collection)
        self.logger.debug("Counting 'total_entrez_genes'")
        # projection limited to _id: only the count is needed, not the documents
        entrez_cnt = tgt.find({"entrezgene":{"$exists":1}},{"_id":1}).count()
        self.stats["total_entrez_genes"] = entrez_cnt
        # ensembl gene counts are taken from:
        # 1. "ensembl" field, but it can be a list => use aggregation.
        #    Note: "ensembl.0" means first element of the list, so it implicitly
        #    selects docs with a list. Finally, filtering with {$type:"array"} doesn't work because
        #    mongo filters this on the innermost field (that's weird, but it is what it is...)
        # 2. when document is root doc coming from ensembl_gene collection without an "ensembl" key ("orphan")
        # Note: we can't create a sparse or conditional index to help querying "ensembl"
        # because data is too long for an index key, and "hashed" mode doesn't work because lists aren't supported
        # Queries are gonna use colscan strategy...
        self.logger.debug("Counting 'total_ensembl_genes'")
github biothings / mygene.info / src / hub / databuild / builder.py View on Github external
def post_merge(self, source_names, batch_size, job_manager):
        """Create the post-merge indexes on the merged target collection.

        Indexes the ``taxid`` and ``entrezgene`` fields, in that order, of the
        target collection named ``self.target_name``, logging each step.

        Args:
            source_names: not used by this method.
            batch_size: not used by this method.
            job_manager: not used by this method.
        """
        collection = mongo.get_target_db()[self.target_name]
        # (field to index, message logged just before indexing it)
        index_plan = (
            ("taxid", "Indexing 'taxid'"),
            ("entrezgene", "Indexing 'entrezgene'"),
        )
        for field, message in index_plan:
            self.logger.info(message)
            # background=True, otherwise index creation locks the whole database
            collection.create_index(field, background=True)
github biothings / mygene.info / src / hub / databuild / indexer.py View on Github external
def diff2src(use_parallel=True, noconfirm=False):
    """Interactively pick two 'genedoc' sources to compare.

    Collects candidate sources as ``(name, document count, kind)`` tuples,
    where ``kind`` is ``'mongodb'`` for target-database collections and
    ``'es'`` for Elasticsearch indices, keeping only names that start with
    ``'genedoc'``. The user then picks two of them through ``_pick_one``.

    Args:
        use_parallel: not used in the visible part of the function.
        noconfirm: not used in the visible part of the function.

    NOTE(review): this excerpt ends right after ``sync_li`` is initialised;
    the actual diff and the return value are not visible here.
    """
    src_li = []

    target_db = get_target_db()
    # MongoDB candidates, sorted by collection name
    src_li.extend([(name, target_db[name].count(), 'mongodb') for name in sorted(target_db.collection_names()) if name.startswith('genedoc')])

    es_idxer = ESIndexer()
    es_idxer.conn.default_indices = []
    for es_idx in es_idxer.conn.indices.get_indices():
        if es_idx.startswith('genedoc'):
            # point the indexer at this index before asking it for a count
            es_idxer.ES_INDEX_NAME = es_idx
            src_li.append((es_idx, es_idxer.count()['count'], 'es'))

    print("Found {} sources:".format(len(src_li)))
    src_1 = _pick_one(src_li, "Pick first source above: ")
    # drop the first pick so it cannot be chosen again as the second source
    src_li.remove(src_1)
    # NOTE(review): a bare `print` is a no-op expression under Python 3 (it only
    # emits a blank line as a Python 2 print statement) -- `print()` is
    # presumably intended; confirm the target Python version.
    print
    src_2 = _pick_one(src_li, "Pick second source above: ")

    sync_li = []
github biothings / mygene.info / src / dataindex / es_sync.py View on Github external
def apply_changes(self, changes, verify=True, noconfirm=False):
        """Apply a set of changes from a MongoDB collection to Elasticsearch.

        Args:
            changes: mapping read through its ``'source'`` key (name of the
                collection in the target database) and its ``'timestamp'``
                key (value stamped on every document as ``_timestamp``).
            verify: when true, ``self.pre_verify_changes(changes)`` is called
                first.
            noconfirm: when true, the interactive confirmation is skipped.

        Returns:
            ``-1`` when the user does not answer ``'Y'`` to the confirmation
            prompt. The return value of the normal path is not visible here.

        NOTE(review): this excerpt ends inside the nested ``_add_docs``
        helper; the code that actually drives the changes is not shown.
        """
        if verify:
            self.pre_verify_changes(changes)

        # proceed only if confirmation is disabled or the user answers 'Y'
        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            # Fetch documents for `ids` from the MongoDB backend, stamp them and
            # insert them into the Elasticsearch backend.
            # presumably iter_n yields `ids` in batches of `step` -- verify.
            i = 0  # running number of documents stamped so far
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                # NOTE(review): assumes mget_from_ids returns a re-iterable
                # sequence; a one-shot generator would be exhausted by this
                # loop before the insert below -- confirm.
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                # progress line: cumulative count and time spent on this batch
                print('\t{}\t{}'.format(i, timesofar(t1)))
github biothings / mygene.info / src / databuild / sync.py View on Github external
def diff_two(col_1, col_2, use_parallel=True):
    """Diff two collections of the target database.

    Args:
        col_1: name of the first collection in the target database.
        col_2: name of the second collection in the target database.
        use_parallel: forwarded as-is to ``diff_collections``.

    Returns:
        Whatever ``diff_collections`` returns for the two wrapped collections.
    """
    db = get_target_db()
    # wrap each collection, first then second, in a MongoDB backend
    first, second = [GeneDocMongoDBBackend(db[name]) for name in (col_1, col_2)]
    return diff_collections(first, second, use_parallel=use_parallel)