How to use the biothings.utils.mongo.get_src_dump function in biothings

To help you get started, we’ve selected a few biothings examples, based on popular ways the get_src_dump function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github biothings / mygene.info / src / dataload / data_dump / dl_uniprot.py View on Github external
def main(no_confirm=True):

    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not ARCHIVE_DATA:
        rmdashfr(DATA_FOLDER)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
github biothings / mygene.info / src / dataload / data_dump / dl_entrez.py View on Github external
no_confirm = True   # set it to True for running this script automatically without intervention.

    if not ARCHIVE_DATA:
        rmdashfr(DATA_FOLDER)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': TIMESTAMP,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)
github biothings / mygene.info / src / dataload / data_dump / dl_refseq.py View on Github external
def check_refseq_release():
    """Exit early when the recorded 'refseq' release is already current.

    Compares the value returned by get_refseq_release() with the 'release'
    field of the 'refseq' record in the src_dump collection. When the stored
    release is at least as new and 'complete.109.rna.gbff.gz' still exists
    in the recorded data folder, an info message is logged and the process
    exits with status 0. In every other case the function returns None.
    """
    latest_release = get_refseq_release()
    record = get_src_dump().find_one({'_id': 'refseq'})

    # No record, no recorded release, or an older recorded release: nothing
    # to short-circuit on.
    already_current = (
        record and 'release' in record and latest_release <= record['release']
    )
    if not already_current:
        return

    # Only abort if the file from the earlier dump is still on disk.
    marker_file = os.path.join(record['data_folder'], 'complete.109.rna.gbff.gz')
    if os.path.exists(marker_file):
        logging.info("No newer release found. Abort now.")
        sys.exit(0)
github biothings / mygene.info / src / dataload / data_dump / dl_exac.py View on Github external
def main(no_confirm=True):
    """Begin a dump run for the 'exac' source, exiting early if nothing is new.

    Only the opening part of this function is visible here; the steps shown
    are a freshness check against the stored src_dump record, preparation of
    DATA_FOLDER, and construction of the log file path.

    Args:
        no_confirm: when true, the "DATA_FOLDER is not empty" prompt is
            skipped and the run continues without asking.

    Raises:
        SystemExit: with status 0 when the stored record is at least as
            recent as the value from check_lastmodified() and its data file
            exists, or when DATA_FOLDER is non-empty and ask() does not
            return 'Y'.
    """

    src_dump = get_src_dump()
    # NOTE(review): presumably the remote file's last-modified value;
    # check_lastmodified() is defined elsewhere -- confirm its return type.
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    # Skip the run only if a previous record exists, carries a
    # 'lastmodified' value, and that value is not older than the current one.
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        # Only the base name of the first configured data file is used below.
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        # The record alone is not trusted: the file must still be on disk.
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # Continue without asking when no_confirm is set or the folder is
        # empty; otherwise continue only if ask() returns 'Y'.
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
github biothings / mygene.info / src / dataload / data_dump / dl_ucsc.py View on Github external
def main(no_confirm=True):

    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
github biothings / mygene.info / src / dataload / data_dump / dl_pharmgkb.py View on Github external
def main(no_confirm=True):
    """Begin a dump run for the 'pharmgkb' source, exiting early if nothing is new.

    Only the opening part of this function is visible here; the steps shown
    are a freshness check against the stored src_dump record and preparation
    of DATA_FOLDER.

    Args:
        no_confirm: when true, the "DATA_FOLDER is not empty" prompt is
            skipped and the run continues without asking.

    Raises:
        SystemExit: with status 0 when the stored record is at least as
            recent as the value from check_header() and 'genes.zip' exists
            in its data folder, or when DATA_FOLDER is non-empty and ask()
            does not return 'Y'.
    """

    src_dump = get_src_dump()
    # NOTE(review): presumably a last-modified value read from a response
    # header; check_header() is defined elsewhere -- confirm.
    lastmodified = check_header()
    doc = src_dump.find_one({'_id': 'pharmgkb'})
    # Skip the run only if a previous record exists, carries a
    # 'lastmodified' value, and that value is not older than the current one.
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        data_file = os.path.join(doc['data_folder'], 'genes.zip')
        # The record alone is not trusted: the file must still be on disk.
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    # NOTE(review): rmdashfr presumably removes DATA_FOLDER recursively
    # ("rm -rf") when old data is not being archived -- confirm.
    if not ARCHIVE_DATA:
        rmdashfr(DATA_FOLDER)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # Continue without asking when no_confirm is set or the folder is
        # empty; otherwise continue only if ask() returns 'Y'.
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
github biothings / mygene.info / src / dataload / data_dump / dl_ensembl_mart.py View on Github external
def main_cron(no_confirm=True):
    '''Begin a dump run for the 'ensembl' source, exiting early if no newer
       mart version is available.

       Set no_confirm to True to run this script automatically without
       intervention: the "DATA_FOLDER is not empty" prompt is then skipped.

       Only the opening part of this function is visible here; the steps
       shown are the version check against the stored src_dump record and
       preparation of a per-version data folder.

       Exits with status 0 (SystemExit) when the stored release is at least
       as new as the latest mart version and its main gene file exists, or
       when the data folder is non-empty and ask() does not return 'Y'.
    '''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    # Skip the run only if a previous record exists, carries a 'release'
    # value, and that value is not older than the latest mart version.
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        # The record alone is not trusted: the file must still be on disk.
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    # Each mart version gets its own sub-folder under ENSEMBL_FOLDER.
    # DATA_FOLDER is assigned here, so in the visible code it is a name
    # local to this function.
    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # Continue without asking when no_confirm is set or the folder is
        # empty; otherwise continue only if ask() returns 'Y'.
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)
github biothings / mygene.info / src / dataload / data_dump / dl_entrez.py View on Github external
def redo_parse_gbff(path):
    '''call this function manually to re-start the parsing step and set src_dump.
       This is used when main() is broken at parsing step, then parsing need to be re-started
       after the fix.
    '''
    #mark the download starts
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,