How to use the parlai.core.build_data.untar function in parlai

To help you get started, we’ve selected a few parlai.core.build_data.untar examples, based on popular ways it is used in public projects.

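Before diving into the examples, here is a minimal sketch of the build flow they all share: check built(), download an archive, unpack it with untar (which by default also deletes the archive after extracting), and finally mark_done(). The task name MyTask, the archive name mytask.tar.gz, and the download URL below are placeholders, not real ParlAI endpoints.

import os

import parlai.core.build_data as build_data


def build(opt):
    # Hypothetical task directory and archive; replace with your own.
    dpath = os.path.join(opt['datapath'], 'MyTask')
    version = None

    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # An older version exists, so remove the outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the archive, then unpack it into dpath.
        fname = 'mytask.tar.gz'
        url = 'http://example.com/downloads/' + fname  # placeholder URL
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built so later runs skip all of the above.
        build_data.mark_done(dpath, version_string=version)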

github facebookresearch / ParlAI / parlai / tasks / mctest / build.py
import os

import parlai.core.build_data as build_data


def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'http://parl.ai/downloads/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(
            dpath, 'train160', os.path.join(dpext, 'MCTest', 'mc160.train'), None
        )
        create_fb_format(
            dpath, 'valid160', os.path.join(dpext, 'MCTest', 'mc160.dev'), None
        )
        create_fb_format(
            dpath,
            'test160',
            os.path.join(dpext, 'MCTest', 'mc160.test'),
            os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'),
        )
        create_fb_format(
            dpath, 'train500', os.path.join(dpext, 'MCTest', 'mc500.train'), None
        )
github facebookresearch / ParlAI / parlai / tasks / dstc7 / agents.py
def build(self, opt):
    dpath = os.path.join(opt['datapath'], 'dstc7')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dstc7.tar.gz'
        url = 'http://parl.ai/downloads/dstc7/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
github deepmipt / kpi2017 / deeppavlov / tasks / ner / build.py
        if not build_data.built(dpath, version_string=version):
            print('[target data path: ' + dpath + ']')
            # make a clean directory if needed
            if build_data.built(dpath):
                # an older version exists, so remove these outdated files.
                build_data.remove_dir(dpath)
            build_data.make_dir(dpath)

            ds_path = os.environ.get('DATASETS_URL')
            file_name = 'gareev.tar.gz'
            if not ds_path:
                raise RuntimeError("Looks like the `DATASETS_URL` variable is set incorrectly")
            print('Trying to download a dataset %s from the repository' % file_name)
            url = urllib.parse.urljoin(ds_path, file_name)
            build_data.download(url, dpath, file_name)
            build_data.untar(dpath, file_name)
            print('Downloaded a %s dataset' % file_name)
            # mark the data as built
            build_data.mark_done(dpath, version_string=version)
        opt['raw_dataset_path'] = dpath
    print("Use dataset from path: %s" % repr(opt['raw_dataset_path']))
    create_heap_file(opt['raw_dataset_path'])
github deepmipt / kpi2017 / deeppavlov / agents / coreference / agents.py
    if not isfile(join(dpath, 'vocab', 'char_vocab.russian.txt')):
        print('[Downloading the chars vocabulary]...')
        try:
            vocab_url = os.environ['MODELS_URL'] + 'coreference/vocabs/char_vocab.russian.txt'
            build_data.download(vocab_url, join(dpath, 'vocab'), 'char_vocab.russian.txt')
            print('[End of downloading the chars vocabulary]...')
        except RuntimeWarning:
            raise RuntimeError('To use your own char vocabulary, please put the file '
                               'char_vocab.russian.txt in the folder {0}'.format(join(dpath, 'vocab')))
    
    if opt['name'] == 'pretrained_model' and not isdir(join(dpath, 'logs', 'pretrain_model')):
        print('[Downloading the pretrained model]...')
        try:
            pretrain_url = os.environ['MODELS_URL'] + 'coreference/OpeanAI/pretrain_model.zip'
            build_data.download(pretrain_url, join(dpath, 'logs'), 'pretrain_model.zip')
            build_data.untar(join(dpath, 'logs'), 'pretrain_model.zip')
            print('[End of downloading the pretrained model]...')
        except RuntimeWarning:
            raise RuntimeError('To train your own model, please change the variable --name in '
                               'build.py:train_coreference to anything other than `pretrained_model`')
        
    build_data.make_dir(join(dpath, 'reports', 'response_files'))
    build_data.make_dir(join(dpath, 'reports', 'results'))
    build_data.make_dir(join(dpath, 'reports', 'predictions'))
    return None
github deepmipt / kpi2017 / deeppavlov / tasks / coreference_scorer_model / build.py
        build_data.make_dir(join(dpath, 'report', 'results'))
        build_data.make_dir(join(dpath, 'scorer'))
        build_data.make_dir(join(dpath, 'train'))
        build_data.make_dir(join(dpath, 'test'))
        build_data.make_dir(join(dpath, 'valid'))

        # urls
        dataset_url = 'http://rucoref.maimbava.net/files/rucoref_29.10.2015.zip'
        scorer_url = 'http://conll.cemantix.org/download/reference-coreference-scorers.v8.01.tar.gz'

        # download the CoNLL-2012 scorer v8.01
        start = time.time()

        print('[Downloading the conll-2012 scorer]...')
        build_data.download(scorer_url, join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
        build_data.untar(join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
        print('[Scorer was downloaded]...')

        fname = 'rucoref_29.10.2015.zip'
        if not os.path.isdir(join(dpath, 'rucoref_29.10.2015')):
            print('[Downloading the rucoref dataset]...')
            build_data.make_dir(join(dpath, 'rucoref_29.10.2015'))
            build_data.download(dataset_url, join(dpath, 'rucoref_29.10.2015'), fname)
            # uncompress it
            build_data.untar(join(dpath, 'rucoref_29.10.2015'), 'rucoref_29.10.2015.zip')
            print('End of downloading: took {0:.3f}s'.format(time.time() - start))

        # Convert RuCoref files to CoNLL files
        conllpath = join(dpath, 'ru_conll')
        build_data.make_dir(conllpath)
        coreference_utils.RuCoref2CoNLL(
            join(dpath, 'rucoref_29.10.2015'), conllpath, language)
github deepmipt / kpi2017 / parlai_tasks / minist_example / build.py
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'mnist.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mnist/' + fname # dataset URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
github deepmipt / kpi2017 / deeppavlov / tasks / paraphrases / build.py
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        url = 'http://paraphraser.ru/download/get?file_id='  # datasets URL

        fname = 'paraphraser.zip'
        build_data.download(url+'1', dpath, fname)
        # uncompress it
        build_data.untar(dpath, fname)
        path = os.path.join(dpath, 'paraphrases.xml')
        clean_dataset(path)

        fname = 'paraphraser_gold.zip'
        build_data.download(url+'5', dpath, fname)
        # uncompress it
        build_data.untar(dpath, fname)
        path = os.path.join(dpath, 'paraphrases_gold.xml')
        clean_dataset(path)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
github facebookresearch / ParlAI / parlai / tasks / vqa_v2 / build.py
        fname1 = 'v2_Questions_Train_mscoco.zip'
        fname2 = 'v2_Questions_Val_mscoco.zip'
        fname3 = 'v2_Questions_Test_mscoco.zip'

        fname4 = 'v2_Annotations_Val_mscoco.zip'
        fname5 = 'v2_Annotations_Train_mscoco.zip'

        url = 'https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)

        build_data.download(url + fname4, dpath, fname4)
        build_data.download(url + fname5, dpath, fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
github facebookresearch / ParlAI / projects / controllable_dialogue / tasks / build.py
    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # older version exists, so remove the outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # first download the data files
        fname_data = 'data_v1.tar.gz'
        build_data.download(URL_ROOT + fname_data, dpath, fname_data)
        build_data.untar(dpath, fname_data)

        # next download the wordstats files
        fname_wordstats = 'wordstats_v1.tar.gz'
        build_data.download(URL_ROOT + fname_wordstats, dpath, fname_wordstats)
        build_data.untar(dpath, fname_wordstats)

        # next download the evaluation logs
        fname_evallogs = 'evaluationlogs_v1.tar.gz'
        build_data.download(URL_ROOT + fname_evallogs, dpath, fname_evallogs)
        build_data.untar(dpath, fname_evallogs)

        print("Data has been placed in " + dpath)

        build_data.mark_done(dpath, version)
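All of these build scripts follow the same check, download, untar, mark_done flow. For tasks that ship with ParlAI, the build step runs automatically the first time the task is loaded, so you rarely call untar directly; for example, with a recent ParlAI install (the exact entry point varies by version), displaying a task triggers its build:

python -m parlai.scripts.display_data -t mctest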