How to use the parlai.core.build_data.download function in parlai

To help you get started, we've selected a few parlai examples based on popular ways the function is used in public projects.

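Every example below follows the same basic pattern: check whether the dataset has already been built, fetch an archive with build_data.download(url, path, fname) (which downloads url and saves it as path/fname), unpack it with build_data.untar, and record completion with build_data.mark_done so the next run can skip the download. Here is a minimal sketch of that pattern; the MyDataset directory name and the example.com URL are placeholders, not a real ParlAI task:

import os

import parlai.core.build_data as build_data


def build(opt):
    # Each task keeps its files in its own directory under the data root.
    dpath = os.path.join(opt['datapath'], 'MyDataset')
    version = None

    # built() looks for the marker file written by mark_done().
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove the outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download(url, path, fname) saves the payload of url as path/fname.
        fname = 'mydataset.tar.gz'
        url = 'http://example.com/downloads/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Write the marker file so subsequent runs skip all of the above.
        build_data.mark_done(dpath, version_string=version)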

github deepmipt / kpi2017 / deeppavlov / tasks / insults / build.py
        raw_path = os.path.abspath(opt['raw_dataset_path'] or ".")
        train_file = os.path.join(raw_path, 'train.csv')
        valid_file = os.path.join(raw_path, 'test_with_solutions.csv')
        test_file = os.path.join(raw_path, 'impermium_verification_labels.csv')
        if not os.path.isfile(train_file) or not os.path.isfile(valid_file) or not os.path.isfile(test_file):
            ds_path = os.environ.get('DATASETS_URL')
            file_name = 'insults.tar.gz'
            if not ds_path:
                raise RuntimeError('Please download the dataset files from'
                                   ' https://www.kaggle.com/c/detecting-insults-in-social-commentary/data'
                                   ' and set the path to their directory via the raw-dataset-path parameter')
            print('Trying to download the insults dataset from the repository')
            url = urllib.parse.urljoin(ds_path, file_name)
            print(repr(url))
            build_data.download(url, dpath, file_name)
            build_data.untar(dpath, file_name)
            opt['raw_dataset_path'] = dpath
            print('Downloaded the insults dataset')

            raw_path = os.path.abspath(opt['raw_dataset_path'])
            train_file = os.path.join(raw_path, 'train.csv')
            valid_file = os.path.join(raw_path, 'test_with_solutions.csv')
            test_file = os.path.join(raw_path, 'impermium_verification_labels.csv')

        train_data = pd.read_csv(train_file)
        train_data = train_data.drop('Date', axis=1)

        test_data = pd.read_csv(test_file)
        test_data = test_data.drop('id', axis=1)
        test_data = test_data.drop('Usage', axis=1)
        test_data = test_data.drop('Date', axis=1)
github facebookresearch / ParlAI / parlai / tasks / wikiqa / build.py
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikiqa.tar.gz'
        url = 'http://parl.ai/downloads/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        create_fb_format(dpath, 'train', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test', os.path.join(dpext, 'WikiQA-test.tsv'))
        create_fb_format(
            dpath, 'train-filtered', os.path.join(dpext, 'WikiQA-train.tsv')
        )
        create_fb_format(dpath, 'valid-filtered', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test-filtered', os.path.join(dpext, 'WikiQA-test.tsv'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
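ParlAI normally calls a task's build(opt) automatically before loading its data, but it can also be invoked by hand. A hypothetical direct call against the wikiqa task above (the /tmp/parlai_data path is only an example):

from parlai.tasks.wikiqa.build import build

# 'datapath' is the only opt key this build() reads.
build({'datapath': '/tmp/parlai_data'})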
github deepmipt / kpi2017 / parlai_tasks / minist_example / build.py
    version = None

    # check if data had been previously built
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'mnist.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mnist/' + fname # dataset URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
github deepmipt / kpi2017 / deeppavlov / tasks / coreference / build.py
        build_data.remove_dir(dpath)

        # Build the folders tree
        build_data.make_dir(dpath)
        build_data.make_dir(join(dpath, 'scorer'))
        build_data.make_dir(join(dpath, 'train'))
        build_data.make_dir(join(dpath, 'valid'))

        # urls
        dataset_url = 'http://rucoref.maimbava.net/files/rucoref_29.10.2015.zip'
        scorer_url = 'http://conll.cemantix.org/download/reference-coreference-scorers.v8.01.tar.gz'
        
        # download the CoNLL-2012 scorer v8.01
        start = time.time()
        print('[Download the conll-2012 scorer]...')
        build_data.download(scorer_url, join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
        build_data.untar(join(dpath, 'scorer'), 'reference-coreference-scorers.v8.01.tar.gz')
        print('[Scorer was downloaded]...')
        
        # download dataset
        fname = 'rucoref_29.10.2015.zip'
        print('[Download the rucoref dataset]...')
        build_data.make_dir(join(dpath, 'rucoref_29.10.2015'))
        build_data.download(dataset_url, join(dpath, 'rucoref_29.10.2015'), fname)
        # uncompress it
        build_data.untar(join(dpath, 'rucoref_29.10.2015'), 'rucoref_29.10.2015.zip')
        print('End of download: time - {}'.format(time.time()-start))
        
        # Convert the RuCoref files into CoNLL files
        conllpath = join(dpath, 'ru_conll')
        build_data.make_dir(conllpath)
        utils.RuCoref2CoNLL(join(dpath, 'rucoref_29.10.2015'), conllpath, language)
github facebookresearch / ParlAI / parlai / tasks / dialogue_qe / build.py
def build(opt):
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')

        if build_data.built(data_path):
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'data_' + version + '.tar.gz'
        url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname
        build_data.download(url, data_path, fname)
        build_data.untar(data_path, fname)

        os.rename(
            os.path.join(data_path, 'data_train_' + version + '.json'),
            os.path.join(data_path, 'train.json'),
        )
        os.rename(
            os.path.join(data_path, 'data_test_' + version + '.json'),
            os.path.join(data_path, 'test.json'),
        )

        build_data.mark_done(data_path, version_string=version)
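Note the non-None version here: mark_done records the version string in the marker file, and built() only reports True when the recorded version matches, so bumping the version in the source forces the next run to remove the old files and re-download the renamed data_<version>.tar.gz archive.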
github facebookresearch / ParlAI / parlai / mturk / tasks / wizard_of_wikipedia / run.py
def setup_personas_with_wiki_links(opt):
    fname = 'personas_with_wiki_links.txt'
    file_path = '{}/{}'.format(os.getcwd(), fname)
    if not os.path.exists(file_path):
        url = 'http://parl.ai/downloads/wizard_of_wikipedia/' + fname
        build_data.download(url, os.getcwd(), fname)
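This snippet shows that build_data.download is not tied to the datapath/untar/mark_done machinery: it simply fetches one URL into a target directory (here the current working directory), with a plain os.path.exists check standing in for build_data.built.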
github facebookresearch / ParlAI / parlai / tasks / vqa_v1 / build.py
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'Questions_Train_mscoco.zip'
        fname2 = 'Questions_Val_mscoco.zip'
        fname3 = 'Questions_Test_mscoco.zip'

        fname4 = 'Annotations_Val_mscoco.zip'
        fname5 = 'Annotations_Train_mscoco.zip'

        url = 'http://visualqa.org/data/mscoco/vqa/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)
        build_data.download(url + fname4, dpath, fname4)
        build_data.download(url + fname5, dpath, fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
github facebookresearch / ParlAI / parlai / tasks / cornell_movie / build.py
def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cornell_movie_dialogs_corpus.tgz'
        url = 'http://parl.ai/downloads/cornell_movie/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
        create_fb_format(
            os.path.join(dpext, 'movie_lines.txt'),
            os.path.join(dpext, 'movie_conversations.txt'),
            dpath,
        )

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
github facebookresearch / ParlAI / parlai / mturk / tasks / talkthewalk / download.py
def build(opt):
    dpath = os.path.join(opt['datapath'], 'talkthewalk')
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'talkthewalk.tgz'
        url = 'http://parl.ai/downloads/projects/talkthewalk/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)