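# The fragments below are excerpts from ParlAI dataset build scripts
# (build.py modules). They assume roughly the following imports; the bare
# download() call in the image loop suggests it was imported directly from
# parlai.core.build_data (an assumption, not confirmed by the excerpts):
import json
import os

import tqdm

import parlai.core.build_data as build_data
from parlai.core.build_data import download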

# Download and unpack the turing-data archive, then rename the raw splits
# to the standard train/test filenames.
fname = 'data_' + version + '.tar.gz'
url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname
build_data.download(url, data_path, fname)
build_data.untar(data_path, fname)
os.rename(
    os.path.join(data_path, 'data_train_' + version + '.json'),
    os.path.join(data_path, 'train.json'),
)
os.rename(
    os.path.join(data_path, 'data_test_' + version + '.json'),
    os.path.join(data_path, 'test.json'),
)
# Mark the data as built so later runs skip the download.
build_data.mark_done(data_path, version_string=version)

# Preprocess the test split, then write fastText classification and
# embedding inputs plus normalized CSVs.
print('Preprocessing test')
test_data['Comment'] = data_preprocessing(test_data['Comment'])
print('Writing input files for fasttext')
write_input_fasttext_cls(train_data, os.path.join(dpath, 'train'), 'train')
write_input_fasttext_cls(test_data, os.path.join(dpath, 'test'), 'test')
write_input_fasttext_emb(train_data, os.path.join(dpath, 'train'), 'train')
write_input_fasttext_emb(test_data, os.path.join(dpath, 'test'), 'test')
print('Writing normalized input files')
train_data.to_csv(os.path.join(dpath, 'train.csv'), index=False)
test_data.to_csv(os.path.join(dpath, 'test.csv'), index=False)
# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
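
# write_input_fasttext_cls is defined elsewhere in the source file. A minimal
# sketch of such a helper, assuming the data is a pandas DataFrame with a
# 'Comment' text column and a label column, and using fastText's standard
# supervised format of '__label__<label> <text>' per line (the function name,
# file-naming scheme, and 'label' column here are hypothetical):
def write_input_fasttext_cls_sketch(data, path, suffix):
    with open('{}_ft_cls_{}.txt'.format(path, suffix), 'w') as f:
        for text, label in zip(data['Comment'], data['label']):
            f.write('__label__{} {}\n'.format(label, text))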

# First download the main data file.
build_data.download(URL_ROOT + fname_data, dpath, fname_data)
build_data.untar(dpath, fname_data)
# Next download the wordstats files.
fname_wordstats = 'wordstats_v1.tar.gz'
build_data.download(URL_ROOT + fname_wordstats, dpath, fname_wordstats)
build_data.untar(dpath, fname_wordstats)
# Next download the evaluation logs.
fname_evallogs = 'evaluationlogs_v1.tar.gz'
build_data.download(URL_ROOT + fname_evallogs, dpath, fname_evallogs)
build_data.untar(dpath, fname_evallogs)
print('Data has been placed in ' + dpath)
build_data.mark_done(dpath, version)

# Collect the image hashes referenced by each data split.
dts = ['train', 'val', 'test']
if task == 'image_chat':
    dts[1] = 'valid'
hashes = []
for dt in dts:
    with open(os.path.join(dpath, '{}.json'.format(dt))) as f:
        data = json.load(f)
    hashes += [d['image_hash'] for d in data]
os.makedirs(image_path, exist_ok=True)
print('[downloading images to {}]'.format(image_path))
# Images are sharded on the server into two levels of subdirectories keyed
# by the first six characters of each hash.
for p_hash in tqdm.tqdm(hashes, unit='img'):
    image_url = '{}/{}/{}/{}.jpg'.format(
        image_prefix, p_hash[:3], p_hash[3:6], p_hash
    )
    download(image_url, image_path, '{}.jpg'.format(p_hash))
build_data.mark_done(image_path, version)
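
# The serial loop above is simple but slow for large image sets. ParlAI's
# build_data module also provides a download_multiprocess helper for parallel
# fetching; the exact keyword arguments below are from memory, so treat this
# as an unverified sketch rather than the script's actual method:
image_urls = [
    '{}/{}/{}/{}.jpg'.format(image_prefix, h[:3], h[3:6], h) for h in hashes
]
build_data.download_multiprocess(
    image_urls,
    image_path,
    dest_filenames=['{}.jpg'.format(h) for h in hashes],
)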

# Build the Flickr30k image data.
dpath = os.path.join(opt['datapath'], 'Flickr30k')
version = '1.0'
if not build_data.built(dpath, version_string=version):
    print('[building image data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    for downloadable_file in RESOURCES:
        downloadable_file.download_file(dpath)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
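
# RESOURCES is a module-level list defined elsewhere in each build script. A
# minimal sketch of its shape, assuming ParlAI's DownloadableFile helper; the
# URL and checksum below are placeholders, not the real Flickr30k values:
from parlai.core.build_data import DownloadableFile

RESOURCES = [
    DownloadableFile(
        'http://example.com/flickr30k.tar.gz',  # placeholder URL
        'flickr30k.tar.gz',
        '0' * 64,  # placeholder sha256 checksum
        zipped=True,
    )
]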

# Build the dbll data.
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    fname = 'dbll.tgz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)

# version=None means built() only checks that the '.built' marker file
# exists, without comparing version strings.
version = None
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    for downloadable_file in RESOURCES:
        downloadable_file.download_file(dpath)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)

# Build the Ubuntu data.
dpath = os.path.join(opt['datapath'], 'Ubuntu')
version = None
if not build_data.built(dpath, version_string=version):
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data.
    for downloadable_file in RESOURCES:
        downloadable_file.download_file(dpath)
    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)

# Remove any existing files, re-download, and convert each split to ParlAI's
# Facebook dialog format.
build_data.remove_dir(dpath)
build_data.make_dir(dpath)
# Download the data.
for downloadable_file in RESOURCES:
    downloadable_file.download_file(dpath)
create_fb_format(dpath, 'train', os.path.join(dpath, 'train.gz'))
# os.remove(os.path.join(dpath, 'train.gz'))
create_fb_format(dpath, 'valid', os.path.join(dpath, 'valid.gz'))
# os.remove(os.path.join(dpath, 'valid.gz'))
create_fb_format(dpath, 'test', os.path.join(dpath, 'test.gz'))
# os.remove(os.path.join(dpath, 'test.gz'))
# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
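
# create_fb_format is defined elsewhere in the source file. For reference, a
# minimal sketch of the Facebook dialog format ParlAI expects: each line is
# '<turn number> <prompt>\t<response>', with turn numbers restarting at 1 for
# each new episode. The iterator of per-episode (prompt, response) pairs here
# is hypothetical:
def write_fb_format_sketch(episodes, outpath):
    with open(outpath, 'w') as f:
        for episode in episodes:
            for i, (prompt, response) in enumerate(episode, start=1):
                f.write('{} {}\t{}\n'.format(i, prompt, response))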

# Build the self_feeding data.
def build(opt):
    dpath = os.path.join(opt['datapath'], 'self_feeding')
    fname = 'self_feeding_v031.tar.gz'
    version = '3.1'
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        url = 'http://parl.ai/downloads/self_feeding/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        build_data.mark_done(dpath, version)
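
# Example usage, assuming a minimal opt dict (ParlAI normally supplies this
# via its command-line parser):
if __name__ == '__main__':
    build({'datapath': '/tmp/ParlAI/data'})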