# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
def list_creator(path):
    """Download the dataset archive, parse its train/test CSVs into lists,
    cache the result as a pickle at *path*, and return it.

    Relies on module-level ``url`` and ``key`` (dataset-specific) being
    defined by the enclosing module — not visible in this chunk.

    Args:
        path: Filesystem path the pickled ``{'train': rows, 'test': rows}``
            mapping is written to.

    Returns:
        dict mapping split name to a list of CSV rows.
    """
    dataset = {}
    archive_path = gdown.cached_download(url)

    # csv.field_size_limit raises OverflowError on platforms where
    # sys.maxsize exceeds a C long; back the limit off until accepted.
    maxsize = sys.maxsize
    while True:
        try:
            csv.field_size_limit(maxsize)
            break
        except OverflowError:
            maxsize = int(maxsize / 10)
    csv.field_size_limit(maxsize)

    with tarfile.open(archive_path, 'r') as archive:
        for split in ('train', 'test'):
            filename = f'{key}_csv/{split}.csv'
            print(f'Processing {filename}...')
            reader = csv.reader(
                io.TextIOWrapper(archive.extractfile(filename), encoding='utf-8'))
            # Fix: rows were read but never stored; materialize each split
            # so the dataset can be pickled and returned (mirrors
            # easyfile_creator elsewhere in this file).
            dataset[split] = list(reader)

    with io.open(path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def creator(path):
    """Download and extract a WMT-style archive, wrap each split's
    source/target files in ``easyfile.TextFile``, cache the mapping as a
    pickle at *path*, and return it.

    Relies on module-level ``url`` and ``root`` being defined by the
    enclosing module — not visible in this chunk.

    Args:
        path: Filesystem path the pickled split mapping is written to.

    Returns:
        dict mapping split name to an ``(en_file, de_file)`` tuple.
    """
    archive_path = gdown.cached_download(url)
    target_path = os.path.join(root, 'raw')
    with tarfile.open(archive_path, 'r') as archive:
        print(f'Extracting to {target_path}')
        # NOTE(review): extractall on a downloaded archive can write outside
        # target_path (path traversal) — verify the source or filter members.
        archive.extractall(target_path)

    split2filename = {'train': 'train.tok.clean.bpe.32000',
                      'dev': 'newstest2013.tok.bpe.32000',
                      'test': 'newstest2014.tok.bpe.32000'}
    dataset = {}
    for split, filename in split2filename.items():
        # Fix: src/tgt paths were garbled constants identical for every
        # split; derive the language-specific files from the split's base
        # filename instead.
        src_path = f'{filename}.en'
        tgt_path = f'{filename}.de'
        dataset[split] = (
            easyfile.TextFile(os.path.join(target_path, src_path)),
            easyfile.TextFile(os.path.join(target_path, tgt_path))
        )

    # Fix: the dataset was built but dropped; cache and return it,
    # consistent with easyfile_creator elsewhere in this file.
    with io.open(path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def easyfile_creator(path):
    """Download and extract the dataset archive, wrap each split's CSV in an
    ``easyfile.CsvFile``, cache the mapping as a pickle at *path*, and
    return it.

    Relies on module-level ``url``, ``root`` and ``key`` being defined by
    the enclosing module — not visible in this chunk.

    Args:
        path: Filesystem path the pickled split mapping is written to.

    Returns:
        dict mapping split name ('train'/'test') to an easyfile.CsvFile.
    """
    archive_path = gdown.cached_download(url)
    with tarfile.open(archive_path, 'r') as archive:
        print(f'Extracting to {root}...')
        # NOTE(review): extractall on a downloaded archive can write outside
        # root (path traversal) — verify the source or filter members.
        archive.extractall(root)

    # Fix: dataset was initialized twice in the original; one empty dict
    # is enough.
    dataset = {}
    for split in ('train', 'test'):
        filename = f'{key}_csv/{split}.csv'
        dataset[split] = easyfile.CsvFile(os.path.join(root, filename))

    with io.open(path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset