Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_insert(self):
    """TfIdfAug with Action.INSERT must return a longer, different sentence.

    For every sample the space-separated word count of the augmented text
    has to exceed that of the input, and the two strings must differ.
    """
    samples = ['The quick brown fox jumps over the lazy dog']
    augmenter = naw.TfIdfAug(
        model_path=os.environ.get("MODEL_DIR"),
        action=Action.INSERT)

    for original in samples:
        self.assertLess(0, len(original))

        augmented = augmenter.augment(original)
        original_word_cnt = len(original.split(' '))
        augmented_word_cnt = len(augmented.split(' '))

        self.assertLess(original_word_cnt, augmented_word_cnt)
        self.assertNotEqual(original, augmented)

    # Guard against the loop above silently running zero times.
    self.assertLess(0, len(samples))
def test_insert(self):
    """BertAug with Action.INSERT must return a longer, different sentence.

    Besides the word-count and inequality checks, the augmented text must
    not contain nml.Bert.SUBWORD_PREFIX.
    """
    samples = ['The quick brown fox jumps over the lazy dog']
    augmenter = naw.BertAug(action=Action.INSERT)

    for original in samples:
        self.assertLess(0, len(original))

        augmented = augmenter.augment(original)
        original_word_cnt = len(original.split(' '))
        augmented_word_cnt = len(augmented.split(' '))

        self.assertLess(original_word_cnt, augmented_word_cnt)
        self.assertNotEqual(original, augmented)
        # The sub-word marker must not appear anywhere in the output text.
        self.assertTrue(nml.Bert.SUBWORD_PREFIX not in augmented)

    # Guard against the loop above silently running zero times.
    self.assertLess(0, len(samples))
# Class-level fixture: loads the project's .env file, then builds one list of
# INSERT-mode and one list of SUBSTITUTE-mode word-embedding augmenters and
# stores them on the test class.
# NOTE(review): no @classmethod decorator is visible in this snippet; unittest
# expects setUpClass to be a classmethod -- confirm it exists in the full file.
def setUpClass(cls):
# Build the absolute path of the '.env' file located three directories above
# this module and hand it to load_dotenv (presumably so that MODEL_DIR is set
# in the environment -- confirm).
env_config_path = os.path.abspath(os.path.join(
os.path.dirname(__file__), '..', '..', '..', '.env'))
load_dotenv(env_config_path)
# One Action.INSERT augmenter per embedding model. Each model file name is
# appended directly to MODEL_DIR with '+', so MODEL_DIR has to end with a path
# separator, and an unset MODEL_DIR (None) raises TypeError on the '+'.
cls.insert_augmenters = [
naw.Word2vecAug(
model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
action=Action.INSERT),
naw.FasttextAug(
model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
action=Action.INSERT),
naw.GloVeAug(
model_path=os.environ.get("MODEL_DIR") + 'glove.6B.50d.txt',
action=Action.INSERT)
]
# The same Word2vec and Fasttext model files, configured with Action.SUBSTITUTE.
cls.substitute_augmenters = [
naw.Word2vecAug(
model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin',
action=Action.SUBSTITUTE),
naw.FasttextAug(
model_path=os.environ.get("MODEL_DIR") + 'wiki-news-300d-1M.vec',
action=Action.SUBSTITUTE),
naw.GloVeAug(
# NOTE(review): the snippet is cut off here -- the GloVeAug call and the
# enclosing list are never closed in the visible text.
def test_insert(self):
    """FasttextAug with Action.INSERT must add at least one token.

    The input text and the augmented output are both measured in tokens
    produced by the augmenter's own tokenizer, so the comparison is
    like-for-like.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog'
    ]
    aug = naw.FasttextAug(
        model_path=os.environ.get("MODEL_DIR")+'wiki-news-300d-1M.vec',
        action=Action.INSERT)

    for text in texts:
        tokens = aug.tokenizer(text)
        results = aug.augment(text)

        # augment() is fed raw text. When it hands text back, len(results)
        # would be a character count, which is trivially larger than the
        # token count and would make the assertion meaningless. Tokenize the
        # output first; a non-string result is compared as before.
        if isinstance(results, str):
            augmented_tokens = aug.tokenizer(results)
        else:
            augmented_tokens = results

        self.assertLess(len(tokens), len(augmented_tokens))
        self.assertLess(0, len(tokens))

    # Guard against the loop above silently running zero times.
    self.assertLess(0, len(texts))
def test_multiple_actions(self):
    """Each multi-step Sequential pipeline must alter every sample text."""
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584'
    ]

    # Pipeline 1: character insertion followed by a random word augmenter.
    char_then_word = naf.Sequential([
        nac.RandomCharAug(action=Action.INSERT),
        naw.RandomWordAug(),
    ])
    # Pipeline 2: three character-level augmenters chained together.
    char_only = naf.Sequential([
        nac.OcrAug(),
        nac.KeyboardAug(aug_char_min=1),
        nac.RandomCharAug(
            action=Action.SUBSTITUTE, aug_char_min=1,
            aug_char_p=0.6, aug_word_p=0.6),
    ])
    pipelines = [char_then_word, char_only]

    for pipeline in pipelines:
        for original in samples:
            augmented = pipeline.augment(original)

            self.assertNotEqual(original, augmented)
            self.assertLess(0, len(original))

        self.assertLess(0, len(samples))

    # Guard against the outer loop silently running zero times.
    self.assertLess(0, len(pipelines))
def test_single_action(self):
    """A Sometimes pipeline must change at least one sample within 5 tries."""
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584 s@#'
    ]

    # The pipeline only fires with probability pipeline_p, so a single run
    # may leave every text untouched. Retry up to 5 times with a fresh flow.
    changed = False
    for _ in range(0, 5):
        pipeline = naf.Sometimes(
            [nac.RandomCharAug(action=Action.INSERT)],
            pipeline_p=0.6)

        for original in samples:
            augmented = pipeline.augment(original)
            if original != augmented:
                changed = True

            self.assertLess(0, len(original))

        # Stop retrying as soon as one attempt produced a difference.
        if changed:
            break

    self.assertTrue(changed)
    self.assertLess(0, len(samples))
def test_single_action(self):
    """A one-step Sequential pipeline must alter every sample text."""
    samples = [
        'The quick brown fox jumps over the lazy dog',
        'Zology raku123456 fasdasd asd4123414 1234584 s@#'
    ]
    char_inserter = nac.RandomCharAug(action=Action.INSERT, min_char=1)
    pipeline = naf.Sequential([char_inserter])

    for original in samples:
        augmented = pipeline.augment(original)

        self.assertNotEqual(original, augmented)
        self.assertLess(0, len(original))

    # Guard against the loop above silently running zero times.
    self.assertLess(0, len(samples))
def test_insert_multi_words(self):
    """RandomCharAug INSERT must lengthen the text without touching every token.

    Tokens of the input and of the augmented text are compared pairwise;
    the number of differing pairs has to stay below the input token count.
    """
    samples = ['The quick brown fox jumps over the lazy dog']
    augmenter = RandomCharAug(action=Action.INSERT, min_char=1)

    for original in samples:
        augmented = augmenter.augment(original)

        original_tokens = augmenter.tokenizer(original)
        augmented_tokens = augmenter.tokenizer(augmented)

        # Count the positions at which the two token sequences disagree.
        changed_cnt = sum(
            1
            for before, after in zip(original_tokens, augmented_tokens)
            if before != after
        )

        self.assertLess(changed_cnt, len(original_tokens))
        self.assertNotEqual(original, augmented)
        self.assertLess(len(original), len(augmented))

    # Guard against the loop above silently running zero times.
    self.assertTrue(len(samples) > 0)
def test_insert(self):
    """GloVeAug with Action.INSERT must add at least one token.

    The input text and the augmented output are both measured in tokens
    produced by the augmenter's own tokenizer, so the comparison is
    like-for-like.
    """
    texts = [
        'The quick brown fox jumps over the lazy dog'
    ]
    aug = naw.GloVeAug(
        model_path=os.environ.get("MODEL_DIR")+'glove.6B.50d.txt',
        action=Action.INSERT)

    for text in texts:
        tokens = aug.tokenizer(text)
        results = aug.augment(text)

        # augment() is fed raw text. When it hands text back, len(results)
        # would be a character count, which is trivially larger than the
        # token count and would make the assertion meaningless. Tokenize the
        # output first; a non-string result is compared as before.
        if isinstance(results, str):
            augmented_tokens = aug.tokenizer(results)
        else:
            augmented_tokens = results

        self.assertLess(len(tokens), len(augmented_tokens))
        self.assertLess(0, len(tokens))

    # Guard against the loop above silently running zero times.
    self.assertLess(0, len(texts))
exception.output()
# Return empty value per data type
if isinstance(data, str):
return ''
elif isinstance(data, list):
return []
elif isinstance(data, np.ndarray):
return np.array([])
return None
results = []
action_fx = None
clean_data = self.clean(data)
if self.action == Action.INSERT:
action_fx = self.insert
elif self.action == Action.SUBSTITUTE:
action_fx = self.substitute
elif self.action == Action.SWAP:
action_fx = self.swap
elif self.action == Action.DELETE:
action_fx = self.delete
elif self.action == Action.SPLIT:
action_fx = self.split
for _ in range(max_retry_times+1):
augmented_results = []
if num_thread == 1 or self.device == 'cuda':
# TODO: support multiprocessing for GPU
# https://discuss.pytorch.org/t/using-cuda-multiprocessing-with-single-gpu/7300
augmented_results = [action_fx(clean_data) for _ in range(n)]