How to use the nlpaug.augmenter.char.OcrAug class in nlpaug

To help you get started, we’ve selected a few nlpaug examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github makcedward / nlpaug / test / flow / test_flow.py View on Github external
'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([nac.RandomCharAug(action="insert"),
                               nac.RandomCharAug(action="delete")],
                              pipeline_p=0.9),
                naf.Sequential([
                    nac.RandomCharAug(action="substitute", aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6)
                ], name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([nac.RandomCharAug(action="insert"),
                               nac.RandomCharAug(action="delete")]),
                naf.Sequential([nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                                nac.RandomCharAug(action="substitute", aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6)])
            ], pipeline_p=0.9)
        ]

        # The probability may be low enough that no augmentation is performed, so retry up to 5 times
        for flow in flows:
            for text in texts:
                at_least_one_not_equal = False
                for _ in range(5):
                    augmented_text = flow.augment(text, n=1)

                    if text != augmented_text:
                        at_least_one_not_equal = True
                        break

                self.assertTrue(at_least_one_not_equal)
github makcedward / nlpaug / test / flow / test_sometimes.py View on Github external
def test_multiple_actions(self):
        """Run two ``naf.Sometimes`` flows over sample texts and track output changes.

        For each flow, every sample text is augmented in up to five rounds and
        ``at_least_one_not_equal`` is set once any output differs from its input.

        NOTE(review): the visible excerpt ends right after the flag is set, and
        the flag is never asserted here -- the snippet appears to be truncated.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        # Two flows built from character-level augmenters, with pipeline_p set
        # to 0.8 and 0.6 respectively.
        flows = [
            naf.Sometimes([nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                          pipeline_p=0.8),
            naf.Sometimes(
                [nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                 nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
                 nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                pipeline_p=0.6)
        ]

        # The probability may be low enough that no augmentation is performed,
        # so every flow is tried 5 times.
        for flow in flows:
            # Reset per flow; stays False only if no round changed any text.
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    # Sanity check that the input sample itself is non-empty.
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    if text != augmented_text:
                        at_least_one_not_equal = True
github makcedward / nlpaug / test / flow / test_sequential.py View on Github external
def test_multiple_actions(self):
        """Assert that each ``naf.Sequential`` flow changes every sample text.

        Two flows are built and run over both samples; each augmented result
        must differ from its input. The trailing length assertions check that
        the samples and the flow list are non-empty.
        """
        samples = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        # First flow: RandomCharAug(INSERT) followed by RandomWordAug.
        char_then_word = naf.Sequential([
            nac.RandomCharAug(action=Action.INSERT),
            naw.RandomWordAug(),
        ])
        # Second flow: OcrAug, KeyboardAug, then RandomCharAug(SUBSTITUTE).
        char_only = naf.Sequential([
            nac.OcrAug(),
            nac.KeyboardAug(aug_char_min=1),
            nac.RandomCharAug(
                action=Action.SUBSTITUTE,
                aug_char_min=1,
                aug_char_p=0.6,
                aug_word_p=0.6,
            ),
        ])
        pipelines = [char_then_word, char_only]

        for pipeline in pipelines:
            for sample in samples:
                result = pipeline.augment(sample)

                self.assertNotEqual(sample, result)
                self.assertLess(0, len(sample))

            self.assertLess(0, len(samples))

        self.assertLess(0, len(pipelines))
github makcedward / nlpaug / test / flow / test_flow.py View on Github external
naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(
                        model_type='word2vec',
                        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(
                    model_path='xlnet-base-cased', action="substitute",
                    skip_unknown_word=True, temperature=0.7, device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
                ], pipeline_p=0.999),
                naw.ContextualWordEmbsAug(
                    model_path='xlnet-base-cased', action="substitute",
                    skip_unknown_word=True, temperature=0.7, device='cpu')
            ], pipeline_p=0.9999)
        ]

        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
github makcedward / nlpaug / test / augmenter / char / test_char.py View on Github external
def test_empty(self):
        """Assert that ``''`` and ``None`` come back unchanged from ``augment``.

        Both inputs are passed to an ``OcrAug`` and a ``KeyboardAug`` instance,
        and the result must equal the input in every combination.
        """
        blank_inputs = ['', None]
        augmenters = [nac.OcrAug(), nac.KeyboardAug()]

        for blank in blank_inputs:
            for augmenter in augmenters:
                result = augmenter.augment(blank)
                self.assertEqual(blank, result)
github makcedward / nlpaug / test / flow / test_flow.py View on Github external
def test_n_output_without_augmentation(self):
        """Request ``n=3`` outputs from two flows and track whether any differs from the input.

        Each flow is run five times on the sample text; ``all_not_equal`` is set
        as soon as one returned text is not equal to the input.

        NOTE(review): despite its name, ``all_not_equal`` becomes True when *any*
        output differs. It is never asserted in the visible excerpt -- the
        snippet appears to be truncated.
        """
        texts = [
            'AAAAAAAAAAA AAAAAAAAAAAAAA'
        ]
        # presumably both flows are expected to leave this text untouched
        # (see the test name) -- verify against the full test.
        flows = [
            naf.Sequential([
                nac.OcrAug(),
                nac.OcrAug()
            ]),
            naf.Sometimes([
                nac.RandomCharAug(),
                nac.RandomCharAug()
            ], pipeline_p=0.00001)
        ]

        for flow in flows:
            for text in texts:
                for _ in range(5):
                    augmented_texts = flow.augment(text, n=3)
                    # Reset for every augment call.
                    all_not_equal = False
                    for augmented_text in augmented_texts:
                        if augmented_text != text:
                            all_not_equal = True
                            break
github makcedward / nlpaug / test / augmenter / char / test_ocr.py View on Github external
def test_ocr_single_word_nonexist_char(self):
        """Assert that ``OcrAug`` returns 'AAAAA' and 'KKKKK' unchanged.

        Going by the test name, these words contain only characters for which
        the augmenter has no substitution -- confirm against the OCR mapping.
        """
        words = ['AAAAA', 'KKKKK']
        ocr = OcrAug()

        for word in words:
            result = ocr.augment(word)
            self.assertEqual(word, result)

        # Check that the sample list is non-empty.
        self.assertTrue(len(words) > 0)
github makcedward / nlpaug / test / flow / test_flow.py View on Github external
def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3
        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(
                        model_type='word2vec',
                        model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin')
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(
                    model_path='xlnet-base-cased', action="substitute",
                    skip_unknown_word=True, temperature=0.7, device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),