Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_text_1(self):
    # Passing a unicode literal through Text() must give a value that
    # is_unicode() reports as true.
    converted = Text(u"đi học")
    self.assertTrue(is_unicode(converted))
def tokenize(text):
"""
tokenize text for word segmentation
:param text: raw text input
:return: tokenize text
"""
text = Text(text)
specials = ["==>", "->", "\.\.\.", ">>"]
digit = "\d+([\.,_]\d+)+"
email = "(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
web = "^(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
datetime = [
"\d{1,2}\/\d{1,2}(\/\d+)?",
"\d{1,2}-\d{1,2}(-\d+)?",
]
word = "\w+"
non_word = "[^\w\s]"
abbreviations = [
"[A-ZĐ]+\.",
"Tp\.",
"Mr\.", "Mrs\.", "Ms\.",
"Dr\.", "ThS\."
]