Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def cleaning_text(self):
    '''Strip Latin letters from self.text when the text is mostly non-Latin.

    Two counts are taken over self.text:
      * characters in the ASCII range 'A'..'z' are counted as Latin;
      * characters at or above U+0300 whose Unicode block is not
        'Latin Extended Additional' are counted as non-Latin.

    If the non-Latin count is more than twice the Latin count, every
    character in the 'A'..'z' range is removed and self.text is replaced
    in place; otherwise self.text is left untouched. Returns None.

    NOTE(review): the previous docstring also mentioned eliminating URLs
    and e-mail addresses; no such step happens in this method, so it is
    presumably done before this is called - confirm against the caller.
    '''
    latin_count, non_latin_count = 0, 0
    for ch in self.text:
        # The 'A'..'z' range also spans the ASCII punctuation that sits
        # between 'Z' and 'a'; those characters are counted (and later
        # removed) together with the letters.
        if 'A' <= ch <= 'z':
            latin_count += 1
        elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
            non_latin_count += 1

    if latin_count * 2 < non_latin_count:
        # Build the filtered text in one pass with join instead of growing
        # a string character by character.
        self.text = ''.join(ch for ch in self.text if ch < 'A' or 'z' < ch)
def normalize(cls, ch):
block = unicode_block(ch)
if block == UNICODE_BASIC_LATIN:
if ch < 'A' or ('Z' < ch < 'a') or 'z' < ch:
ch = ' '
elif block == UNICODE_LATIN_1_SUPPLEMENT:
if cls.LATIN1_EXCLUDED.find(ch) >= 0:
ch = ' '
elif block == UNICODE_LATIN_EXTENDED_B:
# normalization for Romanian
if ch == six.u('\u0219'): # Small S with comma below => with cedilla
ch = six.u('\u015f')
if ch == six.u('\u021b'): # Small T with comma below => with cedilla
ch = six.u('\u0163')
elif block == UNICODE_GENERAL_PUNCTUATION:
ch = ' '
elif block == UNICODE_ARABIC:
if ch == six.u('\u06cc'):