# NOTE: stray scanner banner from the original page capture, kept as a comment so the file parses.
def proc_all_mp(ss):
    """Tokenize multiple batches of sentences in parallel across CPUs.

    Runs :meth:`proc_all` on each batch using a process pool sized to
    half the available CPUs.

    :param ss: batches of sentences (one batch per worker)
    :return: flat list of tokenized sentences
    """
    workers = num_cpus() // 2
    with ProcessPoolExecutor(workers) as executor:
        per_batch = executor.map(ThaiTokenizer.proc_all, ss)
        # Flatten the per-batch token lists into a single list.
        return sum(per_batch, [])
def get_texts(df):
    """Extract tokenized texts and labels from a dataframe.

    :param pandas.DataFrame df: dataframe with `label` as first column
        and `text` as second column
    :return: tuple of
        * tok - lists of tokenized texts, each prefixed with the
          beginning-of-sentence tag `xbos`
        * labels - list of labels
    """
    labels = df.iloc[:, 0].values.astype(np.int64)
    # Strip trailing whitespace and prepend the BOS marker to every text.
    stripped = df.iloc[:, 1].astype(str).apply(lambda t: t.rstrip())
    texts = BOS + stripped
    tok = ThaiTokenizer().proc_all_mp(partition_by_cores(texts))
    return tok, list(labels)
def proc_all(ss):
    """Tokenize a sequence of sentences with a single tokenizer instance.

    Runs :meth:`proc_text` on every sentence in `ss`.

    :param ss: iterable of sentences to process
    :return: list of tokenized sentences
    """
    tokenizer = ThaiTokenizer()
    results = []
    for sentence in ss:
        results.append(tokenizer.proc_text(sentence))
    return results
def proc_text(self, text):
    """Preprocess and tokenize text.

    Collapses character repetitions, spaces out slash/hash characters,
    and squeezes double spaces before tokenizing.

    :param str text: text to process
    :return: processed and tokenized text
    """
    s = self.__RE_REP.sub(ThaiTokenizer.replace_rep, text)
    s = self.__RE_SLASH_HASH.sub(r" \1 ", s)
    # BUG FIX: the compiled pattern was being called directly
    # (`self.__RE_DOUBLE_SPACE(" ", s)`), which raises TypeError at
    # runtime — use `.sub` as the other patterns above do.
    s = self.__RE_DOUBLE_SPACE.sub(" ", s)
    return self.tokenize(s)