# Normalization pipeline: unescape HTML entities, expand contractions,
# lemmatize (or lowercase), remove special characters and stopwords, and
# optionally keep only alphabetic tokens and/or return tokenized output.
# html_parser, CONTRACTION_MAP and the helper functions used below are
# assumed to be defined in the surrounding module.
def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
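# The helper functions referenced above are not shown on this page. A minimal
# sketch of what they might look like, assuming NLTK's WordNet lemmatizer and
# simple regex-based cleaning (the names and exact behaviour here are
# assumptions, not the original implementations):
import re
import nltk
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def tokenize_text(text):
    # split raw text into word tokens (needs the NLTK 'punkt' tokenizer models)
    return nltk.word_tokenize(text)

def lemmatize_text(text):
    # lemmatize each token and re-join into a single string
    return ' '.join(wnl.lemmatize(token) for token in tokenize_text(text))

def remove_special_characters(text):
    # drop punctuation and other non-alphanumeric characters
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def keep_text_characters(text):
    # keep only tokens made up purely of letters
    return ' '.join(token for token in tokenize_text(text)
                    if re.match(r'^[a-zA-Z]+$', token))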
# Variant of the pipeline that also folds accented characters to ASCII and
# strips HTML tags before the other clean-up steps.
def normalize_corpus(corpus, lemmatize=True,
                     only_text_chars=False,
                     tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        text = normalize_accented_characters(text)
        text = html_parser.unescape(text)
        text = strip_html(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
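# normalize_accented_characters and strip_html are also assumed to be defined
# elsewhere; a plausible sketch using the standard library's unicodedata and
# html.parser modules (names and details here are assumptions):
import unicodedata
from html.parser import HTMLParser

def normalize_accented_characters(text):
    # fold accented characters (e.g. 'é') to their closest ASCII equivalents
    return (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('ascii'))

class _HTMLStripper(HTMLParser):
    # collects only the text content, discarding tags
    def __init__(self):
        super().__init__()
        self.chunks = []
    def handle_data(self, data):
        self.chunks.append(data)

def strip_html(text):
    stripper = _HTMLStripper()
    stripper.feed(text)
    return ''.join(stripper.chunks)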
# Simpler normalization variant: append the tokenized text when tokenize is
# True, otherwise append the cleaned string.
def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
        normalized_corpus.append(text)
    return normalized_corpus
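# Example call, assuming all of the helpers above are available (the sample
# documents below are made up for illustration):
sample_docs = ["The brown fox wasn't that quick and he couldn't win the race",
               "Hey that's a great deal! I just bought a phone for $199"]
norm_docs = normalize_corpus(sample_docs)
norm_tokens = normalize_corpus(sample_docs, tokenize=True)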
# Contraction expansion: replaces forms like "isn't" with "is not" using a
# mapping of contractions to their expansions.
import re

def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
                               if contraction_mapping.get(match) \
                               else contraction_mapping.get(match.lower())
        # preserve the casing of the first character of the original match
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence
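# CONTRACTION_MAP is a large contraction-to-expansion dictionary defined
# elsewhere; a tiny illustrative excerpt of the kind of entries it contains
# (the real map has many more):
CONTRACTION_MAP = {
    "isn't": "is not",
    "can't": "cannot",
    "don't": "do not",
    "you're": "you are",
}
print(expand_contractions("I can't believe you don't like it", CONTRACTION_MAP))
# -> I cannot believe you do not like it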
# cleaned_corpus is the corpus after the earlier cleaning steps
expanded_corpus = [expand_contractions(sentence, CONTRACTION_MAP)
                   for sentence in cleaned_corpus]
print(expanded_corpus)
print()

# case conversion
print(corpus[0].lower())
print(corpus[0].upper())
# removing stopwords
# expects a list of tokens; requires the NLTK stopwords corpus
# (run nltk.download('stopwords') once if it is not already installed)
import nltk

def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens
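# Example: tokenize first, then filter (a plain split() is used here to keep
# the illustration self-contained; the sample sentence is made up):
tokens = "the brown fox is quick and he is jumping over the lazy dog".split()
print(remove_stopwords(tokens))
# -> ['brown', 'fox', 'quick', 'jumping', 'lazy', 'dog']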
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus
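# html_parser is assumed to be set up elsewhere; in Python 3 the same
# entity unescaping can be done with the standard html module, for example:
import html
print(html.unescape("caf&eacute; &amp; more"))  # -> café & more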