dois = [v.replace("http://dx.doi.org/", "") for v in source["identifiers"] if "doi.org" in v]
if len(dois) == 0:
self.logger.warn("document without DOI")
elif len(dois) == 1:
doc.update({"doi": dois[0]})
else:
# As of 08/2019, any of the multiple DOIs seems to resolve; keep the first one.
self.logger.warning("document with multiple dois: %s", dois)
doc.update({"doi": dois[0]})
if doc.get("language"):
doc.update({"language": doc.get("language")})
else:
if len(doc["abstract"]) > 20:
result = langdetect.detect(doc["abstract"])
doc["languages"] = [languages.get(alpha2=result).bibliographic]
self.logger.debug("detected %s in abstract (%s)", doc["languages"], doc["abstract"][:40])
# Gather subjects.
subjects = source.get("subjects", []) + source.get("subject_synonyms", []) + source.get("tags", [])
unique_subjects = set(itertools.chain(*[v.split("|") for v in subjects]))
doc.update({"x.subjects": list(unique_subjects)})
# Try date_published, then date_created, then fail.
for key in ("date_published", "date_created"):
if key not in source or not source[key]:
continue
doc.update({
    "x.date": source[key][:19] + "Z",
    "rft.date": source[key][:10],
})
break  # first available date wins
for script in soup(["script", "style"]):
script.extract()  # strip <script> and <style> elements from the soup
try:
self.set_links(soup)
except ValueError:
raise WebpageError('Cannot set links')
try:
self.title = str(soup.title.string)
self.text = str(soup.body.text)
except AttributeError:
raise WebpageError('Cannot get title or text')
try:
self.language = langdetect.detect(self.text)
if self.language != language:
raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title, language=self.language)
# Collapse all whitespace runs into single spaces.
self.text = ' '.join(self.text.split())
# This version does not handle multibyte characters.
self.summary = self.text[:500]
self.words = self.text_to_words(self.text, language=self.language)
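# A tiny self-contained illustration of the scrape-then-detect flow above, using a
# made-up HTML string (requires beautifulsoup4 and langdetect):
from bs4 import BeautifulSoup
import langdetect

html = "<html><head><title>Example</title><script>var x = 1;</script></head><body><p>This page is written in English.</p></body></html>"
soup = BeautifulSoup(html, "html.parser")
for script in soup(["script", "style"]):
    script.extract()                      # drop <script>/<style> before reading text
print(soup.title.string)                  # "Example"
print(langdetect.detect(soup.body.text))  # most likely "en"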
def sanitize_text(self, text):
    try:
        # detect() raises LangDetectException when it cannot identify a language
        allow_in_dict = detect(text) == 'en'
    except Exception:
        allow_in_dict = False
# remove non-words: @mentions, stray punctuation, and URLs
sanitized_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split())
self.stop_words = set(stopwords.words('english'))
self.stop_words.update(STOPWORDS)
self.stop_words.update(self.ignored_words)
word_tokens = word_tokenize(sanitized_text)
#filtered_sentence = [w for w in word_tokens if not w in stop_words and len(w) > 1]
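# The snippet stops right after tokenising; a hedged sketch of what the commented-out
# line above presumably intended (note self.stop_words rather than the bare stop_words):
filtered_sentence = [w for w in word_tokens if w not in self.stop_words and len(w) > 1]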
try:
h = float(lines[i].split()[1].strip())
r = lines[i + 1].split(' ', 1)[1].strip()
except (ValueError, IndexError):
i += 2
continue
# Skip reviews that don't have any characters
if not len(r):
i += 2
continue
# Skip reviews if they cannot be recognized as English
try:
if not detect(r) == 'en':
i += 2
continue
except LangDetectException:
i += 2
continue
# Now we append the 2-key dict to the end of reviews
reviews.append(dict(hours=h,
review=r))
i += 2 # Increment i by 2 since we need to go to the next
# 2-line couplet
return reviews
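# For reference, a made-up sample of the 2-line couplets the loop above expects: the
# first line of each pair carries the hours played, the second the review text.
lines = [
    "hours 37.5",
    "review Great game, I sank far too many hours into it.",
    "hours 2.0",
    "review Sehr kurzes Spiel, leider nicht zu empfehlen.",  # non-English, should be skipped
]
# Running the loop over these lines should leave `reviews` as:
# [{'hours': 37.5, 'review': 'Great game, I sank far too many hours into it.'}]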
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer
from PIL import Image
from nltk.tokenize import sent_tokenize, word_tokenize
from langdetect import detect
import langdetect as ld
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
PS = PorterStemmer()
MASK_LOC = "images/wordclouds/mymask.png"
LD_EXC = ld.lang_detect_exception.LangDetectException
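# Note: langdetect is probabilistic and can return different results for short or
# ambiguous text across runs; seeding its DetectorFactory makes detection
# deterministic. A minimal illustration (the sample sentence is made up):
from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0                           # fix the internal RNG for reproducible results
print(detect("Das ist ein kurzer Beispielsatz."))  # expected to print "de"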
def wordcloud():
"""
Analysing users' posts,comments and friends data.
Generate wordclouds of commonly used words from users' posts and comments
Find out the most used language in posts and comments
Generate wordcloud of friends' names, most tagged in your posts
"""
loc = input('Enter facebook archive extracted location: ')
if not os.path.isdir(loc):
print("The provided location doesn't seem to be right")
exit(1)
fname = os.path.join(loc, 'comments', 'comments.json')
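# The imports above provide everything needed to render a masked word cloud; a hedged
# sketch of how those pieces typically fit together (the text, output path, and use of
# numpy are illustrative, not taken from the original script):
import numpy as np

mask = np.array(Image.open(MASK_LOC))             # mask image shapes the cloud
wc = WordCloud(stopwords=STOPWORDS, mask=mask, background_color='white')
wc.generate("example words from posts and comments example words")
wc.recolor(color_func=ImageColorGenerator(mask))  # recolour words from the mask image
wc.to_file('images/wordclouds/example.png')       # hypothetical output path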
paragraphs = re.split(r'(<p></p><p>|\n|</p>|<p>| – |<br><br><br>)+', text)
separated = {script: '' for script in scripts}
# the first language given is the default one
last_language = scripts[0]
last_paragraph = ''
for paragraph in paragraphs:
if paragraph in ('</p><p>', '</p>', '\n', '<p>', '<br><br><br>'):
# skip paragraph breaks to prevent misdetection
separated[last_language] += paragraph
last_paragraph = paragraph
continue
# replace any misleading tags left
paragraph_stripped = re.sub(r'(<(/)?strong>)|(<br>)+|&|<a href=".*">|</a>', ' ', paragraph)
try:
language = detect(paragraph_stripped)
except LangDetectException:
# an exception means no language could be detected
language = last_language
# langdetect maps "Simplified Chinese" to "zh-cn"
# However, we store it as "zh_hans"
if language == "zh-cn":
language = "zh_hans"
if language not in scripts:
# only detect allowed languages, no exceptions
language = last_language
if language != last_language:
# fix html paragraph breaks after language change
logger.debug('supported language detected: ' + language)
if last_paragraph in (r'</p><p>', r'</p>', r'<p>'):
separated[last_language] = re.sub(r'</p><p>$', '', separated[last_language])
separated[language] += r'</p><p>'
# remove useless dashes after language change
def is_valid_line(line):
if 'contact' in line.lower():
return False
if not re.search(r'\.|\!|\,', line) and len(line.split()) > 15:
return False
if re.search(r'\$\s*\d+', line):
return False
try:
if langdetect.detect(line) != 'en':
return False
except langdetect.lang_detect_exception.LangDetectException:
return True
return True
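# Illustrative, made-up inputs for is_valid_line: lines mentioning contact details or
# prices are rejected, plain English sentences pass.
print(is_valid_line("The apartment is close to the station and very quiet."))  # True
print(is_valid_line("Contact us for more information."))                       # False ("contact")
print(is_valid_line("Only $ 250 per night!"))                                  # False (price pattern)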
def is_eligible(text, n, lng):
"""Returns True if *text* contains at least *n* words in the specified *lng* language"""
for language in detect_langs(text):
if language.lang == lng:
probability = language.prob
word_count = len(text.split(" "))
if probability * word_count > n:
return True
else:
break
return False
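# For reference, langdetect.detect_langs returns candidate languages with probabilities
# (e.g. [en:0.99, de:0.01]; exact values vary per run), so probability * word_count
# above estimates how many words are in the target language.
from langdetect import detect_langs

text = "This is a short example sentence written mostly in English."
print(detect_langs(text))          # e.g. [en:0.9999...]
print(is_eligible(text, 5, "en"))  # True when the estimated English word count exceeds 5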
script.extract()  # strip <script> and <style> elements from the soup
try:
self.set_links(soup)
except ValueError:
raise WebpageError('Cannot set links')
try:
self.title = str(soup.title.string)
self.text = str(soup.body.text)
except AttributeError:
raise WebpageError('Cannot get title or text')
try:
self.language = langdetect.detect(self.text)
except langdetect.lang_detect_exception.LangDetectException:
raise WebpageError('Cannot detect language.')
self.title_words = self.text_to_words(self.title)
# Collapse all whitespace runs into single spaces.
self.text = ' '.join(self.text.split())
# This version does not handle multibyte characters.
self.text = self.remove_non_ascii_character(self.text)
self.summary = self.text[:500]
self.words = self.text_to_words(self.text)
try:
self.title = str(soup.title.string)
self.text = str(soup.body.text)
except AttributeError:
raise WebpageError('Cannot get title or text')
try:
print('webpage.py start detecting language ' + url,
file=sys.stderr)
self.language = langdetect.detect(self.text)
print('webpage.py finish detecting language ' + url,
file=sys.stderr)
if self.language != language:
raise WebpageError("Language doesn't match.")
except langdetect.lang_detect_exception.LangDetectException:
raise WebpageError('Cannot detect language.')
print('webpage.py start text_to_words for title ' + url,
file=sys.stderr)
self.title_words = self.text_to_words(
self.title, language=self.language)
print('webpage.py finish text_to_words for title ' + url,
file=sys.stderr)
# Collapse all whitespace runs into single spaces (disabled in this variant).
# self.text = ' '.join(
#     filter(lambda x: not x == '', re.split(r'\s', self.text)))
# This version does not handle multibyte characters.
self.summary = self.text[:500]
print('webpage.py start text_to_words for text ' + url,
file=sys.stderr)