How to use langdetect - 10 common examples

To help you get started, we’ve selected a few langdetect examples, based on popular ways it is used in public projects.

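All of the snippets below lean on the same small API surface, so here is a minimal sketch of it first (the sample strings are illustrative only): detect() returns a single ISO 639-1 code such as 'en', detect_langs() returns candidate languages with probabilities, both raise LangDetectException when the input contains too little usable text, and seeding DetectorFactory makes results deterministic between runs.

from langdetect import DetectorFactory, LangDetectException, detect, detect_langs

DetectorFactory.seed = 0  # optional: makes repeated runs return the same result

try:
    print(detect("This is clearly an English sentence."))      # e.g. 'en'
    print(detect_langs("Das ist ein kurzer deutscher Satz."))  # e.g. [de:0.99...]
except LangDetectException:
    print("not enough recognizable text to detect a language")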

github miku / siskin / siskin / sources / lissa.py
            dois = [v.replace("http://dx.doi.org/", "") for v in source["identifiers"] if "doi.org" in v]
            if len(dois) == 0:
                self.logger.warn("document without DOI")
            elif len(dois) == 1:
                doc.update({"doi": dois[0]})
            else:
                # In 08/2019, various DOI seem to work.
                self.logger.warn("document with multiple dois: %s", dois)
                doc.update({"doi": dois[0]})

            if doc.get("language"):
                doc.update({"language": doc.get("language")})
            else:
                if len(doc["abstract"]) > 20:
                    result = langdetect.detect(doc["abstract"])
                    doc["languages"] = [languages.get(alpha2=result).bibliographic]
                    self.logger.debug("detected %s in abstract (%s)", doc["languages"], doc["abstract"][:40])

            # Gather subjects.
            subjects = source.get("subjects", []) + source.get("subject_synonyms", []) + source.get("tags", [])
            unique_subjects = set(itertools.chain(*[v.split("|") for v in subjects]))
            doc.update({"x.subjects": list(unique_subjects)})

            # Try date_published, then date_created, then fail.
            for key in ("date_published", "date_created"):
                if key not in source or not source[key]:
                    continue
                doc.update({
                    "x.date": source[key][:19] + "Z",
                    "rft.date": source[key][:10],
                })
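
The snippet above only calls langdetect when the abstract is longer than 20 characters, since very short strings are a frequent cause of misdetection. A minimal sketch of that length-gated pattern, with exception handling added and hypothetical names (get_language, MIN_ABSTRACT_LEN) that are not taken from the project:

import langdetect

MIN_ABSTRACT_LEN = 20  # mirrors the snippet's len(doc["abstract"]) > 20 guard

def get_language(abstract, default=None):
    """Return a two-letter language code, or *default* for short or undetectable text."""
    if not abstract or len(abstract) <= MIN_ABSTRACT_LEN:
        return default
    try:
        return langdetect.detect(abstract)
    except langdetect.lang_detect_exception.LangDetectException:
        return default
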
github kearch / kearch / packages / specialist_crawler_child / webpage.py
        for script in soup(["script", "style"]):
            script.extract()    # rip javascript out

        try:
            self.set_links(soup)
        except ValueError:
            raise WebpageError('Cannot set links')

        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            self.language = langdetect.detect(self.text)
            if not self.language == language:
                raise WebpageError("Language doesn't match.")
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        self.title_words = self.text_to_words(self.title, language=self.language)
        # convert all whitespace runs to a single space
        self.text = ' '.join(
            filter(lambda x: not x == '', re.split(r'\s', self.text)))

        # This version does not handle multibyte characters
        self.summary = self.text[:500]
        self.words = self.text_to_words(self.text, language=self.language)
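
The crawler above funnels every failure mode, including undetectable and mismatched languages, into its own WebpageError so callers handle a single exception type. A condensed sketch of that pattern (require_language is a hypothetical helper, and WebpageError is redefined here only to keep the example self-contained):

import langdetect

class WebpageError(Exception):
    """Single error type raised for pages the crawler cannot use."""

def require_language(text, expected):
    """Return the detected language code, or raise WebpageError."""
    try:
        detected = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        raise WebpageError('Cannot detect language.')
    if detected != expected:
        raise WebpageError("Language doesn't match.")
    return detected
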
github tahaHichri / ML-fomo / main.py
def sanitize_text(self, text):
		try:
			if detect(text) == 'en':
				allow_in_dict = True
			else:
				allow_in_dict = False
		except:
			allow_in_dict = False

		# remove non-words
		sanitized_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split()) 

		self.stop_words = set(stopwords.words('english'))
		self.stop_words.update(STOPWORDS)
		self.stop_words.update(self.ignored_words)
		
		word_tokens = word_tokenize(sanitized_text) 
  
		#filtered_sentence = [w for w in word_tokens if not w in stop_words and len(w) > 1] 
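
sanitize_text above gates tweets on detect(text) == 'en', but its bare except: also swallows unrelated errors. A slightly tighter sketch that catches only langdetect's own exception (is_english is a hypothetical helper name):

from langdetect import LangDetectException, detect

def is_english(text):
    """True only when langdetect labels *text* as English."""
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # empty or feature-less text: treat it as not English, as the snippet does
        return False
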
github mulhod / reviewer_experience_prediction / util / read_data_files.py
        try:
            h = float(lines[i].split()[1].strip())
            r = lines[i + 1].split(' ', 1)[1].strip()
        except (ValueError, IndexError) as e:
            i += 2
            continue
        # Skip reviews that don't have any characters
        if not len(r):
            i += 2
            continue
        # Skip reviews if they cannot be recognized as English
        try:
            if not detect(r) == 'en':
                i += 2
                continue
        except LangDetectException:
            i += 2
            continue
        # Now we append the 2-key dict to the end of reviews
        reviews.append(dict(hours=h,
                            review=r))
        i += 2  # Increment i by 2 since we need to go to the next 2-line couplet
    return reviews
github kaustubhhiware / facebook-archive / wordclouds.py
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer
from PIL import Image
from nltk.tokenize import sent_tokenize, word_tokenize
from langdetect import detect
import langdetect as ld

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

PS = PorterStemmer()
MASK_LOC = "images/wordclouds/mymask.png"
LD_EXC = ld.lang_detect_exception.LangDetectException

def wordcloud():
    """
    Analysing users' posts, comments and friends data.
    
    Generate wordclouds of commonly used words from users' posts and comments
    Find out the most used language in posts and comments
    Generate wordcloud of friends' names, most tagged in your posts
    """
    
    loc = input('Enter facebook archive extracted location: ')
    if not os.path.isdir(loc):
        print("The provided location doesn't seem to be right")
        exit(1)
    
    fname = loc+'/comments/comments.json'
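
The wordclouds snippet is cut off before the LD_EXC alias is used, but its docstring says it finds the most used language across posts and comments. A hedged sketch of how such a tally might look (count_languages and its texts argument are illustrative, not taken from the project):

from collections import Counter

from langdetect import detect
import langdetect as ld

LD_EXC = ld.lang_detect_exception.LangDetectException

def count_languages(texts):
    """Return a Counter of detected language codes, skipping undetectable texts."""
    counts = Counter()
    for text in texts:
        try:
            counts[detect(text)] += 1
        except LD_EXC:
            continue
    return counts

# count_languages(posts).most_common(1) then yields the most used language
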
github City-of-Helsinki / linkedevents / events / importer / util.py
    paragraphs = re.split(r'(<p></p><p>|\n|</p>|<p>| – |<br><br><br>)+', text)
    separated = {script: '' for script in scripts}
    # the first language given is the default one
    last_language = scripts[0]
    last_paragraph = ''
    for paragraph in paragraphs:
        if paragraph in (r'</p><p>', r'</p>', r'\n', r'<p>', r'<br><br><br>'):
            # skip paragraph breaks to prevent misdetection
            separated[last_language] += paragraph
            last_paragraph = paragraph
            continue
        # replace any misleading tags left
        paragraph_stripped = re.sub(r'(&lt;(/)?strong&gt;)|(<br>)+|&amp;|<a href=".*&quot;">|</a>', ' ', paragraph)
        try:
            language = detect(paragraph_stripped)
        except LangDetectException:
            # an exception means no language could be detected
            language = last_language
        # langdetect maps "Simplified Chinese" to "zh-cn"
        # However, we store it as "zh_hans"
        if language == "zh-cn":
            language = "zh_hans"
        if language not in scripts:
            # only detect allowed languages, no exceptions
            language = last_language
        if language != last_language:
            # fix html paragraph breaks after language change
            logger.debug('supported language detected: ' + language)
            if last_paragraph in (r'</p><p>', r'</p>', r'<p>'):
                separated[last_language] = re.sub(r'</p><p>$', '', separated[last_language])
                separated[language] += r'</p><p>'
            # remove useless dashes after language change
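
The importer above detects a language per paragraph, normalizes langdetect's 'zh-cn' code to its internal 'zh_hans', and falls back to the previously seen language whenever detection fails or returns something outside the allowed set. That decision logic in isolation (detect_or_fallback is a hypothetical name):

from langdetect import LangDetectException, detect

def detect_or_fallback(paragraph, allowed, last_language):
    """Return an allowed language code for *paragraph*, else *last_language*."""
    try:
        language = detect(paragraph)
    except LangDetectException:
        return last_language
    if language == 'zh-cn':    # langdetect's code for Simplified Chinese
        language = 'zh_hans'   # the importer's internal code
    return language if language in allowed else last_language
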
github stanfordnlp / cocoa / craigslistbargain / scripts / generate_scenarios.py
def is_valid_line(line):
    if 'contact' in line.lower():
        return False
    if not re.search(r'\.|\!|\,', line) and len(line.split()) > 15:
        return False
    if re.search(r'\$\s*\d+', line):
        return False
    try:
        if langdetect.detect(line) != 'en':
            return False
    except langdetect.lang_detect_exception.LangDetectException:
        return True
    return True
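
Note the policy in the except branch: when langdetect cannot classify a line at all, is_valid_line keeps it (returns True), whereas the review-reading snippet earlier on this page drops undetectable text. Which default fits depends on whether losing valid lines or letting noise through is the bigger risk.
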
github tiotdev / steem-curationbot / curationbot.py
def is_eligible(text, n, lng):
    """Returns True if *text* contains at least *n* words in the specified *lng* language"""
    for language in detect_langs(text):
        if language.lang == lng:
            probability = language.prob
            word_count = len(text.split(" "))
            if probability * word_count > n:
                return True
            else:
                break
    return False
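
detect_langs() returns Language objects sorted with the most probable language first, each carrying .lang and .prob attributes, which is exactly what is_eligible iterates over. A small usage sketch (the sample text and the n=5 call are illustrative only):

from langdetect import detect_langs

text = "Dies ist ein kurzer deutscher Beispieltext über nichts Besonderes."
for candidate in detect_langs(text):  # most probable candidate comes first
    print(candidate.lang, round(candidate.prob, 3))

# is_eligible(text, n=5, lng='de') passes once prob * word_count exceeds 5
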
github kearch / kearch / webpage.py
            script.extract()    # rip javascript out

        try:
            self.set_links(soup)
        except ValueError:
            raise WebpageError('Cannot set links')

        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            self.language = langdetect.detect(self.text)
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        self.title_words = self.text_to_words(self.title)
        # convert all whitespace runs to a single space
        self.text = ' '.join(
            filter(lambda x: not x == '', re.split(r'\s', self.text)))

        # This version does not handle multibyte characters
        self.text = self.remove_non_ascii_character(self.text)
        self.summary = self.text[:500]
        self.words = self.text_to_words(self.text)
github kearch / kearch / packages / kearch_classifier / kearch_classifier / webpage.py
        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            print('webpage.py start detecting language ' + url,
                  file=sys.stderr)
            self.language = langdetect.detect(self.text)
            print('webpage.py finish detecting language ' + url,
                  file=sys.stderr)
            if not self.language == language:
                raise WebpageError("Language doesn't match.")
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        print('webpage.py start text_to_words for title ' + url,
              file=sys.stderr)
        self.title_words = self.text_to_words(
            self.title, language=self.language)
        print('webpage.py finish text_to_words for title ' + url,
              file=sys.stderr)
        # convert all whitespace runs to a single space
        # self.text = ' '.join(
        # filter(lambda x: not x == '', re.split('\s', self.text)))

        # This version does not handle multibyte characters
        self.summary = self.text[:500]
        print('webpage.py start text_to_words for text ' + url,
              file=sys.stderr)