How to use the langdetect.lang_detect_exception.LangDetectException exception class in langdetect

To help you get started, we’ve selected a few langdetect examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mulhod / reviewer_experience_prediction / util / read_data_files.py View on Github external
try:
            h = float(lines[i].split()[1].strip())
            r = lines[i + 1].split(' ', 1)[1].strip()
        except (ValueError, IndexError) as e:
            i += 2
            continue
        # Skip reviews that don't have any characters
        if not len(r):
            i += 2
            continue
        # Skip reviews if they cannot be recognized as English
        try:
            if not detect(r) == 'en':
                i += 2
                continue
        except LangDetectException:
            i += 2
            continue
        # Now we append the 2-key dict to the end of reviews
        reviews.append(dict(hours=h,
                            review=r))
        i += 2 # Increment i by 2 since we need to go to the next
            # 2-line couplet
    return reviews
github kaustubhhiware / facebook-archive / wordclouds.py View on Github external
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.stem import PorterStemmer
from PIL import Image
from nltk.tokenize import sent_tokenize, word_tokenize
from langdetect import detect
import langdetect as ld

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

PS = PorterStemmer()
MASK_LOC = "images/wordclouds/mymask.png"
LD_EXC = ld.lang_detect_exception.LangDetectException

def wordcloud():
    """
    Analysing users' posts,comments and friends data.
    
    Generate wordclouds of commonly used words from users' posts and comments
    Find out the most used language in posts and comments
    Generate wordcloud of friends' names, most tagged in your posts
    """
    
    loc = input('Enter facebook archive extracted location: ')
    if not os.path.isdir(loc):
        print("The provided location doesn't seem to be right")
        exit(1)
    
    fname = loc+'/comments/comments.json'
github City-of-Helsinki / linkedevents / events / importer / util.py View on Github external
paragraphs = re.split(r'(<p></p><p>|\n|</p>|<p>| – |<br><br><br>)+', text)
    separated = {script: '' for script in scripts}
    # the first language given is the default one
    last_language = scripts[0]
    last_paragraph = ''
    for paragraph in paragraphs:
        if paragraph in (r'</p><p>', r'</p>' r'\n', r'<p>', r'<br><br><br>'):
            # skip paragraph breaks to prevent misdetection
            separated[last_language] += paragraph
            last_paragraph = paragraph
            continue
        # replace any misleading tags left
        paragraph_stripped = re.sub(r'(&lt;(/)?strong&gt;)|(<br>)+|&amp;|<a href=".*&quot;">|</a>', ' ', paragraph)
        try:
            language = detect(paragraph_stripped)
        except LangDetectException:
            # an exception means no language could be detected
            language = last_language
        # langdetect maps "Simplified Chinese" to "zh-cn"
        # However, we store it as "zh_hans"
        if language == "zh-cn":
            language = "zh_hans"
        if language not in scripts:
            # only detect allowed languages, no exceptions
            language = last_language
        if language != last_language:
            # fix html paragraph breaks after language change
            logger.debug('supported language detected: ' + language)
            if last_paragraph in (r'</p><p>', r'</p>', r'<p>'):
                separated[last_language] = re.sub(r'</p><p>$', '', separated[last_language])
                separated[language] += r'</p><p>'
            # remove useless dashes after language change</p>
github Abhijit-2592 / spacy-langdetect / spacy_langdetect / spacy_langdetect.py View on Github external
def _detect_language(spacy_object):
    try:
        detected_language = detect_langs(spacy_object.text)[0]
        return {"language": str(detected_language.lang), "score": float(detected_language.prob)}
    except LangDetectException:
        return {"language": "UNKNOWN", "score": 0.0}
github Youngboom / clerk / helper / lang.py View on Github external
def find_out_language(candidate_languages, *args):
    candidates = []
    for sample in args:
        candidate = guess_language(sample)
        if candidate != UNKNOWN_LANGUAGE and candidate in candidate_languages:
            candidates.append(candidate)
        try:
            for candidate in detect_langs(sample):
                if candidate.lang in candidate_languages:
                    candidates.append(candidate.lang)
        except LangDetectException:
            continue

    if len(candidates) == 0:
        return None
    leading_candidate = {
        'lang': candidates[0],
        'count': candidates.count(candidates[0])
    }
    for leading_candidate in candidates[1:0]:
        if leading_candidate['count'] &lt; candidates.count(candidate):
            leading_candidate['lang'] = candidate
            leading_candidate['size'] = candidates.count(candidate)
    if leading_candidate['lang'] == UNKNOWN_LANGUAGE:
        return None
    return leading_candidate['lang']