How to use langdetect.lang_detect_exception in langdetect

To help you get started, we’ve selected a few examples of how langdetect.lang_detect_exception is used in popular open-source projects.

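The pattern in all of the snippets below is the same: call langdetect.detect (or detect_langs) inside a try block and catch LangDetectException, which langdetect raises when the input has no detectable features (an empty string, bare digits, punctuation, and so on). A minimal, self-contained sketch of that pattern (the safe_detect helper is an illustrative name, not taken from any project below):

import langdetect
from langdetect.lang_detect_exception import LangDetectException

def safe_detect(text):
    """Return the detected language code, or None if detection fails."""
    try:
        return langdetect.detect(text)
    except LangDetectException:
        # Raised when langdetect finds no usable features in the text,
        # e.g. an empty string or digits/punctuation only.
        return None

print(safe_detect("Hello, world!"))  # usually 'en'
print(safe_detect("12345"))          # None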

github stanfordnlp / cocoa / craigslistbargain / scripts / generate_scenarios.py
def is_valid_line(line):
    if 'contact' in line.lower():
        return False
    if not re.search(r'\.|\!|\,', line) and len(line.split()) > 15:
        return False
    if re.search(r'\$\s*\d+', line):
        return False
    try:
        if langdetect.detect(line) != 'en':
            return False
    except langdetect.lang_detect_exception.LangDetectException:
        return True
    return True

github kearch / kearch / webpage.py
script.extract()    # rip javascript out

        try:
            self.set_links(soup)
        except ValueError:
            raise WebpageError('Cannot set links')

        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            self.language = langdetect.detect(self.text)
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        self.title_words = self.text_to_words(self.title)
        # convert all whitespace to single spaces
        self.text = ' '.join(
            filter(lambda x: not x == '', re.split(r'\s', self.text)))

        # This version does not handle multibyte characters
        self.text = self.remove_non_ascii_character(self.text)
        self.summary = self.text[:500]
        self.words = self.text_to_words(self.text)

github kearch / kearch / packages / kearch_classifier / kearch_classifier / webpage.py
        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            print('webpage.py start detecting language ' + url,
                  file=sys.stderr)
            self.language = langdetect.detect(self.text)
            print('webpage.py finish detecting language ' + url,
                  file=sys.stderr)
            if not self.language == language:
                raise WebpageError("Language doesn't match.")
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        print('webpage.py start text_to_words for title ' + url,
              file=sys.stderr)
        self.title_words = self.text_to_words(
            self.title, language=self.language)
        print('webpage.py finish text_to_words for title ' + url,
              file=sys.stderr)
        # convert all whitespace to single spaces
        # self.text = ' '.join(
        # filter(lambda x: not x == '', re.split('\s', self.text)))

        # This version does not handle multibyte characters
        self.summary = self.text[:500]
        print('webpage.py start text_to_words for text ' + url,
              file=sys.stderr)

github nschaetti / pyInstaBot / find_medias.py
media_id,
                                    media_code
                                ))
                                action_scheduler.add_like(media_id, media_code)
                            # end if
                        except ActionReservoirFullError:
                            logging.getLogger(pystr.LOGGER).error(pystr.ERROR_RESERVOIR_FULL)
                            exit()
                            pass
                        except ActionAlreadyExists:
                            logging.getLogger(pystr.LOGGER).error(pystr.ERROR_COMMENT_ALREADY_DB.format(
                                media_id))
                            pass
                        # end try
                    # end if
                except langdetect.lang_detect_exception.LangDetectException:
                    pass
                # end try

github deepset-ai / haystack / haystack / indexing / file_converters / base.py
def validate_language(self, text: str) -> bool:
        """
        Validate whether the language of the text is one of the valid languages.
        """
        if not self.valid_languages:
            return True

        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = None

        if lang in self.valid_languages:
            return True
        else:
            return False

github thundergolfer / Insults / insults / data / building / criteria.py
    # There's no point processing empty comments or comments too short
    # to possibly contain an insult.
    if comment_len < config["min_comment_length"]:
        return False

    # Long form comments are far more difficult to process with current
    # NLP techniques. Most work is on 1-2 sentences examples. A decent paragraph
    # is 6-10 sentences and around 600-1000 characters.
    # We want to avoid having essays as part of our dataset.
    valid_length = comment_len <= config["max_comment_length"]

    # Ignore comments that aren't in a language our model will handle. This
    # will very likely just be English ('en').
    try:
        valid_language = detect(comment) in config["allowed_languages"]
    except langdetect.lang_detect_exception.LangDetectException:
        logging.error("Comment: '{}' caused error in lang detect".format(comment.encode('utf-8')))
        return False

    return valid_length and valid_language

github ekonda / sketal / plugins / outsource / outsource_sayer.py
def get_lang(text):
        resu = None

        try:
            langs = langdetect.detect_langs(text)

            for language in langs:
                if language.lang == "ru":
                    language.prob += 0.2

                if resu is None or resu < language:
                    resu = language

        except langdetect.lang_detect_exception.LangDetectException:
            pass

        if resu is None:
            return "ru"

        return resu.lang

github wikimedia / editquality / editquality / feature_lists / translatewiki.py
def process_normalized_lang_map(text):
    try:
        lang_map = {l.lang: l.prob
                    for l in langdetect.detect_langs(text or "")}
    except langdetect.lang_detect_exception.LangDetectException:
        lang_map = {}

    normalized_lang_map = defaultdict(lambda: 0.0)
    for lang in ALL_LANGS:
        norm_lang = COMMON_LANGUAGE_MAP.get(lang, lang)
        normalized_lang_map[norm_lang] += lang_map.get(lang, 0.0)

    return normalized_lang_map
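
When per-language probabilities are needed, as in the translatewiki feature list above, langdetect.detect_langs returns a list of Language objects whose lang and prob attributes can be read directly. A minimal sketch of that usage (the lang_probabilities helper name is illustrative):

import langdetect
from langdetect.lang_detect_exception import LangDetectException

def lang_probabilities(text):
    """Map each candidate language code to its probability; empty dict on failure."""
    try:
        # detect_langs returns Language objects sorted by descending probability.
        return {result.lang: result.prob for result in langdetect.detect_langs(text)}
    except LangDetectException:
        return {}

print(lang_probabilities("Bonjour tout le monde"))  # e.g. {'fr': 0.999...}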

github ellie-icekler / StanfordCoreNLP-Chinese / StanfordCoreNLP.py
def POSTag(text, sent_split=True, tolist=True):
    words=[]
    if text!='':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if (lang == "zh-cn"):  # if the text is Chinese, segment it; otherwise leave it as is
            #########
            if sent_split:
                annotators = ['tokenize', 'ssplit', 'pos']
                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[(token.word,token.pos) for token in sent.token] for sent in ann.sentence]
                segmented_list = [' '.join(['#'.join(posted) for posted in wordlist]) for wordlist in words]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize','pos']
                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
                    ann = client.annotate(text)
                words = [(token.word, token.pos) for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])

github SMAPPNYU / pysmap / pysmap / twitterutil / smapp_collection.py
def language_in_tweet(tweet):
            detected_lang = None
            try:
                detected_lang = detect(tweet['text'])
            except lang_detect_exception.LangDetectException:
                pass
            return any([detected_lang in args])
        cp = copy.deepcopy(self)