How to use the langdetect.detect_langs function in langdetect

To help you get started, we’ve selected a few langdetect examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github tiotdev / steem-curationbot / curationbot.py View on Github external
def is_eligible(text, n, lng):
    """Return True if *text* holds more than *n* words attributable to *lng*.

    The estimate is the probability the detector assigns to *lng* multiplied
    by the number of whitespace-separated words in *text*.

    Parameters:
        text: The text to inspect.
        n: Threshold the estimated word count must strictly exceed.
        lng: Language code compared against each detected candidate's
            ``lang`` attribute.

    Returns:
        True when *lng* is among the detected languages and
        ``probability * word_count > n``; False otherwise.
    """
    for language in detect_langs(text):
        if language.lang == lng:
            # Split on any whitespace run so that double spaces do not yield
            # empty "words" and newline/tab-separated words are not merged.
            word_count = len(text.split())
            return language.prob * word_count > n
    return False
github osma / annif / filter_corpus.py View on Github external
def is_in_language(targetlang, text):
    """Return True if *text* appears to be written in *targetlang*.

    The FINNISH, SWEDISH and ENGLISH patterns are tried first, in that order;
    the first one that matches decides the answer on its own. Only when none
    of them match is ``detect_langs`` consulted, in which case any detected
    candidate equal to *targetlang* counts as a match.

    Parameters:
        targetlang: Language code to test for (e.g. ``'fi'``, ``'sv'``,
            ``'en'``).
        text: The text to classify.

    Returns:
        True if the text is judged to be in *targetlang*, False otherwise
        (including when language detection fails).
    """
    # Quick and dirty regex shortcuts for detecting the most common languages
    if FINNISH.search(text) is not None:
        return targetlang == 'fi'
    if SWEDISH.search(text) is not None:
        return targetlang == 'sv'
    if ENGLISH.search(text) is not None:
        return targetlang == 'en'
    try:
        return any(lang.lang == targetlang for lang in detect_langs(text))
    except Exception:
        # Best effort: a detection failure means "not in the target language".
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # and SystemExit still propagate.
        return False
github ekonda / sketal / plugins / outsource / outsource_sayer.py View on Github external
def get_lang(text):
        """Return the code of the best-ranked detected language of *text*.

        Every candidate reported by ``langdetect.detect_langs`` is considered;
        a candidate whose code is "ru" gets 0.2 added to its probability
        before candidates are compared. If detection raises
        ``LangDetectException`` or yields no candidates, "ru" is returned.
        """
        best = None

        try:
            for candidate in langdetect.detect_langs(text):
                # Bias the ranking towards Russian.
                if candidate.lang == "ru":
                    candidate.prob += 0.2

                # Keep the candidate that compares greatest so far.
                if best is None or best < candidate:
                    best = candidate

        except langdetect.lang_detect_exception.LangDetectException:
            # Detection failed: keep whatever was selected so far (if any).
            pass

        return "ru" if best is None else best.lang
github chfoo / tellnext / tellnext / training.py View on Github external
def is_english(text):
    """Return True if *text* is plausibly English, False otherwise.

    Text rejected by ``only_roman_chars`` is ruled out immediately. Otherwise
    the text counts as English when 'en' appears among the candidates
    returned by ``langdetect.detect_langs``. A ``LangDetectException`` during
    detection is treated as "not English".

    Always returns a bool (never falls through to an implicit ``None``).
    """
    if not only_roman_chars(text):
        return False

    try:
        candidates = langdetect.detect_langs(text)
    except LangDetectException:
        return False

    return any(candidate.lang == 'en' for candidate in candidates)
github h3llrais3r / Auto-Subliminal / autosubliminal / util / filesystem.py View on Github external
subtitle = pysrt.open(path=srt_path, encoding='iso-8859-1')
    except Exception:
        try:
            subtitle = pysrt.open(path=srt_path, encoding='utf-8')
        except Exception:
            # If we can't read it, we can't detect, so return
            return None

    # Read first 5 subtitle lines to determine the language
    if len(subtitle) >= 5:
        text = ''
        for sub in subtitle[0:5]:
            text += sub.text

        # Detect the language with highest probability and return it if it's more than the required minimum probability
        detected_languages = langdetect.detect_langs(text)
        log.debug('Detected subtitle language(s): %s', detected_languages)
        if len(detected_languages) > 0:
            # Get first detected language (list is sorted according to probability, highest first)
            detected_language = detected_languages[0]
            language_probability = detected_language.prob
            if language_probability >= autosubliminal.DETECTEDLANGUAGEPROBABILITY:
                log.debug('Probability of detected subtitle language accepted: %s', detected_language)
                return Language.fromietf(detected_language.lang)
            else:
                log.debug('Probability of detected subtitle language too low: %s', detected_language)

    return None
github nettitude / scrounger / scrounger / modules / analysis / ios / unencrypted_keychain_data.py View on Github external
keychain_module.device = self.device
        keychain_module.output = None
        keychain_result = keychain_module.run()
        keychain_data = keychain_result["keychain_data"]

        data = []
        for key in keychain_data:
            if (key["entitlement_group"] and \
            keychain_id in key["entitlement_group"]) or (key["account"] and \
            keychain_id in key["account"]) or (key["service"] and \
            keychain_id in key["service"]):
                data += [str(key['keychain_data'])]

        report_data = []
        for item in data:
            lang = detect_langs(item)[0]
            if lang.prob > float("0.{}".format(self.min_percentage)):
                report_data += [item]

        if report_data:
            result.update({
                "report": True,
                "details": "The following data was found:\n* {}".format(
                    "\n* ".join(report_data))
            })

        return {
            "{}_result".format(self.name()): result
        }
github h3llrais3r / Auto-Subliminal / autosubliminal / diskscanner.py View on Github external
subtitle = pysrt.open(path=srt_path, encoding='iso-8859-1')
    except:
        try:
            subtitle = pysrt.open(path=srt_path, encoding='utf-8')
        except:
            # If we can't read it, we can't detect, so return
            return None

    # Read first 5 subtitle lines to determine the language
    if len(subtitle) >= 5:
        text = ""
        for sub in subtitle[0:5]:
            text += sub.text

        # Detect the language with highest probability and return it if it's more than the required minimum probability
        detected_languages = langdetect.detect_langs(text)
        log.debug("Detected subtitle language(s): %s", detected_languages)
        if len(detected_languages) > 0:
            # Get first detected language (list is sorted according to probability, highest first)
            detected_language = detected_languages[0]
            language_probability = detected_language.prob
            if language_probability >= autosubliminal.DETECTEDLANGUAGEPROBABILITY:
                log.debug("Probability of detected subtitle language accepted: %s" % detected_language)
                return Language.fromietf(detected_language.lang)
            else:
                log.debug("Probability of detected subtitle language too low: %s" % detected_language)
    return None
github MartinThoma / lidtk / lidtk / classifiers / langdetect_mod.py View on Github external
def predict_proba(self, text: str) -> List[Dict[str, Any]]:
        """
        Predict the language probabilities of a text.

        Parameters
        ----------
        text : str
            The text passed to ``detect_langs``.

        Returns
        -------
        List[Dict[str, Any]]
            One ``{"lang": ..., "prob": ...}`` dict per detected candidate,
            in the order returned by ``detect_langs``. ``"lang"`` is the
            candidate's language code passed through ``self.map2wili``.
        """
        return [
            {"lang": self.map2wili(candidate.lang), "prob": candidate.prob}
            for candidate in detect_langs(text)
        ]