How to use the langdetect.detect function in langdetect

To help you get started, we’ve selected a few langdetect examples, based on popular ways it is used in public projects.
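
Before the project snippets, here is a minimal, self-contained sketch of the typical call pattern (the sample text and variable names are illustrative, not taken from the projects below): detect takes a string and returns a language code such as 'en' or 'fr', and it raises LangDetectException for input it cannot classify, so calls are usually wrapped in a try/except.

import langdetect

text = "Ceci est un exemple de texte en français."
try:
    lang = langdetect.detect(text)  # e.g. 'fr'
except langdetect.lang_detect_exception.LangDetectException:
    lang = None  # raised for empty or non-linguistic input
print(lang)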

github miku / siskin / siskin / sources / lissa.py
            dois = [v.replace("http://dx.doi.org/", "") for v in source["identifiers"] if "doi.org" in v]
            if len(dois) == 0:
                self.logger.warn("document without DOI")
            elif len(dois) == 1:
                doc.update({"doi": dois[0]})
            else:
                # As of 08/2019, any of the multiple DOIs seems to resolve; keep the first one.
                self.logger.warn("document with multiple dois: %s", dois)
                doc.update({"doi": dois[0]})

            if doc.get("language"):
                doc.update({"language": doc.get("language")})
            else:
                if len(doc["abstract"]) > 20:
                    result = langdetect.detect(doc["abstract"])
                    doc["languages"] = [languages.get(alpha2=result).bibliographic]
                    self.logger.debug("detected %s in abstract (%s)", doc["languages"], doc["abstract"][:40])

            # Gather subjects.
            subjects = source.get("subjects", []) + source.get("subject_synonyms", []) + source.get("tags", [])
            unique_subjects = set(itertools.chain(*[v.split("|") for v in subjects]))
            doc.update({"x.subjects": list(unique_subjects)})

            # Try date_published, then date_created, then fail.
            for key in ("date_published", "date_created"):
                if key not in source or not source[key]:
                    continue
                doc.update({
                    "x.date": source[key][:19] + "Z",
                    "rft.date": source[key][:10],
                })
github kearch / kearch / packages / specialist_crawler_child / webpage.py
        for script in soup(["script", "style"]):
            script.extract()    # rip javascript out

        try:
            self.set_links(soup)
        except ValueError:
            raise WebpageError('Cannot set links')

        try:
            self.title = str(soup.title.string)
            self.text = str(soup.body.text)
        except AttributeError:
            raise WebpageError('Cannot get title or text')

        try:
            self.language = langdetect.detect(self.text)
            if not self.language == language:
                raise WebpageError("Language doesn't match.")
        except langdetect.lang_detect_exception.LangDetectException:
            raise WebpageError('Cannot detect language.')

        self.title_words = self.text_to_words(self.title, language=self.language)
        # convert all whitespace runs to a single space
        self.text = ' '.join(
            filter(lambda x: not x == '', re.split('\s', self.text)))

        # This version does not handle multibyte characters
        self.summary = self.text[:500]
        self.words = self.text_to_words(self.text, language=self.language)
github tahaHichri / ML-fomo / main.py
	def sanitize_text(self, text):
		try:
			if detect(text) == 'en':
				allow_in_dict = True
			else:
				allow_in_dict = False
		except:
			allow_in_dict = False

		# remove non-words
		sanitized_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split()) 

		self.stop_words = set(stopwords.words('english'))
		self.stop_words.update(STOPWORDS)
		self.stop_words.update(self.ignored_words)
		
		word_tokens = word_tokenize(sanitized_text) 
  
		#filtered_sentence = [w for w in word_tokens if not w in stop_words and len(w) > 1] 
github thundergolfer / Insults / insults / data / building / criteria.py
    # There's no point processing empty comments or comments too short
    # to possibly contain an insult.
    if comment_len < config["min_comment_length"]:
        return False

    # Long form comments are far more difficult to process with current
    # NLP techniques. Most work is on 1-2 sentence examples. A decent paragraph
    # is 6-10 sentences and around 600-1000 characters.
    # We want to avoid having essays as part of our dataset.
    valid_length = comment_len <= config["max_comment_length"]

    # Ignore comments that aren't in a language our model will handle. This
    # will very likely just be English ('en').
    try:
        valid_language = detect(comment) in config["allowed_languages"]
    except langdetect.lang_detect_exception.LangDetectException:
        logging.error("Comment: '{}' caused error in lang detect".format(comment.encode('utf-8')))
        return False

    return valid_length and valid_language
github Cyberjusticelab / JusticeAI / src / ml_service / feature_extraction / pre_processing / filter_precedent / precendent_directory_cleaner.py
                for reg in regexes:
                    if reg.search(line):
                        os.remove(directory_path + filename)
                        file_removed = True
                        files_matching_regexes.append(filename)
                        break
                if file_removed:
                    break
            if file_removed:
                precedent_file.close()
                continue

            # remove english precedents
            precedent_file.seek(0)
            file_content = precedent_file.read()
            if detect(file_content) == 'en':
                os.remove(directory_path + filename)
                files_in_english.append(filename)
            precedent_file.close()
    print('')
    Log.write('Done filtering precedents')
    Log.write('Removed {} files in English'.format(str(len(files_in_english))))
    Log.write('Removed {} files without value'.format(str(len(files_matching_regexes))))
    return files_in_english, files_matching_regexes
github Sotera / pst-extraction / spark / translation.py
def language(text, override_language=None):
    if override_language:
        return override_language

    try:
        return detect(text)
    except LangDetectException:
        return 'en'
github mknz / mirusan / search / search.py
    if args.add_summary is not None:
        print('add summary: ' + str(args.add_summary))
        title = args.add_summary[0]
        summary_file = args.add_summary[1]
        add_summary(title, summary_file)
        return

    title = args.delete_by_title
    if title is not None:
        print('delete by title: ' + title)
        delete_by_title(title, args.keep_file)
        return

    if args.lang_detect:
        print(langdetect.detect(args.lang_detect))
        return

    parser.print_help()
    return
github kootenpv / sky / sky / capsule.py
    def get_language(self, tree):
        if self.response and 'content-language' in self.response.headers:
            self.lang = self.response.headers['content-language']
            
        if self.lang is None and 'lang' in tree.attrib:
            self.lang = tree.attrib['lang']

        if self.response and self.lang is None:
            self.lang = self.body_blob.detect_language()    
            
        if self.lang is None:
            self.lang = langdetect.detect(self.body)
github riccardorestagno / BuzzFeed-Reddit-Bot / app / article_archive_parsers / helper_methods / list_parser_helper_methods.py
def article_meets_posting_requirements(subreddit, website, article_title):
    """
    Validates that the article meets all requirements to post the list to Reddit.

    The validations below check if:
        (1) The article contains a number
        (2) The post hasn't been made already
        (3) The article title doesn't contain certain pre-defined keywords
        (4) The article title is in English (BuzzFeed only)

    Returns True if all validations are met. Returns False otherwise.
    """

    if website == ArticleType.BuzzFeed:
        try:
            if not detect(article_title) == 'en':
                return False
        except lang_detect_exception.LangDetectException:
            return False

    no_of_elements = get_article_list_count(article_title)
    if no_of_elements == 0:
        return False

    article_title_lowercase = article_title.lower()
    if any(words in article_title_lowercase for words in BREAK_WORDS):
        return False

    if post_previously_made(article_title_lowercase, no_of_elements, subreddit):
        return False

    return True
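
A detail that applies to all of the examples above: langdetect's detection algorithm is non-deterministic, so short or ambiguous text can produce different results across runs. The library's documented remedy is to fix the seed on DetectorFactory before the first call to detect, as in this minimal sketch (the sample sentence is illustrative):

from langdetect import DetectorFactory, detect

DetectorFactory.seed = 0  # enforce consistent results across runs

print(detect("This is an English sentence."))  # 'en'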