How to use the langdetect.DetectorFactory.seed attribute in langdetect

To help you get started, we’ve selected a few langdetect examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github h2oai / driverlessai-recipes / transformers / nlp / text_lang_detect_transformer.py View on Github external
def __init__(self, **kwargs):
    """Set up the transformer and seed langdetect for reproducible results."""
    super().__init__(**kwargs)
    # Lazy import: langdetect is only needed once this transformer is built.
    from langdetect import DetectorFactory
    # A fixed seed makes detect() deterministic between runs.
    DetectorFactory.seed = 0
github online-pol-ads / FacebookApiPolAdsCollector / fb_ad_creative_retriever.py View on Github external
def main(argv):
    """Entry point: read an INI config, seed langdetect, and construct the
    Facebook ad-creative retriever against a DB connection and GCS buckets.

    Args:
        argv: command-line arguments; argv[0] is the path to the config file.
    """
    config = configparser.ConfigParser()
    config.read(argv[0])

    # Force consistent langdetect results. https://pypi.org/project/langdetect/
    DetectorFactory.seed = 0

    access_token = config_utils.get_facebook_access_token(config)
    # Number of processed snapshots to accumulate before each DB commit.
    commit_to_db_every_n_processed = config.getint('LIMITS', 'BATCH_SIZE', fallback=DEFAULT_BATCH_SIZE)
    logging.info('Will commit to DB every %d snapshots processed.', commit_to_db_every_n_processed)
    slack_url = config.get('LOGGING', 'SLACK_URL')

    database_connection_params = config_utils.get_database_connection_params_from_config(config)

    with config_utils.get_database_connection(database_connection_params) as db_connection:
        ad_creative_images_bucket_client = make_gcs_bucket_client(AD_CREATIVE_IMAGES_BUCKET,
                                                                  GCS_CREDENTIALS_FILE)
        archive_screenshots_bucket_client = make_gcs_bucket_client(ARCHIVE_SCREENSHOTS_BUCKET,
                                                                  GCS_CREDENTIALS_FILE)
        # NOTE(review): snippet looks truncated here — image_retriever is built
        # but never visibly used in the lines shown.
        image_retriever = FacebookAdCreativeRetriever(
            db_connection, ad_creative_images_bucket_client, archive_screenshots_bucket_client,
            access_token, commit_to_db_every_n_processed, slack_url)
github SMAPPNYU / pysmap / pysmap / twitterutil / smapp_dataset.py View on Github external
def detect_tweet_language(self, *args):
    """Return a deep copy of this dataset filtered to tweets whose detected
    language is one of *args.

    Tweets whose text raises LangDetectException (too short/ambiguous) are
    excluded, since their detected language stays None.
    """
    # Fixed seed keeps langdetect deterministic between calls.
    DetectorFactory.seed = 0

    def language_in_tweet(tweet):
        detected_lang = None
        try:
            detected_lang = detect(tweet['text'])
        except lang_detect_exception.LangDetectException:
            # Undetectable text: leave detected_lang as None.
            pass
        # Direct membership test; any([x in args]) was a redundant wrapper
        # around a single boolean.
        return detected_lang in args

    cp = copy.deepcopy(self)
    cp.apply_filter_to_collections(language_in_tweet)
    return cp
github steemit / steem-python / steem / utils.py View on Github external
# Python 2/3 compatible urlparse import. Compare version tuples rather than
# the sys.version string: lexicographic string comparison misorders versions
# (e.g. "10.0" < "3.0"), while tuple comparison is always correct.
if sys.version_info >= (3, 0):
    from urllib.parse import urlparse
else:
    from urlparse import urlparse

logger = logging.getLogger(__name__)

# https://github.com/matiasb/python-unidiff/blob/master/unidiff/constants.py#L37
# @@ (source offset, length) (target offset, length) @@ (section header)
RE_HUNK_HEADER = re.compile(
    r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))?\ @@[ ]?(.*)$",
    flags=re.MULTILINE)

# ensure deterministic language detection
DetectorFactory.seed = 0
# Minimum number of characters before language detection is attempted.
MIN_TEXT_LENGTH_FOR_DETECTION = 20


def block_num_from_hash(block_hash):
    """
    Return the block number, i.e. the integer value of the first 4 bytes
    (8 hex digits) of the block ID.

    Args:
        block_hash (str):

    Returns:
        int:
    """
    leading_hex_digits = str(block_hash)[:8]
    return int(leading_hex_digits, base=16)


def block_num_from_previous(previous_block_hash):
github melqkiades / yelp / source / python / etl / reviews_preprocessor.py View on Github external
def tag_reviews_language(self):
    """Annotate every record in self.records with a detected-language field.

    If Constants.LANGUAGE_RECORDS_FILE already exists, the previously tagged
    records are loaded from it instead of running detection again.
    """
    print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    # Reuse cached results when available.
    if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
        print('Records have already been tagged with language field')
        self.records = \
            ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
        return

    # Fixed seed keeps langdetect deterministic across runs.
    DetectorFactory.seed = 0

    for review_record in self.records:
        try:
            detected_language = langdetect.detect(
                review_record[Constants.TEXT_FIELD])
        except LangDetectException:
            # Text too short or ambiguous to classify.
            detected_language = 'unknown'
        review_record[Constants.LANGUAGE_FIELD] = detected_language

    ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
github mehdisadeghi / mehdix.ir / scripts / rebuild_comments.py View on Github external
def detect_language(text):
    """Return the language code langdetect assigns to *text*.

    The detector factory is seeded so repeated builds classify the same
    text identically.
    """
    from langdetect import DetectorFactory, detect

    DetectorFactory.seed = 0  # stay consistent between builds
    return detect(text)
github SMAPPNYU / pysmap / pysmap / twitterutil / smapp_collection.py View on Github external
def detect_tweet_language(self, *args):
    """Return a deep copy of this collection that keeps only tweets whose
    detected language is one of *args.

    Tweets whose text raises LangDetectException are filtered out, since
    their detected language stays None.
    """
    # Fixed seed keeps langdetect deterministic between calls.
    DetectorFactory.seed = 0

    def language_in_tweet(tweet):
        detected_lang = None
        try:
            detected_lang = detect(tweet['text'])
        except lang_detect_exception.LangDetectException:
            # Undetectable text: leave detected_lang as None.
            pass
        # Direct membership test; any([x in args]) was a redundant wrapper
        # around a single boolean.
        return detected_lang in args

    cp = copy.deepcopy(self)
    cp.collection.set_custom_filter(language_in_tweet)
    return cp
github MartinThoma / lidtk / lidtk / classifiers / langdetect_mod.py View on Github external
* Python wrapper: https://pypi.python.org/pypi/langdetect
* Based on: https://github.com/shuyo/language-detection
"""

# Core Library modules
from typing import Any, Dict, List

# Third party modules
import click
import pkg_resources
from langdetect import DetectorFactory, detect, detect_langs

# First party modules
import lidtk.classifiers

DetectorFactory.seed = 0  # Make sure we get consistent results


class LangdetectClassifier(lidtk.classifiers.LIDClassifier):
    """LID with the Langdetect classifier."""

    def predict(self, text: str) -> str:
        """
        Predict the language of a text.

        Parameters
        ----------
        text : str

        Returns
        -------
        str
            The langdetect prediction mapped through ``self.map2wili``.
        """
        return self.map2wili(detect(text))

    def predict_proba(self, text: str) -> List[Dict[str, Any]]:
github woctezuma / hidden-gems / compute_regional_stats.py View on Github external
# Review polarity tag, i.e. either "recommended" or "not recommended"
        is_a_positive_review = review['voted_up']

        # Review text
        review_content = review['review']

        # Review language tag
        review_language_tag = review['language']

        # Review's automatically detected language
        if review_id in previously_detected_languages_dict[app_id].keys():
            detected_language = previously_detected_languages_dict[app_id][review_id]
        else:
            try:
                DetectorFactory.seed = 0
                detected_language = detect(review_content)
            except lang_detect_exception.LangDetectException:
                detected_language = 'unknown'
            previously_detected_languages_dict[app_id][review_id] = detected_language
            previously_detected_languages_dict['has_changed'] = True

        language_dict[review_id] = dict()
        language_dict[review_id]['tag'] = review_language_tag
        language_dict[review_id]['detected'] = detected_language
        language_dict[review_id]['voted_up'] = is_a_positive_review

    return language_dict, previously_detected_languages_dict
github SmartDataAnalytics / horus-ner / src / classifiers / util / language_detection.py View on Github external
from langdetect import detect
from langdetect import detect_langs
from langdetect import DetectorFactory

# https://github.com/Mimino666/langdetect
# to ensure deterministic behaviour
DetectorFactory.seed = 0

# Use the function-call form of print: with a single argument it behaves
# identically under Python 2, and the original `print x` statement form is a
# SyntaxError on Python 3.
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))
print(detect_langs("Otec matka syn."))