How to use the wordfreq.config.RAW_DATA_DIR setting in wordfreq

To help you get started, we’ve selected a few wordfreq examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / wordfreq / tests / test_build.py View on Github external
def test_build():
    """
    Ensure that the build process builds the same DB that gets distributed.
    """
    if not os.path.exists(config.RAW_DATA_DIR):
        download_and_extract_raw_data()

    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
        conn = sqlite3.connect(db_file)

        # Compare the information we got to the information in the default DB.
        new_info = flatten_list_of_dicts(wordlist_info(conn))
        old_info = flatten_list_of_dicts(wordlist_info(None))
        eq_(len(new_info), len(old_info))
        for i in range(len(new_info)):
            # Don't test Greek and emoji on Python 2; we can't make them
            # consistent with Python 3.
            if PYTHON2 and ((u'lang', u'el') in new_info[i]):
github LuminosoInsight / wordfreq / tests / test_build.py View on Github external
def test_build():
    """
    Ensure that the build process builds the same DB that gets distributed.

    The raw data is downloaded first if `config.RAW_DATA_DIR` does not exist.
    A fresh database is then built into a temporary directory, and its
    wordlist information is compared entry by entry with that of the default
    database.

    The temporary database connection is closed before the temporary
    directory is removed, so that no open handle to the database file is
    left behind when the directory is deleted.
    """
    if not os.path.exists(config.RAW_DATA_DIR):
        download_and_extract_raw_data()

    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
        conn = sqlite3.connect(db_file)
        try:
            # Compare the information we got to the information in the
            # default DB.
            new_info = flatten_list_of_dicts(wordlist_info(conn))
            old_info = flatten_list_of_dicts(wordlist_info(None))
            eq_(len(new_info), len(old_info))

            # The lengths were just asserted equal, so pairing the entries
            # with zip() visits exactly the same pairs as indexing would.
            for new_entry, old_entry in zip(new_info, old_info):
                # Don't test Greek and emoji on Python 2; we can't make them
                # consistent with Python 3.
                if PYTHON2 and ((u'lang', u'el') in new_entry):
                    continue
                if PYTHON2 and ((u'wordlist', u'twitter') in new_entry):
                    continue
                eq_(new_entry, old_entry)
        finally:
            # Release the database file before the directory is deleted.
            conn.close()
    finally:
        shutil.rmtree(tempdir)
github LuminosoInsight / wordfreq / wordfreq / build.py View on Github external
up as actual differences in the set of words. For the sake of consistency,
    we say that the data is only valid when built on Python 3.

    Python 2 can still *use* wordfreq, by downloading the database that was
    built on Python 3.

    If you insist on building the Python 2 version, pass `do_it_anyway=True`.
    """
    if sys.version_info.major == 2 and not do_it_anyway:
        raise UnicodeError(
            "Python 2.x has insufficient Unicode support, and will build "
            "the wrong database. Pass `do_it_anyway=True` to do it anyway."
        )

    if source_dir is None:
        source_dir = config.RAW_DATA_DIR

    if filename is None:
        filename = config.DB_FILENAME

    def wordlist_path(*pieces):
        return os.path.join(source_dir, *pieces)

    logger.info("Creating database")
    conn = create_db(filename)

    for lang in LEEDS_LANGUAGES:
        filename = wordlist_path('leeds', 'internet-%s-forms.num' % lang)
        read_leeds_wordlist_into_db(conn, filename, 'leeds-internet', lang)

    read_wordlist_into_db(conn, wordlist_path('google', 'google-books-english.csv'), 'google-books', 'en')
    read_wordlist_into_db(conn, wordlist_path('luminoso', 'twitter-52M.csv'), 'twitter', 'xx')
github LuminosoInsight / wordfreq / wordfreq / transfer.py View on Github external
3, and more notably, that it has the proper SSH keys to upload to that
    server.
    """
    from tempfile import TemporaryDirectory

    if upload_path is None:
        upload_path = config.UPLOAD_PATH
    
    with TemporaryDirectory('.wordfreq') as build_dir:
        version_dir = os.path.join(build_dir, config.MINOR_VERSION)
        os.makedirs(version_dir)

        source_filename = os.path.join(version_dir, 'wordfreq-data.tar.gz')
        logger.info("Creating %s" % source_filename)
        with tarfile.open(source_filename, 'w:gz') as tarf:
            tarf.add(config.RAW_DATA_DIR)

        logger.info("Copying database file %s" % config.DB_FILENAME)
        subprocess.call([
            '/bin/cp',
            config.DB_FILENAME,
            version_dir
        ])

        logger.info("Uploading to %s" % upload_path)
        subprocess.call([
            '/usr/bin/rsync',
            '-avz',
            version_dir,
            upload_path
        ])