How to use the nltk.data module in nltk

To help you get started, we’ve selected a few nltk.data examples, based on popular ways it is used in public projects.

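All of the snippets below go through a handful of nltk.data entry points: nltk.data.path (the list of directories searched for resources), nltk.data.find() (resolve a resource path or raise LookupError), and nltk.data.load() (find, unpickle, and cache a resource), with nltk.download() as the usual fallback when a resource is missing. As a quick orientation, here is a minimal, self-contained sketch; the resource names are just common examples and the extra data directory is illustrative.

import nltk
import nltk.data

# nltk.data.path is the list of directories searched for resources;
# appending your own directory is a common pattern (the path below is illustrative).
nltk.data.path.append("/opt/nltk_data")

# find() resolves a resource to a concrete path and raises LookupError if it is absent,
# which is the usual cue to download it on first run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# load() locates, unpickles, and caches a resource in one call.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("NLTK ships a pretrained sentence splitter. It works out of the box."))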

github billzorn / mtgencode / lib / cardlib.py
import utils
import transforms
from manalib import Manacost, Manatext

# Some text prettification stuff that people may not have installed
try:
    from titlecase import titlecase
except ImportError:
    def titlecase(s):
        return s.title()

try:
    import textwrap
    import nltk.data
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # This could be made smarter - MSE will capitalize for us after :,
    # but we still need to capitalize the first English component of an activation
    # cost that starts with symbols, such as {2U}, *R*emove a +1/+1 counter from @: etc.
    def cap(s):
        return s[:1].capitalize() + s[1:]
    # This crazy thing is actually invoked as an unpass, so newlines are still
    # encoded.
    def sentencecase(s):
        s = s.replace(utils.x_marker, utils.reserved_marker)
        lines = s.split(utils.newline)
        clines = []
        for line in lines:
            if line:
                sentences = sent_tokenizer.tokenize(line)
                clines += [' '.join([cap(sent) for sent in sentences])]
        return utils.newline.join(clines).replace(utils.reserved_marker, utils.x_marker)

github SIlver-- / asoiafsearchbot-reddit / asoiaf-reddit.py
# Imports implied by this excerpt
import ConfigParser
import nltk.data
import praw

config = ConfigParser.ConfigParser()
config.read("asoiafsearchbot.cfg")

# Database info
host = config.get("SQL", "host")
user = config.get("SQL", "user")
passwd = config.get("SQL", "passwd")
db = config.get("SQL", "db")
table = config.get("SQL", "table")
column1 = config.get("SQL", "column1")
column2 = config.get("SQL", "column2")

MAX_ROWS = 30
BOOK_CONTAINER = []
sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')

# Reddit Info
user_agent = (
        "ASOIAFSearchBot -Help you find that comment"
        "- by /u/RemindMeBotWrangler")
reddit = praw.Reddit(user_agent = user_agent)
reddit_user = config.get("Reddit", "username")
reddit_pass = config.get("Reddit", "password")
reddit.login(reddit_user, reddit_pass)
# =============================================================================
# CLASSES
# =============================================================================

class Connect(object):
    """
    DB connection class

github Kyubyong / g2pK / g2pk / g2pk.py
# -*- coding: utf-8 -*-
'''
https://github.com/kyubyong/g2pK
'''

import os, re

import nltk
from jamo import h2j
from konlpy.tag import Mecab
from nltk.corpus import cmudict

# For further info about cmudict, consult http://www.speech.cs.cmu.edu/cgi-bin/cmudict.
try:
    nltk.data.find('corpora/cmudict.zip')
except LookupError:
    nltk.download('cmudict')

from g2pk.special import jyeo, ye, consonant_ui, josa_ui, vowel_ui, jamo, rieulgiyeok, rieulbieub, verb_nieun, balb, palatalize, modifying_rieul
from g2pk.regular import link1, link2, link3, link4
from g2pk.utils import annotate, compose, group, gloss, parse_table, get_rule_id2text
from g2pk.english import convert_eng
from g2pk.numerals import convert_num


class G2p(object):
    def __init__(self):
        self.mecab = Mecab() # for annotation
        self.table = parse_table()

        self.cmu = cmudict.dict() # for English

github nltk / nltk / nltk / wordnet / util.py
    def __init__(self, pos, filenameroot):
        """
        @type  pos: {string}
        @param pos: The part of speech of this index file e.g. 'noun'
        @type  filenameroot: {string}
        @param filenameroot: The base filename of the index file.
        """
        self.pos = pos
        path = nltk.data.find('corpora/wordnet/index.%s' % filenameroot)
        self.file = open(path, FILE_OPEN_MODE)

        # Table of (pathname, offset) -> (line, nextOffset)
        self.offsetLineCache = {}

        self.rewind()

github Yomguithereal / furuikeya / furuikeya.py
    def launch(self):

        # Verifying nltk resources
        nltk.data.path[0] = self.settings.nltk_data

        # Determining action
        if self.opts.saijiki:
            self.controller.generateSaijikiHaikus(self.opts.number)
        else:
            self.controller.generateMultipleHaikus(self.opts.kigo, self.opts.number)

github awslabs / aws-media-insights-engine / source / operators / translate / start_translate.py
        tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
    elif source_lang == 'ru':
        print("Using Russian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    elif source_lang == 'it':
        print("Using Italian dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
    elif source_lang == 'pt':
        print("Using Portuguese dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
    elif source_lang == 'es':
        print("Using Spanish dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    else:
        print("Using English dictionary to find sentence boundaries.")
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Split input text into a list of sentences
    sentences = tokenizer.tokenize(transcript)
    print("Input text length: " + str(len(transcript)))
    print("Number of sentences: " + str(len(sentences)))
    translated_text = ''
    transcript_chunk = ''
    for sentence in sentences:
        # Translate can handle 5000 unicode characters but we'll process no more than 4000
        # just to be on the safe side.
        if (len(sentence) + len(transcript_chunk) < 4000):
            transcript_chunk = transcript_chunk + ' ' + sentence
        else:
            try:
                print("Translation input text length: " + str(len(transcript_chunk)))
                translation_chunk = translate_client.translate_text(Text=transcript_chunk,SourceLanguageCode=source_lang,TargetLanguageCode=target_lang)

github prabhakar267 / vertikin / server / utils.py
import operator

import enchant
import nltk
import requests  # needed for the HTTP call in check_candidature() below
from gcm import *
from nltk.stem.wordnet import WordNetLemmatizer

from api_constants import WALMART_OPEN_PRODUCT_API_KEY, GCM_API_KEY
from constants import PROPER_NOUN_POS_TAGS
from settings import DEBUG

# append custom path for nltk corpus
nltk.data.path.append("nltk_data/")

lmtzr = WordNetLemmatizer()
enchant_dictionary = enchant.Dict("en_US")


def check_candidature(query_string):
    walmart_url = "http://api.walmartlabs.com/v1/search?apiKey={0}&query={1}".format(WALMART_OPEN_PRODUCT_API_KEY,
                                                                                     query_string)
    response = requests.get(walmart_url)
    if response.ok:
        if response.json()['totalResults'] > 0:
            return True

    return False

github ramtinms / tokenquery / tokenquery / nlp / pos_tagger.py
    def __init__(self):
        try:
            nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
        except LookupError:
            nltk.download('averaged_perceptron_tagger')

github quadflor / Quadflor / Code / lucid_ml / utils / nltk_normalization.py
    def install_nltk_corpora(*packages):
        nltk_packages = list(packages)
        try:
            installed = (set(os.listdir(nltk.data.find("corpora"))) |
                         (set(os.listdir(nltk.data.find("taggers"))))) | \
                        (set(os.listdir(nltk.data.find("tokenizers"))))
        except LookupError:
            installed = set()
        if not set(nltk_packages) <= set(installed):
            nltk.download(nltk_packages)

github nltk / nltk / nltk / sem / chat80.py
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings

        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
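
In this last example, nltk.data.find() is handed a path relative to the NLTK data directories, so the database file itself must already be installed. A brief usage sketch, assuming the chat80 city database is present under one of the nltk.data.path directories; the database path and table name below follow the chat80 demo and should be treated as assumptions:

from nltk.sem import chat80

# Assumes 'corpora/city_database/city.db' is installed as NLTK data.
rows = chat80.sql_query('corpora/city_database/city.db',
                        "SELECT City, Population FROM city_table")
for row in rows:
    print(row)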