How to use the textblob.nltk.corpus.util.LazyCorpusLoader class in textblob

To help you get started, we’ve selected a few textblob examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sloria / TextBlob / textcorpus / __init__.py View on Github external
"""

import re

from nltk.tokenize import RegexpTokenizer
from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
                     simplify_alpino_tag, simplify_indian_tag,\
                     simplify_tag
from .util import LazyCorpusLoader
from .reader import *

# Corpus handles.  Every assignment wraps one corpus in a LazyCorpusLoader:
# the corpus name comes first, then the reader class, then the remaining
# arguments (presumably forwarded to that reader -- confirm in .util).
abc = LazyCorpusLoader(
    'abc',
    PlaintextCorpusReader,
    r'(?!\.).*\.txt',
    encoding=[('science', 'latin_1'), ('rural', 'utf8')])
alpino = LazyCorpusLoader(
    'alpino',
    AlpinoCorpusReader,
    tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown',
    CategorizedTaggedCorpusReader,
    r'c[a-z]\d\d',
    cat_file='cats.txt',
    tag_mapping_function=simplify_brown_tag,
    encoding="ascii")
# cess_cat and cess_esp share the same reader, pattern and encoding.
cess_cat = LazyCorpusLoader(
    'cess_cat',
    BracketParseCorpusReader,
    r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cess_esp = LazyCorpusLoader(
    'cess_esp',
    BracketParseCorpusReader,
    r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cmudict = LazyCorpusLoader('cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans',
    AlignedCorpusReader,
    r'(?!\.).*\.txt')
conll2000 = LazyCorpusLoader(
github sloria / TextBlob / textcorpus / __init__.py View on Github external
# Corpus handles.  Every assignment wraps one corpus in a LazyCorpusLoader:
# the corpus name comes first, then the reader class, then the remaining
# arguments (presumably forwarded to that reader -- confirm in .util).
ieer = LazyCorpusLoader('ieer', IEERCorpusReader, r'(?!README|\.).*')
inaugural = LazyCorpusLoader(
    'inaugural',
    PlaintextCorpusReader,
    r'(?!\.).*\.txt',
    encoding='latin1')
# [XX] This should probably just use TaggedCorpusReader:
indian = LazyCorpusLoader(
    'indian',
    IndianCorpusReader,
    r'(?!\.).*\.pos',
    tag_mapping_function=simplify_indian_tag,
    encoding='utf8')
ipipan = LazyCorpusLoader(
    'ipipan',
    IPIPANCorpusReader,
    r'(?!\.).*morph\.xml')
jeita = LazyCorpusLoader(
    'jeita',
    ChasenCorpusReader,
    r'.*\.chasen',
    encoding='utf-8')
knbc = LazyCorpusLoader(
    'knbc/corpus1',
    KNBCorpusReader,
    r'.*/KN.*',
    encoding='euc-jp')
lin_thesaurus = LazyCorpusLoader(
    'lin_thesaurus',
    LinThesaurusCorpusReader,
    r'.*\.lsp')
mac_morpho = LazyCorpusLoader(
    'mac_morpho',
    MacMorphoCorpusReader,
    r'(?!\.).*\.txt',
    tag_mapping_function=simplify_tag,
    encoding='latin-1')
# The next two corpora pass a cat_pattern regex with one capture group.
machado = LazyCorpusLoader(
    'machado',
    PortugueseCategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'([a-z]*)/.*',
    encoding='latin-1')
movie_reviews = LazyCorpusLoader(
    'movie_reviews',
    CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(neg|pos)/.*',
    encoding='ascii')
names = LazyCorpusLoader(
    'names',
    WordListCorpusReader,
    r'(?!\.).*\.txt',
    encoding='ascii')
nps_chat = LazyCorpusLoader(
    'nps_chat',
    NPSChatCorpusReader,
    r'(?!README|\.).*\.xml',
    tag_mapping_function=simplify_wsj_tag)
github sloria / TextBlob / textcorpus / __init__.py View on Github external
tag_mapping_function=simplify_tag, encoding='ISO-8859-2')
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
cess_esp = LazyCorpusLoader(
    'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag, encoding='ISO-8859-2')
cmudict = LazyCorpusLoader(
    'cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader,
    ['train.txt', 'test.txt'], ('NP', 'VP', 'PP'),
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
conll2002 = LazyCorpusLoader(
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
# encoding may be a list of (pattern, encoding) pairs instead of one name.
conll2007 = LazyCorpusLoader(
    'conll2007', DependencyCorpusReader, r'.*\.(test|train).*', encoding=[
        ('eus', 'ISO-8859-2'),
        ('esp', 'utf8')])
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp',
    encoding='ascii')
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag, encoding='ISO-8859-15')
framenet = LazyCorpusLoader(
    'framenet_v15', FramenetCorpusReader, [
        'frRelation.xml', 'frameIndex.xml', 'fulltextIndex.xml',
        'luIndex.xml', 'semTypes.xml'])
gazetteers = LazyCorpusLoader(
    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
    encoding='ISO-8859-2')
genesis = LazyCorpusLoader(
    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
github sloria / TextBlob / textcorpus / __init__.py View on Github external
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
cmudict = LazyCorpusLoader(
    'cmudict', CMUDictCorpusReader, ['cmudict'])
comtrans = LazyCorpusLoader(
    'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt')
conll2000 = LazyCorpusLoader(
    'conll2000', ConllChunkCorpusReader,
    ['train.txt', 'test.txt'], ('NP', 'VP', 'PP'),
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
conll2002 = LazyCorpusLoader(
    'conll2002', ConllChunkCorpusReader, r'.*\.(test|train).*',
    ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
# encoding may be a list of (pattern, encoding) pairs instead of one name.
conll2007 = LazyCorpusLoader(
    'conll2007', DependencyCorpusReader, r'.*\.(test|train).*', encoding=[
        ('eus', 'ISO-8859-2'),
        ('esp', 'utf8')])
dependency_treebank = LazyCorpusLoader(
    'dependency_treebank', DependencyCorpusReader, r'.*\.dp',
    encoding='ascii')
floresta = LazyCorpusLoader(
    'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#',
    tag_mapping_function=simplify_tag, encoding='ISO-8859-15')
framenet = LazyCorpusLoader(
    'framenet_v15', FramenetCorpusReader, [
        'frRelation.xml', 'frameIndex.xml', 'fulltextIndex.xml',
        'luIndex.xml', 'semTypes.xml'])
gazetteers = LazyCorpusLoader(
    'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt',
    encoding='ISO-8859-2')
genesis = LazyCorpusLoader(
    'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[
        ('finnish|french|german', 'latin_1'),
        ('swedish', 'cp865'),
        ('.*', 'utf_8')])
gutenberg = LazyCorpusLoader(
github sloria / TextBlob / textcorpus / __init__.py View on Github external
tag_mapping_function=simplify_wsj_tag, encoding='ascii')
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader(
    'wordnet', WordNetCorpusReader)
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# The lambda removes a leading 'wsj/NN/' (two-digit) prefix from a file name.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
github sloria / TextBlob / textcorpus / __init__.py View on Github external
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding='ISO-8859-2')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
swadesh = LazyCorpusLoader(
    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
switchboard = LazyCorpusLoader(
    'switchboard', SwitchboardCorpusReader,
    tag_mapping_function=simplify_wsj_tag)
# timit and timit_tagged both use the 'timit' corpus name with different
# reader classes.
timit = LazyCorpusLoader(
    'timit', TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
github sloria / TextBlob / textcorpus / __init__.py View on Github external
para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader(
    'wordnet', WordNetCorpusReader)
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# The lambda removes a leading 'wsj/NN/' (two-digit) prefix from a file name.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
propbank_ptb = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', 'frames/.*\.xml', 'verbs.txt',
    lambda filename: filename.upper(),
github sloria / TextBlob / textcorpus / __init__.py View on Github external
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
shakespeare = LazyCorpusLoader(
    'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml')
sinica_treebank = LazyCorpusLoader(
    'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'],
    tag_mapping_function=simplify_tag, encoding='utf-8')
state_union = LazyCorpusLoader(
    'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt',
    encoding='ISO-8859-2')
stopwords = LazyCorpusLoader(
    'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8')
swadesh = LazyCorpusLoader(
    'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8')
switchboard = LazyCorpusLoader(
    'switchboard', SwitchboardCorpusReader,
    tag_mapping_function=simplify_wsj_tag)
# timit and timit_tagged both use the 'timit' corpus name with different
# reader classes.
timit = LazyCorpusLoader(
    'timit', TimitCorpusReader)
timit_tagged = LazyCorpusLoader(
    'timit', TimitTaggedCorpusReader, r'.+\.tags',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
toolbox = LazyCorpusLoader(
    'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)')
treebank = LazyCorpusLoader(
    'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg',
    tag_mapping_function=simplify_wsj_tag, encoding='ascii')
treebank_chunk = LazyCorpusLoader(
    'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader, encoding='ascii')
treebank_raw = LazyCorpusLoader(
    'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2')
udhr = LazyCorpusLoader(
github sloria / TextBlob / textcorpus / __init__.py View on Github external
>>> from nltk.corpus import brown
    >>> print(", ".join(brown.words()))
    The, Fulton, County, Grand, Jury, said, ...

"""

import re

from nltk.tokenize import RegexpTokenizer
from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\
                     simplify_alpino_tag, simplify_indian_tag,\
                     simplify_tag
from .util import LazyCorpusLoader
from .reader import *

# Corpus handles.  Every assignment wraps one corpus in a LazyCorpusLoader:
# the corpus name comes first, then the reader class, then the remaining
# arguments (presumably forwarded to that reader -- confirm in .util).
abc = LazyCorpusLoader(
    'abc',
    PlaintextCorpusReader,
    r'(?!\.).*\.txt',
    encoding=[('science', 'latin_1'), ('rural', 'utf8')])
alpino = LazyCorpusLoader(
    'alpino',
    AlpinoCorpusReader,
    tag_mapping_function=simplify_alpino_tag)
brown = LazyCorpusLoader(
    'brown',
    CategorizedTaggedCorpusReader,
    r'c[a-z]\d\d',
    cat_file='cats.txt',
    tag_mapping_function=simplify_brown_tag,
    encoding="ascii")
# cess_cat and cess_esp share the same reader, pattern and encoding.
cess_cat = LazyCorpusLoader(
    'cess_cat',
    BracketParseCorpusReader,
    r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cess_esp = LazyCorpusLoader(
    'cess_esp',
    BracketParseCorpusReader,
    r'(?!\.).*\.tbf',
    tag_mapping_function=simplify_tag,
    encoding='ISO-8859-2')
cmudict = LazyCorpusLoader(
github sloria / TextBlob / textcorpus / __init__.py View on Github external
# Corpus handles: each LazyCorpusLoader call names the corpus, the reader
# class, and the arguments that are presumably forwarded to that reader.
# Regex patterns are written as raw strings so that '\.' is handed to the
# regex engine as-is instead of being parsed as an (invalid) string escape,
# which newer Python versions warn about.
udhr = LazyCorpusLoader(
    'udhr', UdhrCorpusReader)
verbnet = LazyCorpusLoader(
    'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml')
webtext = LazyCorpusLoader(
    'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt',
    encoding='ISO-8859-2')
wordnet = LazyCorpusLoader(
    'wordnet', WordNetCorpusReader)
wordnet_ic = LazyCorpusLoader(
    'wordnet_ic', WordNetICCorpusReader, r'.*\.dat')
words = LazyCorpusLoader(
    'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii')
ycoe = LazyCorpusLoader(
    'ycoe', YCOECorpusReader)
# defined after treebank
# The lambda removes a leading 'wsj/NN/' (two-digit) prefix from a file name.
propbank = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
nombank = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,
    'nombank.1.0', r'frames/.*\.xml', 'nombank.1.0.words',
    lambda filename: re.sub(r'^wsj/\d\d/', '', filename),
    treebank) # Must be defined *after* treebank corpus.
# Same files as propbank, but the lambda upper-cases the file name and the
# final argument is the ptb corpus instead of treebank.
propbank_ptb = LazyCorpusLoader(
    'propbank', PropbankCorpusReader,
    'prop.txt', r'frames/.*\.xml', 'verbs.txt',
    lambda filename: filename.upper(),
    ptb) # Must be defined *after* ptb corpus.
nombank_ptb = LazyCorpusLoader(
    'nombank.1.0', NombankCorpusReader,