How to use the spacy.en.English function in spacy

To help you get started, we've selected a few spacy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github explosion / spaCy / tests / test_indices.py View on Github external
def nlp():
    """Build a fresh ``English`` pipeline and return only its tokenizer."""
    return English().tokenizer
github uclnlp / jack / projects / autoread / wikipedia / preprocess.py View on Github external
def preprocess_files_recursively(dir, output, num_train_chunks=100, num_valid_chunks=1, newline_token=" <s>"):
    num_chunks = num_train_chunks + num_valid_chunks
    chunk_id = 0
    nlp = spacy.en.English(tagger=False, parser=False, entity=False, matcher=False, serializer=False, load_vectors=False)
    if not os.path.exists(output):
        os.mkdir(output)
    writers = [open(os.path.join(output, "%s_%d.txt" %
                                 ("train" if num_valid_chunks &lt;= i else "valid", i-num_valid_chunks if num_valid_chunks &lt;= i else i)), "w") for i in range(num_chunks)]
    context = None
    word_counts = defaultdict(lambda: 0)
    last_token = br_token.split(" ")[0]
    start_token = br_token.split(" ")[1] if " " in br_token else None
    for sub_dir, _, files in os.walk(dir):
        for fn in files:
            print("Processing %s" % fn)
            fn = os.path.join(sub_dir, fn)
            with open(fn, 'rb') as f:
                for l in f:
                    l = l.decode("utf-8")
                    if l.startswith(""): #new document</s>
github aoldoni / tetre / lib / parsers_backend.py View on Github external
def get_tree_from_spacy(argv):
    """Parses the raw text using SpaCy.

    Args:
        argv: The command line arguments.

    Returns:
        A list of tree.FullSentence objects, the sentences parsed from the raw text.
    """

    en_nlp = spacy.en.English()

    sentences = []

    file_id = 0

    lst = os.listdir(dirs['raw_input']['path'])
    lst.sort()

    for fn in lst:
        file_id += 1

        if should_skip_file(fn):
            continue

        name = dirs['raw_input']['path'] + fn
github opentargets / data_pipeline / modules / LiteratureNLP.py View on Github external
def __init__(self, fetcher, dry_run=False):
        """Store the fetcher and set up the logger, the parser and the stop list.

        Args:
            fetcher: Object kept on ``self.fetcher``.
            dry_run: Accepted for the caller's convenience; not read in the
                part of the initializer shown here.
        """
        self.fetcher = fetcher
        self.logger = logging.getLogger(__name__)
        self.parser = English()
        # A custom stoplist: the NLTK English stop words, a few contraction
        # fragments and stray tokens, plus ENGLISH_STOP_WORDS.
        STOPLIST = set(nltk_stopwords.words('english') + ["n't", "'s", "'m", "ca","p","t"] + list(ENGLISH_STOP_WORDS))
        # Words that must NOT be treated as stop words. This has to be a set
        # holding the single word 'non': set(('non')) is really set('non'),
        # i.e. {'n', 'o'}, because parentheses without a trailing comma do not
        # make a tuple.
        ALLOWED_STOPLIST = set(['non'])
        self.STOPLIST = STOPLIST - ALLOWED_STOPLIST
github totalgood / openchat / twote / infoextractors.py View on Github external
def __init__(self, tokenizer=None, phrases=None, max_len=6, max_phrases=1000000):
        """Create an ``English`` pipeline and a ``PhraseMatcher`` over its vocab.

        Falsy ``max_phrases`` / ``max_len`` fall back to 1000000 / 6.
        When ``phrases`` is a string it is handed to ``read_gazetteer`` together
        with the pipeline's tokenizer; any other value is stored unchanged.
        """
        self.max_phrases = max_phrases if max_phrases else 1000000
        self.max_len = max_len if max_len else 6
        self.nlp = English()
        nlp_tokenizer = self.nlp.tokenizer
        if not isinstance(phrases, basestring):
            self.phrases = phrases
        else:
            self.phrases = read_gazetteer(nlp_tokenizer, phrases, n=self.max_phrases)
        self.matcher = PhraseMatcher(nlp_tokenizer.vocab, self.phrases, max_length=self.max_len)
github explosion / spaCy / examples / inventory_count / main.py View on Github external
import inventoryCount as mainModule
import os
from spacy.en import English

if __name__ == '__main__':
    # Main module for this example - loads the English main NLP class,
    # and keeps it in RAM while waiting for the user to re-run it. Allows the
    # developer to re-edit their module under testing without having
    # to wait as long to load the English class.

    #  Set the NLP object here for the parameters you want to see,
    #  or just leave it blank and get all the opts
    print("Loading English module... this will take a while.")
    nlp = English()
    print("Done loading English module.")
    while True:
        try:
            reload(mainModule)
            mainModule.runTest(nlp)
        except Exception as e:
            # Report the failure, then fall through to the prompt below so a
            # broken module is not reloaded in a tight loop.
            print("Unexpected error: " + str(e))

        try:
            raw_input('================ To reload main module, press Enter ================')
        except EOFError:
            # stdin is closed: nobody can ask for another run, so stop
            # instead of looping forever.
            break