How to use the spacy.blank function in spaCy

To help you get started, we've selected a few spacy.blank examples, based on popular ways it is used in public projects.

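spacy.blank(lang) creates a bare pipeline for the given language code: it has the language's tokenizer, vocabulary and defaults, but no trained components such as a tagger, parser or NER, so it loads quickly and is a common choice when only tokenization is needed. A minimal sketch (the sample sentence is just an illustration):

import spacy

nlp = spacy.blank("en")        # blank English pipeline: tokenizer + vocab only
print(nlp.pipe_names)          # [] -- no trained components yet

doc = nlp("A blank pipeline still tokenizes text.")
print([token.text for token in doc])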

github burglarhobbit / machine-reading-comprehension / S-NET / 3_snet_without_pr_test / evaluate-v1.1.py
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
import spacy

nlp = spacy.blank("en")

def word_tokenize(sent):
	doc = nlp(sent)
	return [token.text for token in doc]

def normalize_answer(s):
	"""Lower text and remove punctuation, articles and extra whitespace."""

	def remove_articles(text):
		return re.sub(r'\b(a|an|the)\b', ' ', text)

	def white_space_fix(text):
		return ' '.join(text.split())

	def remove_punc(text):
		exclude = set(string.punctuation)
		return ''.join(ch for ch in text if ch not in exclude)

	def lower(text):
		return text.lower()

	return white_space_fix(remove_articles(remove_punc(lower(s))))

github textpipe / textpipe / tests / test_pipeline.py
"""
Testing for textpipe pipeline.py
"""

import tempfile
import pytest
import spacy

from textpipe.pipeline import Pipeline

TEXT = 'Test sentence for testing'
ents_model_nl = spacy.blank('nl')
ents_model_en = spacy.blank('en')
model_path_nl = tempfile.mkdtemp()
model_path_en = tempfile.mkdtemp()
ents_model_nl.to_disk(model_path_nl)
ents_model_en.to_disk(model_path_en)

STEPS = [('Raw',), ('NWords',), ('Complexity',), ('CleanText',),
         ('Entities', {'model_mapping': {'nl': 'ents', 'en': 'other_identifier'}})]

PIPELINE_DEF_KWARGS = dict(models=[('ents', 'nl', model_path_nl),
                                   ('other_identifier', 'en', model_path_en)])

PIPE = Pipeline(STEPS, **PIPELINE_DEF_KWARGS)


def test_load_custom_model():
    """

github yash1994 / dframcy / dframcy / trainer.py
        self.output_path = output_path
        self.train_path = train_path
        self.dev_path = dev_path
        self.model = model
        self.n_iter = n_iter
        self.init_tok2vec = init_tok2vec
        self.exclusive_classes = exclusive_classes
        self.architecture = architecture
        self.train_split = train_split
        self.label_map = None

        if self.model is not None:
            self.nlp = spacy.load(self.model)
        else:
            self.nlp = spacy.blank("en")

        if "textcat" not in self.nlp.pipe_names:
            self.textcat = self.nlp.create_pipe(
                "textcat",
                config={
                    "exclusive_classes": self.exclusive_classes,
                    "architecture": self.architecture,
                },
            )
            self.nlp.add_pipe(self.textcat, last=True)
        else:
            self.textcat = self.nlp.get_pipe("textcat")
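In the dframcy snippet above, the blank pipeline only gains a textcat component; before training you would still add labels and initialize the optimizer. A hedged sketch using the spaCy v2-era API the snippet targets (the label names are illustrative, not from the dframcy project):

import spacy

nlp = spacy.blank("en")
textcat = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "simple_cnn"},
)
nlp.add_pipe(textcat, last=True)

# Illustrative labels; real projects would use their own label set.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

optimizer = nlp.begin_training()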

github hengruo / QANet-pytorch / preproc.py
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np
from codecs import open
import os
import config

'''
The content of this file is mostly copied from https://github.com/HKUST-KnowComp/R-Net/blob/master/prepro.py
'''

nlp = spacy.blank("en")


def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]


def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    return spans

github HKUST-KnowComp / R-Net / prepro.py
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np
import os.path

nlp = spacy.blank("en")


def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]


def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    return spans

github explosion / spacy-transformers / spacy_pytorch_transformers / _tokenizers.py
    def finish_deserializing(self):
        self.bpe_ranks = deserialize_bpe_ranks(self._bpe_ranks)
        self.nlp = spacy.blank("en")
        self.fix_text = ftfy.fix_text
        self.cache = {}
        self.decoder = {v: k for k, v in self.encoder.items()}

github burglarhobbit / machine-reading-comprehension / S-NET / 6_snet_refactored_from_rnet_with_pr / analyze_dataset.py
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import json
from collections import Counter
import numpy as np
from nltk.tokenize.moses import MosesDetokenizer
from rouge import Rouge as R
import string
import re

nlp = spacy.blank("en")


def word_tokenize(sent):
	doc = nlp(sent)
	return [token.text for token in doc]

def convert_idx(text, tokens):
	current = 0
	spans = []
	for token in tokens:
		current = text.find(token, current)
		if current < 0:
			print("Token {} cannot be found".format(token))
			raise Exception()
		spans.append((current, current + len(token)))
		current += len(token)
	return spans

github kata-ai / indosum / tokenize_jsonl.py
def main(args):
    objs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f):
            try:
                objs.append(json.loads(line.strip()))
            except Exception as e:
                message = f'line {linum+1}: {e}'
                raise RuntimeError(message)

    nlp = spacy.blank('id')
    with ProcessPoolExecutor(max_workers=args.max_workers) as exc:
        tok_objs = exc.map(partial(tokenize_obj, nlp), objs, chunksize=args.chunk_size)
        docs = [Document.from_mapping(obj) for obj in tok_objs]
        if args.discard_long_summary:
            docs = [doc for doc in docs if not has_long_summary(doc)]
        print('\n'.join(json.dumps(doc.to_dict(), sort_keys=True) for doc in docs))

github explosion / thinc / examples / attention_tagger.py
def FeatureExtracter(lang, attrs=[LOWER, SHAPE, PREFIX, SUFFIX], tokenized=True):
    nlp = spacy.blank(lang)
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    nlp.vocab.lex_attr_getters[SUFFIX] = lambda string: string[-3:]

    def forward(texts, drop=0.0):
        if tokenized:
            docs = [Doc(nlp.vocab, words) for words in texts]
        else:
            docs = [nlp(text) for text in texts]
        features = [doc.to_array(attrs) for doc in docs]

        def backward(d_features, sgd=None):
            return d_features

        return features, backward

    return layerize(forward)
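Across these projects the recurring pattern is to load a trained pipeline when one is named and fall back to spacy.blank for a tokenizer-only pipeline otherwise, as in the dframcy snippet. A minimal sketch of that fallback (the get_nlp helper and its model_name parameter are illustrative, not from any project above):

import spacy

def get_nlp(model_name=None, lang="en"):
    # Illustrative helper: prefer a trained pipeline, fall back to a blank one.
    if model_name:
        return spacy.load(model_name)
    return spacy.blank(lang)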