def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error('failed to import janome. please install it by "pip install janome".')
        exit(1)
    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = tokenizer.tokenize(sentence)
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          # assumed completion: the remaining Janome token
                          # attributes (infl_type, reading, base_form)
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        # assumed: collect tokens per sentence and return both lists,
        # mirroring the other annotators in this module
        raw_sentences.append([token.word for token in tokens])
        res.append(tokens)
    return res, raw_sentences
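# Example usage (a hedged sketch): annotate_using_janome requires tokenize=True
# and joins each input sentence with ''.join(...) before running Janome, so raw
# Japanese strings work as input. The sentence below is illustrative only.
#
#     parsed, raw = annotate_using_janome([['猫が寝ている。']], tokenize=True)
#     # `parsed` holds one list of Token objects per sentence, carrying the
#     # Janome surface form and POS fields set above.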
def annotate_using_spacy(sentences, tokenize=False, n_threads=2, batch_size=10000):
    try:
        import spacy
        from spacy.tokens import Doc
    except ImportError:
        logger.error('failed to import spacy. please install it by "pip install spacy".')
        exit(1)
    nlp = spacy.load('en', disable=['parser'])
    logger.info('use spacy to annotate POS and NER infos.')
    if tokenize:
        docs = [nlp.tokenizer(' '.join(sentence)) for sentence in sentences]
        raw_sentences = [[str(token) for token in doc] for doc in docs]
    else:
        docs = [Doc(nlp.vocab, sentence) for sentence in sentences]
    for name, proc in nlp.pipeline:
        docs = proc.pipe(docs,
                         n_threads=n_threads,
                         batch_size=batch_size)
    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            # assumed completion: 'O' tokens keep the bare IOB tag, all other
            # tokens get an 'IOB-TYPE' entity label built from spacy attributes
            if token.ent_iob_ == 'O':
                ner = token.ent_iob_
            else:
                ner = token.ent_iob_ + '-' + token.ent_type_
            tokens.append(Token(word=str(token),
                                pos=token.tag_,
                                entity=ner,
                                lemma=token.lemma_.lower(),
                                chunk='XX'))
        res.append(tokens)
    return (res, raw_sentences) if tokenize else res  # assumed return shape
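# Example usage (a hedged sketch): with tokenize=True the function re-tokenizes
# the space-joined sentence with spacy's tokenizer and also returns the raw
# token strings; it relies on the legacy spacy 2.x 'en' model loaded above.
#
#     res, raw = annotate_using_spacy([['The', 'dog', 'runs', '.']], tokenize=True)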
def annotate_using_jigg(sentences, tokenize=False, pipeline='ssplit,kuromoji'):
    assert tokenize, 'no support for using jigg with pre-tokenized inputs'
    logger.info('use Jigg to tokenize and annotate POS infos.')
    jigg_dir = os.environ.get('JIGG', None)
    if not jigg_dir:
        logger.error('did not find Jigg at JIGG environmental variable. exiting..')
        exit(1)
    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)
    outfile = tempfile.mktemp()
    command = jigg_cmd.format(jigg_dir,
                              pipeline,
                              tmpfile,
                              outfile)
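# Jigg setup (a hedged sketch): the function requires the JIGG environment
# variable to point at a Jigg installation; `jigg_cmd` (defined elsewhere in
# this module) is formatted with (jigg_dir, pipeline, input file, output file).
# The path below is illustrative only.
#
#     os.environ['JIGG'] = '/path/to/jigg'
#     annotate_using_jigg(sentences, tokenize=True, pipeline='ssplit,kuromoji')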
# assumed function signature for the C&C annotator, following the naming
# pattern of the annotators above
def annotate_using_candc(sentences, tokenize=False):
    candc_dir = os.environ.get('CANDC', None)
    candc_model_pos = None
    candc_model_ner = None
    fail = False
    if candc_dir:
        candc_dir = Path(candc_dir)
        candc_model_pos = Path(os.environ.get('CANDC_MODEL_POS', str(candc_dir / 'models' / 'pos')))
        candc_model_ner = Path(os.environ.get('CANDC_MODEL_NER', str(candc_dir / 'models' / 'ner')))
        if (candc_dir / 'bin' / 'pos').exists() and \
                (candc_dir / 'bin' / 'ner').exists() and \
                candc_model_pos.exists() and \
                candc_model_ner.exists():
            pass
        else:
            logger.info('CANDC environmental variable may not be configured correctly.')
            logger.info('$CANDC/bin/{pos,ner} and $CANDC/models/{pos,ner} are expected to exist.')
            fail = True
    else:
        fail = True
    if fail:
        logger.info('did not find C&C parser at CANDC environmental variable.')
        logger.info('fill POS tag etc. using XX tag.')
        return annotate_XX(sentences)
    logger.info('find C&C parser at CANDC environmental variable.')
    logger.info('use C&C pipeline to annotate POS and NER infos.')
    logger.info(f'C&C models: [{candc_model_pos}, {candc_model_ner}]')
    stemmer = MorphaStemmer(str(MODEL_DIRECTORY / 'verbstem.list'))
    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)
    command = candc_cmd.format(tmpfile,
                               candc_dir,
                               candc_model_pos,
                               candc_model_ner)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,  # assumed: capture the tagger output
                            stderr=subprocess.PIPE)
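# C&C setup (a hedged sketch): the annotator looks for the C&C tools under the
# CANDC environment variable, with optional CANDC_MODEL_POS / CANDC_MODEL_NER
# overrides; $CANDC/bin/{pos,ner} and $CANDC/models/{pos,ner} must exist,
# otherwise the code falls back to annotate_XX with dummy 'XX' tags.
# The paths below are illustrative only.
#
#     os.environ['CANDC'] = '/path/to/candc'
#     os.environ['CANDC_MODEL_POS'] = '/path/to/candc/models/pos'
#     os.environ['CANDC_MODEL_NER'] = '/path/to/candc/models/ner'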