# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_omikuji_create_train_file(tmpdir, project, datadir):
    """Creating the Omikuji training file should keep only documents whose
    subject URI is known to the vocabulary (1 of the 3 fixture documents)."""
    docfile = tmpdir.join('document.tsv')
    # Annif document corpus TSV format: text<TAB><subject-uri>
    # (the URIs must be wrapped in angle brackets to be parsed)
    docfile.write("nonexistent\t<http://example.com/nonexistent>\n" +
                  "arkeologia\t<http://www.yso.fi/onto/yso/p1265>\n" +
                  "...\t<http://example.com/none>")
    corpus = annif.corpus.DocumentFile(str(docfile))
    omikuji_type = annif.backend.get_backend('omikuji')
    omikuji = omikuji_type(
        backend_id='omikuji',
        config_params={},
        project=project)
    # renamed from `input`: don't shadow the builtin
    texts = (doc.text for doc in corpus.documents)
    veccorpus = omikuji.create_vectorizer(texts, {})
    omikuji._create_train_file(veccorpus, corpus)
    assert datadir.join('omikuji-train.txt').exists()
    traindata = datadir.join('omikuji-train.txt').read().splitlines()
    assert len(traindata) == 2  # header + 1 example
    examples, features, labels = map(int, traindata[0].split())
    assert examples == 1
    assert features == 2
    assert labels == 125
def vw_corpus(tmpdir):
    """return a small document corpus for testing VW training

    NOTE(review): this looks like a pytest fixture; confirm the
    @pytest.fixture decorator exists in the full file."""
    docfile = tmpdir.join('document.tsv')
    # Annif document corpus TSV format: text<TAB><subject-uri>
    # (the URIs must be wrapped in angle brackets to be parsed)
    docfile.write("nonexistent\t<http://example.com/nonexistent>\n" +
                  "arkeologia\t<http://www.yso.fi/onto/yso/p1265>\n" +
                  "...\t<http://example.com/none>")
    return annif.corpus.DocumentFile(str(docfile))
def test_nn_ensemble_train_and_learn(app, tmpdir):
    """Train the nn_ensemble backend on a tiny corpus and check that the
    Keras model file appears (non-empty) in the project data directory."""
    project = annif.project.get_project('dummy-en')
    nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
    nn_ensemble = nn_ensemble_type(
        backend_id='nn_ensemble',
        config_params={'sources': 'dummy-en'},
        project=project)
    tmpfile = tmpdir.join('document.tsv')
    # Annif document corpus TSV format: text<TAB><subject-uri>
    # (the URIs must be wrapped in angle brackets to be parsed)
    tmpfile.write("dummy\t<http://example.org/dummy>\n" +
                  "another\t<http://example.org/dummy>\n" +
                  "none\t<http://example.org/none>")
    document_corpus = annif.corpus.DocumentFile(str(tmpfile))
    # training needs the Flask application context for project access
    with app.app_context():
        nn_ensemble.train(document_corpus)
    datadir = py.path.local(project.datadir)
    assert datadir.join('nn-model.h5').exists()
    assert datadir.join('nn-model.h5').size() > 0
    # test online learning
    modelfile = datadir.join('nn-model.h5')
    old_size = modelfile.size()
    old_mtime = modelfile.mtime()
    time.sleep(0.1)  # make sure the timestamp has a chance to increase
    # NOTE(review): old_size/old_mtime are never asserted here -- the learn()
    # call and its assertions appear to be missing; this chunk looks
    # truncated. Confirm against the complete test file.
def test_subjectset_uris():
    """A SubjectSet parsed from URI<TAB>label lines must expose the URIs."""
    # Each line is "<uri>\tlabel"; the angle-bracketed URIs are required,
    # otherwise the asserted URIs could never appear in the parsed set.
    data = """<http://example.org/dummy>\tdummy
    <http://example.org/another>\tanother
    """
    sset = annif.corpus.SubjectSet.from_string(data)
    assert sset.has_uris()
    assert len(sset.subject_uris) == 2
    assert "http://example.org/dummy" in sset.subject_uris
    assert "http://example.org/another" in sset.subject_uris
def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project):
    """Training and online learning on an empty corpus must be no-ops:
    the train file stays empty and the model file keeps its size."""
    backend_type = annif.backend.get_backend('vw_multi')
    backend = backend_type(
        backend_id='vw_multi',
        config_params={
            'chunksize': 4,
            'learning_rate': 0.5,
            'loss_function': 'hinge'},
        datadir=str(datadir))
    empty_file = tmpdir.ensure('empty.tsv')
    empty_corpus = annif.corpus.DocumentFile(str(empty_file))
    backend.train(empty_corpus, project)
    trainfile = datadir.join('vw-train.txt')
    assert trainfile.exists()
    assert trainfile.size() == 0
    # test online learning: the model must be left untouched
    modelfile = datadir.join('vw-model')
    size_before = modelfile.size()
    backend.learn(empty_corpus, project)
    assert modelfile.size() == size_before
    assert trainfile.size() == 0
def _corpus_to_vectors(self, corpus):
    """Run every document in *corpus* through all configured source
    projects and return a pair of float32 numpy arrays: per-document
    weighted score vectors (one column per source) and the corresponding
    gold-standard subject vectors."""
    # resolve the configured source projects and their weights
    sources = [(annif.project.get_project(project_id), weight)
               for project_id, weight
               in annif.util.parse_sources(self.params['sources'])]
    score_vectors = []
    true_vectors = []
    for doc in corpus.documents:
        # one weighted score vector per source, stacked then transposed
        per_source = [src.suggest(doc.text).vector * weight
                      for src, weight in sources]
        score_vectors.append(
            np.array(per_source, dtype=np.float32).transpose())
        # gold standard subjects for this document as a binary vector
        gold = annif.corpus.SubjectSet((doc.uris, doc.labels))
        true_vectors.append(gold.as_vector(self.project.subjects))
    scores = np.array(score_vectors, dtype=np.float32)
    true = np.array(true_vectors, dtype=np.float32)
    return (scores, true)
corpus will be returned as an instance of DocumentCorpus."""
def open_doc_path(path):
    """open a single path and return it as a DocumentCorpus"""
    # plain files become DocumentFile corpora; directories become
    # DocumentDirectory corpora that require per-document subject files
    if not os.path.isdir(path):
        return annif.corpus.DocumentFile(path)
    return annif.corpus.DocumentDirectory(path, require_subjects=True)
# Tail of an enclosing function whose start is outside this chunk:
# combine zero, one or many paths into a single DocumentCorpus.
if len(paths) == 0:
    # no input given: warn and read the null device so the caller still
    # gets a (trivially empty) corpus instead of crashing
    logger.warning('Reading empty file')
    docs = open_doc_path(os.path.devnull)
elif len(paths) == 1:
    docs = open_doc_path(paths[0])
else:
    # several inputs: wrap them all into one combined corpus
    corpora = [open_doc_path(path) for path in paths]
    docs = annif.corpus.CombinedCorpus(corpora)
return docs