Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_learn_dummy(app, project, tmpdir):
    """Learn from a small document directory using the dummy backend.

    After learning, a suggest call is expected to return exactly one
    result: key1 with a score of 1.0.
    """
    backend_cls = annif.backend.get_backend("dummy")
    dummy = backend_cls(
        backend_id='dummy',
        config_params={},
        datadir=app.config['DATADIR'])

    # Two documents, each accompanied by a same-named .tsv file.
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.tsv', '\tkey1'),
        ('doc2.txt', 'doc2'),
        ('doc2.tsv', '\tkey2'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(str(tmpdir))
    dummy.learn(corpus, project)

    hits = dummy.suggest(text='this is some text', project=project)
    assert len(hits) == 1
    top = hits[0]
    assert top.uri == 'http://example.org/key1'
    assert top.label == 'key1'
    assert top.score == 1.0
def test_project_learn(app, tmpdir):
    """Learn through the 'dummy-fi' project and check its suggestion.

    The project is looked up, trained and queried inside the application
    context; a single key1 result with score 1.0 is expected.
    """
    # Two documents, each accompanied by a same-named .tsv file.
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.tsv', '\tkey1'),
        ('doc2.txt', 'doc2'),
        ('doc2.tsv', '\tkey2'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        project = annif.project.get_project('dummy-fi')
        project.learn(corpus)
        hits = project.suggest('this is some text')

        assert len(hits) == 1
        top = hits[0]
        assert top.uri == 'http://example.org/key1'
        assert top.label == 'key1'
        assert top.score == 1.0
def test_project_learn_not_supported(app, tmpdir):
    """Expect NotSupportedException when 'tfidf-fi' is asked to learn."""
    # Two documents, each accompanied by a same-named .tsv file.
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.tsv', '\tkey1'),
        ('doc2.txt', 'doc2'),
        ('doc2.tsv', '\tkey2'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        project = annif.project.get_project('tfidf-fi')
        with pytest.raises(NotSupportedException):
            project.learn(corpus)
def test_docdir_key(tmpdir):
    """Iterate a DocumentDirectory whose documents have .key files.

    Three .txt documents are written but only the first two get a .key
    file; the third entry is expected to carry None in its place.
    """
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.key', 'key1'),
        ('doc2.txt', 'doc2'),
        ('doc2.key', 'key2'),
        ('doc3.txt', 'doc3'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(str(tmpdir))
    entries = sorted(corpus)
    assert len(entries) == 3

    # Entries with a .key file: (document path, key path).
    paired = (
        ('doc1.txt', 'doc1.key'),
        ('doc2.txt', 'doc2.key'),
    )
    for entry, (txt_name, key_name) in zip(entries, paired):
        assert entry[0] == str(tmpdir.join(txt_name))
        assert entry[1] == str(tmpdir.join(key_name))

    # doc3 has no .key file.
    last = entries[2]
    assert last[0] == str(tmpdir.join('doc3.txt'))
    assert last[1] is None
def test_docdir_key_as_doccorpus(tmpdir, subject_index):
    """Read a DocumentDirectory through its ``documents`` attribute.

    With require_subjects=True and a subject index set, only the two
    documents that have a .key file are expected, each with its text and
    a single-URI set.
    """
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.key', 'arkeologit'),
        ('doc2.txt', 'doc2'),
        ('doc2.key', 'kalliotaide'),
        ('doc3.txt', 'doc3'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(
        str(tmpdir), require_subjects=True)
    corpus.set_subject_index(subject_index)
    docs = list(corpus.documents)

    expected = (
        ('doc1', {'http://www.yso.fi/onto/yso/p10849'}),
        ('doc2', {'http://www.yso.fi/onto/yso/p13027'}),
    )
    assert len(docs) == 2
    for doc, (text, uris) in zip(docs, expected):
        assert doc.text == text
        assert doc.uris == uris
def test_docdir_key_require_subjects(tmpdir):
    """Iterate a DocumentDirectory created with require_subjects=True.

    Of the three documents written, only the two that have a .key file
    are expected to be yielded.
    """
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.key', '\tkey1'),
        ('doc2.txt', 'doc2'),
        ('doc2.key', '\tkey2'),
        ('doc3.txt', 'doc3'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(
        str(tmpdir), require_subjects=True)
    entries = sorted(corpus)
    assert len(entries) == 2

    # Expected (document path, key path) pairs, in sorted order.
    paired = (
        ('doc1.txt', 'doc1.key'),
        ('doc2.txt', 'doc2.key'),
    )
    for entry, (txt_name, key_name) in zip(entries, paired):
        assert entry[0] == str(tmpdir.join(txt_name))
        assert entry[1] == str(tmpdir.join(key_name))
def open_doc_path(path):
    """Open a single path and return it as a DocumentCorpus.

    A directory is wrapped in a DocumentDirectory (with
    require_subjects=True); anything else is wrapped in a DocumentFile.
    """
    if not os.path.isdir(path):
        return annif.corpus.DocumentFile(path)
    return annif.corpus.DocumentDirectory(path, require_subjects=True)
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.

    For every document yielded by the DocumentDirectory, the output path is
    the document path with a trailing ``.txt`` replaced by ``suffix``. An
    existing output file is skipped with a notice unless ``force`` is true.
    Each suggestion that passes the limit/threshold filter is written as
    one ``<uri>\\tlabel\\tscore`` line.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, _subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # A callable replacement makes re.sub insert the suffix literally;
        # a plain string would have backslashes and group references in a
        # user-supplied suffix interpreted as template escapes.
        subjectfilename = re.sub(r'\.txt$', lambda _match: suffix,
                                 docfilename)

        # Decide whether to skip before reading the document at all.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()

        # Compute the suggestions before opening (and truncating) the
        # output file, so a failure here does not leave an empty file or
        # wipe out previous results.
        results = project.suggest(text, backend_params)
        hits = hit_filter(results)

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)