How to use the annif.corpus.DocumentDirectory class in annif

To help you get started, we’ve selected a few annif examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github NatLibFi / Annif / tests / test_backend.py View on Github external
def test_learn_dummy(app, project, tmpdir):
    """After learning from a document directory, the dummy backend is
    expected to return exactly one suggestion, for key1 with score 1.0."""
    backend_cls = annif.backend.get_backend("dummy")
    dummy = backend_cls(backend_id='dummy', config_params={},
                        datadir=app.config['DATADIR'])

    # Two text documents, each paired with a .tsv subject file.
    for filename, content in (('doc1.txt', 'doc1'),
                              ('doc1.tsv', '\tkey1'),
                              ('doc2.txt', 'doc2'),
                              ('doc2.tsv', '\tkey2')):
        tmpdir.join(filename).write(content)
    corpus = annif.corpus.DocumentDirectory(str(tmpdir))

    dummy.learn(corpus, project)

    suggestions = dummy.suggest(text='this is some text', project=project)
    assert len(suggestions) == 1
    top = suggestions[0]
    assert top.uri == 'http://example.org/key1'
    assert top.label == 'key1'
    assert top.score == 1.0
github NatLibFi / Annif / tests / test_project.py View on Github external
def test_project_learn(app, tmpdir):
    """After learning from a document directory, the 'dummy-fi' project is
    expected to return exactly one suggestion, for key1 with score 1.0."""
    # Two text documents, each paired with a .tsv subject file.
    for filename, content in (('doc1.txt', 'doc1'),
                              ('doc1.tsv', '\tkey1'),
                              ('doc2.txt', 'doc2'),
                              ('doc2.tsv', '\tkey2')):
        tmpdir.join(filename).write(content)
    corpus = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        dummy_project = annif.project.get_project('dummy-fi')
        dummy_project.learn(corpus)
        suggestions = dummy_project.suggest('this is some text')
        assert len(suggestions) == 1
        top = suggestions[0]
        assert top.uri == 'http://example.org/key1'
        assert top.label == 'key1'
        assert top.score == 1.0
github NatLibFi / Annif / tests / test_project.py View on Github external
def test_project_learn_not_supported(app, tmpdir):
    """Calling learn() on the 'tfidf-fi' project is expected to raise
    NotSupportedException."""
    # Two text documents, each paired with a .tsv subject file.
    for filename, content in (('doc1.txt', 'doc1'),
                              ('doc1.tsv', '\tkey1'),
                              ('doc2.txt', 'doc2'),
                              ('doc2.tsv', '\tkey2')):
        tmpdir.join(filename).write(content)
    corpus = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        tfidf_project = annif.project.get_project('tfidf-fi')
        with pytest.raises(NotSupportedException):
            tfidf_project.learn(corpus)
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_docdir_key(tmpdir):
    """Iterating a DocumentDirectory is expected to yield one
    (text path, key path) entry per .txt file; the key path is None for a
    document that has no .key file."""
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.key').write('key1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.key').write('key2')
    # doc3 deliberately has no accompanying .key file.
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir))
    # sorted() accepts any iterable and already returns a new list, so no
    # intermediate list() is needed. Sorting makes the assertions below
    # independent of the directory's iteration order.
    files = sorted(docdir)
    assert len(files) == 3
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.key'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.key'))
    assert files[2][0] == str(tmpdir.join('doc3.txt'))
    assert files[2][1] is None
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_docdir_key_as_doccorpus(tmpdir, subject_index):
    """With require_subjects=True, docdir.documents is expected to contain
    only the documents that have a .key file, each carrying its text and a
    set of subject URIs (presumably resolved via the subject index)."""
    # doc3 has no .key file and is expected to be left out.
    for filename, content in (('doc1.txt', 'doc1'),
                              ('doc1.key', 'arkeologit'),
                              ('doc2.txt', 'doc2'),
                              ('doc2.key', 'kalliotaide'),
                              ('doc3.txt', 'doc3')):
        tmpdir.join(filename).write(content)

    corpus = annif.corpus.DocumentDirectory(
        str(tmpdir), require_subjects=True)
    corpus.set_subject_index(subject_index)
    documents = list(corpus.documents)
    assert len(documents) == 2
    first, second = documents[0], documents[1]
    assert first.text == 'doc1'
    assert first.uris == {'http://www.yso.fi/onto/yso/p10849'}
    assert second.text == 'doc2'
    assert second.uris == {'http://www.yso.fi/onto/yso/p13027'}
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_docdir_key_require_subjects(tmpdir):
    """With require_subjects=True, iterating a DocumentDirectory is expected
    to yield only the documents that have an accompanying .key file."""
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.key').write('\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.key').write('\tkey2')
    # doc3 deliberately has no .key file and is expected to be skipped.
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
    # sorted() accepts any iterable and already returns a new list, so no
    # intermediate list() is needed. Sorting makes the assertions below
    # independent of the directory's iteration order.
    files = sorted(docdir)
    assert len(files) == 2
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.key'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.key'))
github NatLibFi / Annif / annif / cli.py View on Github external
def open_doc_path(path):
    """Return a corpus object for a single path.

    A directory is wrapped in a DocumentDirectory created with
    require_subjects=True; any other path is wrapped in a DocumentFile.
    """
    if not os.path.isdir(path):
        return annif.corpus.DocumentFile(path)
    return annif.corpus.DocumentDirectory(path, require_subjects=True)
github NatLibFi / Annif / annif / cli.py View on Github external
def run_index(project_id, directory, suffix, force,
              limit, threshold, backend_param):
    """
    Index a directory with documents, suggesting subjects for each document.
    Write the results in TSV files with the given suffix.
    """
    # NOTE: the docstring above is left untouched because it may be shown
    # to users as command help text.
    #
    # For every document in `directory`, the result file name is the
    # document name with its trailing ".txt" replaced by `suffix`. Existing
    # result files are skipped unless `force` is true. Each written line has
    # the form "<uri>\tlabel\tscore".
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = SuggestionFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # A callable replacement inserts `suffix` literally. A plain
        # replacement string would have backslash sequences (group
        # references, "\n", ...) interpreted by re.sub, or raise re.error.
        subjectfilename = re.sub(r'\.txt$', lambda _match: suffix,
                                 docfilename)
        # Decide whether to skip before reading the document, so skipped
        # documents cost no I/O or decoding.
        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue
        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()
        # Compute the suggestions before opening (and thereby truncating)
        # the output file, so a failure in suggest() cannot leave behind an
        # empty or clobbered result file.
        results = project.suggest(text, backend_params)
        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)