How to use the annif.corpus module in annif

To help you get started, we've selected a few annif examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github NatLibFi / Annif / tests / test_backend_omikuji.py View on Github external
def test_omikuji_create_train_file(tmpdir, project, datadir):
    """Verify that the Omikuji backend writes a train file keeping only the
    documents whose subjects resolve in the vocabulary.

    The 3-line input corpus contains two lines with unresolvable subjects,
    so the resulting train file holds a header line plus one example.
    """
    tmpfile = tmpdir.join('document.tsv')
    tmpfile.write("nonexistent\thttp://example.com/nonexistent\n" +
                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265\n" +
                  "...\thttp://example.com/none")
    corpus = annif.corpus.DocumentFile(str(tmpfile))
    omikuji_type = annif.backend.get_backend('omikuji')
    omikuji = omikuji_type(
        backend_id='omikuji',
        config_params={},
        project=project)
    # renamed from 'input' to avoid shadowing the builtin input()
    texts = (doc.text for doc in corpus.documents)
    veccorpus = omikuji.create_vectorizer(texts, {})
    omikuji._create_train_file(veccorpus, corpus)
    assert datadir.join('omikuji-train.txt').exists()
    traindata = datadir.join('omikuji-train.txt').read().splitlines()
    assert len(traindata) == 2  # header + 1 example
    # header format: <num_examples> <num_features> <num_labels>
    examples, features, labels = map(int, traindata[0].split())
    assert examples == 1
    assert features == 2
    assert labels == 125
github NatLibFi / Annif / tests / test_backend_vw_multi.py View on Github external
def vw_corpus(tmpdir):
    """return a small document corpus for testing VW training"""
    doc_lines = [
        "nonexistent\thttp://example.com/nonexistent",
        "arkeologia\thttp://www.yso.fi/onto/yso/p1265",
        "...\thttp://example.com/none",
    ]
    doc_file = tmpdir.join('document.tsv')
    doc_file.write("\n".join(doc_lines))
    return annif.corpus.DocumentFile(str(doc_file))
github NatLibFi / Annif / tests / test_backend_nn_ensemble.py View on Github external
def test_nn_ensemble_train_and_learn(app, tmpdir):
    # Train an nn_ensemble backend on a tiny 3-line TSV corpus and check
    # that a non-empty Keras model file (nn-model.h5) appears in the
    # project's data directory.
    # NOTE(review): this snippet appears truncated — old_size/old_mtime
    # are recorded for an online-learning check, but the learn() call and
    # its assertions are not visible here.
    project = annif.project.get_project('dummy-en')
    nn_ensemble_type = annif.backend.get_backend("nn_ensemble")
    nn_ensemble = nn_ensemble_type(
        backend_id='nn_ensemble',
        config_params={'sources': 'dummy-en'},
        project=project)

    tmpfile = tmpdir.join('document.tsv')
    tmpfile.write("dummy\thttp://example.org/dummy\n" +
                  "another\thttp://example.org/dummy\n" +
                  "none\thttp://example.org/none")
    document_corpus = annif.corpus.DocumentFile(str(tmpfile))

    # training needs the Flask application context (project registry)
    with app.app_context():
        nn_ensemble.train(document_corpus)

    datadir = py.path.local(project.datadir)
    assert datadir.join('nn-model.h5').exists()
    assert datadir.join('nn-model.h5').size() > 0

    # test online learning
    modelfile = datadir.join('nn-model.h5')

    # baseline snapshot so the (truncated) learn() step below can verify
    # the model file was rewritten
    old_size = modelfile.size()
    old_mtime = modelfile.mtime()

    time.sleep(0.1)  # make sure the timestamp has a chance to increase
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_subjectset_uris():
    """A SubjectSet parsed from string input should report and expose URIs."""
    data = """\tdummy
    \tanother
    """

    subject_set = annif.corpus.SubjectSet.from_string(data)
    assert subject_set.has_uris()
    assert len(subject_set.subject_uris) == 2
    for uri in ("http://example.org/dummy", "http://example.org/another"):
        assert uri in subject_set.subject_uris
github NatLibFi / Annif / tests / test_backend_vw_multi.py View on Github external
def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project):
    """Training and online learning on an empty corpus must produce an
    empty train file and leave the model file unchanged."""
    backend_type = annif.backend.get_backend('vw_multi')
    backend = backend_type(
        backend_id='vw_multi',
        config_params={
            'chunksize': 4,
            'learning_rate': 0.5,
            'loss_function': 'hinge'},
        datadir=str(datadir))

    empty_file = tmpdir.ensure('empty.tsv')
    empty_corpus = annif.corpus.DocumentFile(str(empty_file))

    backend.train(empty_corpus, project)
    train_file = datadir.join('vw-train.txt')
    assert train_file.exists()
    assert train_file.size() == 0

    # online learning on no documents should be a no-op for the model
    model_file = datadir.join('vw-model')
    size_before = model_file.size()

    backend.learn(empty_corpus, project)

    assert model_file.size() == size_before
    assert train_file.size() == 0
github NatLibFi / Annif / annif / backend / nn_ensemble.py View on Github external
def _corpus_to_vectors(self, corpus):
    """Feed every document through the configured source projects and
    return (scores, true): the weighted per-source score tensor and the
    gold-standard subject vectors, both as float32 numpy arrays."""
    sources = [(annif.project.get_project(project_id), weight)
               for project_id, weight
               in annif.util.parse_sources(self.params['sources'])]

    score_rows = []
    gold_rows = []
    for doc in corpus.documents:
        # one weighted score vector per source project for this document
        per_source = [source.suggest(doc.text).vector * weight
                      for source, weight in sources]
        # shape: (n_subjects, n_sources) after the transpose
        score_rows.append(np.array(per_source, dtype=np.float32).transpose())
        gold = annif.corpus.SubjectSet((doc.uris, doc.labels))
        gold_rows.append(gold.as_vector(self.project.subjects))

    scores = np.array(score_rows, dtype=np.float32)
    true = np.array(gold_rows, dtype=np.float32)
    return (scores, true)
github NatLibFi / Annif / annif / cli.py View on Github external
corpus will be returned as an instance of DocumentCorpus."""
    # NOTE(review): the enclosing function's signature is outside this
    # view — 'paths' and 'logger' are presumably defined there; confirm
    # against the full annif/cli.py.

    def open_doc_path(path):
        """open a single path and return it as a DocumentCorpus"""
        # directories become DocumentDirectory (requiring subject files);
        # everything else is treated as a single document file
        if os.path.isdir(path):
            return annif.corpus.DocumentDirectory(path, require_subjects=True)
        return annif.corpus.DocumentFile(path)

    if len(paths) == 0:
        # no input paths: warn and fall back to the null device so the
        # caller still gets a valid (empty) corpus
        logger.warning('Reading empty file')
        docs = open_doc_path(os.path.devnull)
    elif len(paths) == 1:
        docs = open_doc_path(paths[0])
    else:
        # several inputs: present them as one combined corpus
        corpora = [open_doc_path(path) for path in paths]
        docs = annif.corpus.CombinedCorpus(corpora)
    return docs