How to use the annif.corpus.SubjectSet class in Annif

To help you get started, we've selected a few Annif examples based on popular ways SubjectSet is used in public projects.
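
Before looking at the project snippets, here is a minimal sketch of the pattern they all share: building a SubjectSet from parallel lists of subject URIs and labels and inspecting the result. The URIs and labels are copied from the test below; the rest is illustrative.

import annif.corpus

# Build a SubjectSet from a (uris, labels) tuple, the form used throughout
# the examples on this page.
uris = ['http://www.yso.fi/onto/yso/p10849',
        'http://www.yso.fi/onto/yso/p19740']
labels = ['arkeologit', 'obeliskit']
sset = annif.corpus.SubjectSet((uris, labels))

# The set reports whether it was built from URIs and exposes them directly.
assert sset.has_uris()
print(len(sset.subject_uris))  # expected: 2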

Example from NatLibFi/Annif (tests/test_corpus.py):
def test_subjectset_from_tuple():
    uris = ['http://www.yso.fi/onto/yso/p10849',
            'http://www.yso.fi/onto/yso/p19740']
    labels = ['arkeologit', 'obeliskit']
    sset = annif.corpus.SubjectSet((uris, labels))
    assert sset.has_uris()
    assert len(sset.subject_uris) == 2
    assert 'http://www.yso.fi/onto/yso/p10849' in sset.subject_uris
    assert 'http://www.yso.fi/onto/yso/p19740' in sset.subject_uris
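
Because this set is built from URIs, has_uris() returns True and subject_uris exposes them for membership checks and counting. A SubjectSet can also be created from label-only data, as the from_string call in the next example shows.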

Example from NatLibFi/Annif (tests/test_eval.py):
def test_evaluation_batch(subject_index):
    batch = annif.eval.EvaluationBatch(subject_index)

    gold_set = annif.corpus.SubjectSet.from_string(
        '\tarkeologit')
    hits1 = annif.suggestion.ListSuggestionResult([
        annif.suggestion.SubjectSuggestion(
            uri='http://www.yso.fi/onto/yso/p10849',
            label='arkeologit',
            score=1.0)], subject_index)
    batch.evaluate(hits1, gold_set)
    hits2 = annif.suggestion.ListSuggestionResult([
        annif.suggestion.SubjectSuggestion(
            uri='http://www.yso.fi/onto/yso/p1747',
            label='egyptologit',
            score=1.0)], subject_index)
    batch.evaluate(hits2, gold_set)
    results = batch.results()
    assert results['Precision (doc avg)'] == 0.5
    assert results['Recall (doc avg)'] == 0.5
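
Here the gold standard is parsed with SubjectSet.from_string from a tab-separated line in which the URI field is left empty and only the label 'arkeologit' is given. The first suggestion batch matches that single gold subject and the second does not, so averaging over the two evaluated documents gives 0.5 for both document-averaged precision and recall.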

Example from NatLibFi/Annif (annif/backend/pav.py):
def _suggest_train_corpus(source_project, corpus):
    # Collect the suggestion score vector and the gold-standard subject
    # vector for every document in the corpus; both are returned as NumPy arrays.
    scores = []
    true = []
    for doc in corpus.documents:
        hits = source_project.suggest(doc.text)
        scores.append(hits.vector)
        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        true.append(subjects.as_vector(source_project.subjects))
    return np.array(scores), np.array(true)
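
The key SubjectSet call here is as_vector, which turns a document's gold-standard subjects into a binary vector aligned with the project's subject index. A hedged sketch of the same pattern as a standalone helper (gold_matrix is a hypothetical name; corpus and project are assumed to be an Annif document corpus and an initialized project):

import numpy as np
import annif.corpus

def gold_matrix(corpus, project):
    # Hypothetical helper: one row per document, one column per subject in
    # the project's subject index, with truthy entries marking gold subjects.
    rows = []
    for doc in corpus.documents:
        sset = annif.corpus.SubjectSet((doc.uris, doc.labels))
        rows.append(sset.as_vector(project.subjects))
    return np.array(rows)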

Example from NatLibFi/Annif (annif/cli.py, excerpt from the eval command):
    """Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)

    hit_filter = SuggestionFilter(limit=limit, threshold=threshold)
    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    docs = open_documents(paths)
    for doc in docs.documents:
        results = project.suggest(doc.text, backend_params)
        hits = hit_filter(results)
        eval_batch.evaluate(hits,
                            annif.corpus.SubjectSet((doc.uris, doc.labels)))

    template = "{0:<20}\t{1}"
    for metric, score in eval_batch.results().items():
        click.echo(template.format(metric + ":", score))
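
In this CLI excerpt, SubjectSet supplies the gold standard: each document's known URIs and labels from the corpus are wrapped in a SubjectSet and passed to EvaluationBatch.evaluate together with the filtered suggestions, and the accumulated metrics are printed at the end.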

Example from NatLibFi/Annif (annif/backend/vw_ensemble.py):
def _doc_to_example(self, doc, project, source_projects):
    # Turn one document into training examples for the VW ensemble backend:
    # one example per subject that is either a gold-standard subject of the
    # document or was scored above zero by at least one source project.
    examples = []
    subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
    true = subjects.as_vector(project.subjects)
    score_vector = self._doc_score_vector(doc, source_projects)
    for subj_id in range(len(true)):
        if true[subj_id] or score_vector[:, subj_id].sum() > 0.0:
            ex = (subj_id, self._format_example(
                subj_id,
                score_vector[:, subj_id],
                true[subj_id]))
            examples.append(ex)
    return examples
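
As in the PAV backend above, the SubjectSet built from doc.uris and doc.labels is converted with as_vector into a per-subject truth vector; here it decides which subject IDs become training examples for the ensemble, namely those that are gold-standard subjects or received a nonzero score from at least one source project.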