Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _corpus_to_vectors(self, corpus):
    """Run a corpus through all source projects and collect the vectors.

    Each document in ``corpus.documents`` is passed to the ``suggest``
    method of every source project listed in ``self.params['sources']``,
    and the resulting ``hits.vector`` is multiplied by the weight
    configured for that source.

    Returns:
        A tuple ``(scores, true)`` of float32 NumPy arrays. ``scores``
        stacks, per document, the transposed array of weighted source
        vectors; ``true`` stacks, per document, the vector produced by
        ``SubjectSet.as_vector`` for the document's own subjects.
    """
    # Resolve the source projects once, not once per document.
    sources = [
        (annif.project.get_project(project_id), weight)
        for project_id, weight
        in annif.util.parse_sources(self.params['sources'])
    ]

    score_vectors = []
    true_vectors = []
    for doc in corpus.documents:
        doc_scores = [
            source_project.suggest(doc.text).vector * weight
            for source_project, weight in sources
        ]
        # Transpose so the per-source axis comes last -- presumably giving
        # (subjects, sources) when each hits.vector is 1-D; TODO confirm.
        score_vectors.append(
            np.array(doc_scores, dtype=np.float32).transpose())
        subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
        true_vectors.append(subjects.as_vector(self.project.subjects))

    # collect the results into a single array, considering weights
    scores = np.array(score_vectors, dtype=np.float32)
    # collect the gold standard values into another array
    true = np.array(true_vectors, dtype=np.float32)
    # Without this return the arrays built above were silently discarded.
    return scores, true
def initialize(self):
    """Load the stored PAV model of every configured source project.

    Does nothing when ``self._models`` is already set. Otherwise, for
    each source parsed from ``self.params['sources']``, the file named
    ``MODEL_FILE_PREFIX + <source project id>`` inside ``self.datadir``
    is loaded with joblib and stored under the source project id.

    Raises:
        NotInitializedException: if the model file of any source
            project does not exist.
    """
    if self._models is not None:
        return  # already initialized

    # Load into a local dict and publish it only once every model has
    # been read. Assigning self._models up front would leave a non-None
    # (possibly partial) dict behind when a file is missing, so the next
    # call would wrongly take the "already initialized" shortcut.
    models = {}
    sources = annif.util.parse_sources(self.params['sources'])
    for source_project_id, _ in sources:
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        path = os.path.join(self.datadir, model_filename)
        if not os.path.exists(path):
            raise NotInitializedException(
                "PAV model file '{}' not found".format(path),
                backend_id=self.backend_id)
        self.debug('loading PAV model from {}'.format(path))
        models[source_project_id] = joblib.load(path)
    self._models = models
def _source_project_ids(self):
    """Return the project ids listed in ``self.params['sources']``.

    The ids are the first element of each pair yielded by
    ``annif.util.parse_sources``, kept in the order they were parsed.
    """
    project_ids = []
    for source in annif.util.parse_sources(self.params['sources']):
        # Each parsed source is a pair; only its first element is needed.
        project_id, _unused = source
        project_ids.append(project_id)
    return project_ids
def _suggest(self, text, params):
    """Suggest subjects for ``text`` by combining the source projects' hits.

    The sources are parsed from ``params['sources']`` (the parameters
    passed in, not ``self.params``), queried through
    ``_suggest_with_sources`` and combined by
    ``_merge_hits_from_sources``. The merged hits are returned after
    their count has been logged at debug level.
    """
    parsed_sources = annif.util.parse_sources(params['sources'])
    merged = self._merge_hits_from_sources(
        self._suggest_with_sources(text, parsed_sources), params)
    hit_count = len(merged)
    self.debug('{} hits after merging'.format(hit_count))
    return merged