Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
source_project_id, min_docs))
source_project = annif.project.get_project(source_project_id)
# suggest subjects for the training corpus
scores, true = self._suggest_train_corpus(source_project, corpus)
# create the concept-specific PAV regression models
pav_regressions = {}
for cid in range(len(source_project.subjects)):
if true[:, cid].sum() < min_docs:
continue # don't create model b/c of too few examples
reg = IsotonicRegression(out_of_bounds='clip')
reg.fit(scores[:, cid], true[:, cid])
pav_regressions[source_project.subjects[cid][0]] = reg
self.info("created PAV model for {} concepts".format(
len(pav_regressions)))
model_filename = self.MODEL_FILE_PREFIX + source_project_id
annif.util.atomic_save(
pav_regressions,
self.datadir,
model_filename,
method=joblib.dump)
def train(self, corpus, project):
    """Fit a TF-IDF vectorizer on the corpus subjects and build the index.

    Raises NotSupportedException when corpus.is_empty() is true. Otherwise
    the output of _generate_subjects_from_documents is used to fit a new
    TfidfVectorizer, which is kept on the instance and handed to
    annif.util.atomic_save (with joblib.dump as the writer) before the
    vectorized corpus is passed on to _create_index.
    """
    if corpus.is_empty():
        raise NotSupportedException(
            'Cannot train tfidf project with no documents')

    self.info('transforming subject corpus')
    subject_texts = self._generate_subjects_from_documents(corpus, project)

    self.info('creating vectorizer')
    self._vectorizer = TfidfVectorizer()
    vectorized = self._vectorizer.fit_transform(subject_texts)

    # store the fitted vectorizer first, then index what it produced
    annif.util.atomic_save(
        self._vectorizer,
        self.datadir,
        self.VECTORIZER_FILE,
        method=joblib.dump,
    )
    self._create_index(vectorized)
def _create_train_file(self, corpus, project):
    """Build the VW training examples and save them as the train file.

    The examples returned by _create_examples are handed to
    annif.util.atomic_save, which is given self._write_train_file as the
    writer and self.TRAIN_FILE (inside self.datadir) as the destination.
    """
    self.info('creating VW train file')
    train_examples = self._create_examples(corpus, project)
    annif.util.atomic_save(
        train_examples,
        self.datadir,
        self.TRAIN_FILE,
        method=self._write_train_file,
    )
def _create_train_file(self, corpus, project):
    """Save the subject frequencies and the VW training examples.

    _create_examples is expected to yield (subject id, example) pairs.
    The subject ids are counted into self._subject_freq and saved via
    self._write_freq_file; the examples are then saved via
    self._write_train_file. Both saves go through annif.util.atomic_save
    into self.datadir.
    """
    self.info('creating VW train file')
    pairs = self._create_examples(corpus, project)

    # NOTE(review): pairs is traversed twice below, so it must be
    # re-iterable (presumably a list) -- confirm against _create_examples.
    self._subject_freq = collections.Counter(
        subject_id for subject_id, _ in pairs)
    annif.util.atomic_save(
        self._subject_freq,
        self.datadir,
        self.FREQ_FILE,
        method=self._write_freq_file,
    )

    train_examples = [example for _, example in pairs]
    annif.util.atomic_save(
        train_examples,
        self.datadir,
        self.TRAIN_FILE,
        method=self._write_train_file,
    )