fs = sidekit.FeaturesServer(
    feature_filename_structure="{dir}/{speaker_list}/feat/{{}}.{ext}".format(
        dir=get_training('i_vector'), speaker_list=speaker_list, ext=feature_extension),
dataset_list=["energy", "cep", "vad"],
mask="[0-12]",
feat_norm="cmvn",
keep_all_features=True,
delta=True,
double_delta=True,
rasta=True,
context=None)
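# Usage sketch (an addition, assuming the FeaturesServer.load API of recent
# SIDEKIT releases): the server is consumed one show at a time; the show name
# below is a hypothetical placeholder that must match feature_filename_structure.
# load() returns the post-processed feature matrix and its per-frame labels.
features, labels = fs.load("speaker_0001/utt_0001")  # hypothetical show id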
# extract i-vectors
test_stat_long = sidekit.StatServer(test_list_long, ubm=ubm, distrib_nb=distrib_nb, feature_size=0, index=None)
test_stat_long.accumulate_stat(ubm=ubm, feature_server=fs, seg_indices=range(test_stat_long.segset.shape[0]),
num_thread=nbThread)
test_stat_short = sidekit.StatServer(test_list_short, ubm=ubm, distrib_nb=distrib_nb, feature_size=0, index=None)
test_stat_short.accumulate_stat(ubm=ubm, feature_server=fs, seg_indices=range(test_stat_short.segset.shape[0]),
num_thread=nbThread)
test_iv_long = test_stat_long.estimate_hidden(tv_mean, tv_sigma, V=tv, batch_size=100, num_thread=nbThread)[0]
test_iv_short = test_stat_short.estimate_hidden(tv_mean, tv_sigma, V=tv, batch_size=100, num_thread=nbThread)[0]
iv_list, y_list, s_list = create_data_lists(False, test_iv_long.stat1, test_iv_short.stat1,
                                            test_list_long.leftids.astype(int), test_list_short.leftids.astype(int))
# generate embeddings
embeddings, speakers, num_embeddings = generate_embeddings(iv_list, y_list, vector_size)
set_of_embeddings.append(embeddings)
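# Sanity-check sketch (not from the original pipeline): i-vectors of the same
# speaker extracted from long and short utterances should score higher in
# cosine similarity than i-vectors of different speakers.
import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two i-vectors
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# compare the first long-utterance i-vector with the first short-utterance one
sim = cosine_similarity(test_iv_long.stat1[0], test_iv_short.stat1[0])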
def train_total_variability(self, ubm, fs, distrib_nb, rank_TV, tv_iteration, train_idmap, num_threads=10):
self.logger.info('train total variability')
train_stat = sidekit.StatServer(train_idmap, ubm=ubm, distrib_nb=distrib_nb, feature_size=0, index=None)
train_stat.accumulate_stat(ubm=ubm, feature_server=fs, seg_indices=range(train_stat.segset.shape[0]),
num_thread=num_threads)
tv_mean, tv, _, __, tv_sigma = train_stat.factor_analysis(rank_f=rank_TV,
rank_g=0,
rank_h=None,
re_estimate_residual=False,
it_nb=(tv_iteration, 0, 0),
min_div=True,
ubm=ubm,
batch_size=100,
num_thread=num_threads)
sidekit.sidekit_io.write_tv_hdf5((tv, tv_mean, tv_sigma), get_experiment_nets()+"/TV_{}".format(self.network_file))
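# Companion sketch (assuming sidekit.sidekit_io.read_tv_hdf5, the reader that
# matches write_tv_hdf5 above and returns the (tv, tv_mean, tv_sigma) triple):
tv, tv_mean, tv_sigma = sidekit.sidekit_io.read_tv_hdf5(
    get_experiment_nets() + "/TV_{}".format(self.network_file))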
if not back_idmap.validate():
raise RuntimeError("Error merging tv_idmap & plda_idmap")
# Check UBM model
ubm_name = "ubm_{}.h5".format(self.NUM_GAUSSIANS)
ubm_path = os.path.join(self.BASE_DIR, "ubm", ubm_name)
if not os.path.exists(ubm_path):
# if the UBM model does not exist, train one first
logging.info("Training UBM-{} model".format(self.NUM_GAUSSIANS))
ubm = UBM(self.conf_path)
ubm.train()
# load the trained UBM model
logging.info("Loading trained UBM-{} model".format(self.NUM_GAUSSIANS))
ubm = sidekit.Mixture()
ubm.read(ubm_path)
back_stat = sidekit.StatServer(statserver_file_name=back_idmap, ubm=ubm)
# Create Feature Server
fs = self.createFeatureServer()
# Jointly compute the sufficient statistics of TV and (if enabled) PLDA data
back_filename = 'back_stat_{}.h5'.format(self.NUM_GAUSSIANS)
if not os.path.isfile(os.path.join(self.BASE_DIR, "stat", back_filename)):
# BUG: don't use self.NUM_THREADS when assigning num_thread,
# as it's prone to race conditions
back_stat.accumulate_stat(
ubm=ubm,
feature_server=fs,
seg_indices=range(back_stat.segset.shape[0])
)
back_stat.write(os.path.join(self.BASE_DIR, "stat", back_filename))
# Load sufficient statistics and extract i-vectors from PLDA training data
if self.ENABLE_PLDA:
plda_filename = 'plda_stat_{}.h5'.format(self.NUM_GAUSSIANS)
if not os.path.isfile(os.path.join(self.BASE_DIR, "stat", plda_filename)):
plda_stat = sidekit.StatServer.read_subset(
os.path.join(self.BASE_DIR, "stat", back_filename),
plda_idmap
)
plda_stat.write(os.path.join(self.BASE_DIR, "stat", plda_filename))
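# Sketch of the extraction step the comment above announces, mirroring the
# estimate_hidden call used earlier for the test lists. tv, tv_mean and
# tv_sigma are assumed to come from the trained total-variability model;
# num_thread stays at 1 because of the race condition flagged in the BUG note.
plda_stat = sidekit.StatServer.read_subset(
    os.path.join(self.BASE_DIR, "stat", plda_filename), plda_idmap)
plda_iv = plda_stat.estimate_hidden(tv_mean, tv_sigma, V=tv,
                                    batch_size=100, num_thread=1)[0]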
# Load sufficient statistics from test data
filename = 'test_stat_{}.h5'.format(self.NUM_GAUSSIANS)
if not os.path.isfile(os.path.join(self.BASE_DIR, "stat", filename)):
test_idmap = sidekit.IdMap.read(os.path.join(self.BASE_DIR, "task", "test_idmap.h5"))
test_stat = sidekit.StatServer(statserver_file_name=test_idmap, ubm=ubm)
# Create Feature Server
fs = self.createFeatureServer()
# Compute the sufficient statistics of the test data
# BUG: don't use self.NUM_THREADS when assigning num_thread as it's prone to race conditions
test_stat.accumulate_stat(ubm=ubm,
feature_server=fs,
seg_indices=range(test_stat.segset.shape[0])
)
test_stat.write(os.path.join(self.BASE_DIR, "stat", filename))
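# Closing sketch: with the test statistics on disk, i-vectors are obtained the
# same way as in the long/short snippets above (assumes the StatServer
# constructor reloads a saved statistics file, and that tv, tv_mean, tv_sigma
# come from the trained total-variability model; single thread per the BUG notes).
test_stat = sidekit.StatServer(statserver_file_name=os.path.join(self.BASE_DIR, "stat", filename))
test_iv = test_stat.estimate_hidden(tv_mean, tv_sigma, V=tv, batch_size=100, num_thread=1)[0]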