# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): scraped/duplicated fragment — the docstring below is never
# closed and the function body is missing; a second copy of model_fit
# follows immediately after. Restore the implementation from the upstream
# source before use.
@timeit
def model_fit(self, generate_data=True, inputs=None, outputs=None,
              csv_name=None, save_model=False, meta_algo_params=None):
    """
    builds the actual training time estimator
    (currently we only support NN or RF)
    the data is either generated from scratch or taken as input
    if specified, the meta algo is saved as a pkl file along
    with associated metadata (column names, mse per bin)
    :param generate_data: bool (if set to True, calls _generate_data)
    :param inputs: pd.DataFrame chosen as input
    :param outputs: pd.DataFrame chosen as output
    :param csv_name: name of csv in case we fetch data from csv
    :param save_model: boolean set to True if the model needs to be saved
    :param meta_algo_params: params of the meta algo
    :return: meta_algo
# NOTE(review): duplicate copy of model_fit — docstring only, the
# implementation body is missing from this scrape. Deduplicate with the
# copy above and restore the body from the upstream source.
@timeit
def model_fit(self, generate_data=True, inputs=None, outputs=None, csv_name=None, save_model=False,
              meta_algo_params=None):
    """
    builds the actual training time estimator (currently we only support NN or RF)
    the data is either generated from scratch or taken as input
    if specified, the meta algo is saved as a pkl file along with associated metadata (column names, mse per bin)
    :param generate_data: bool (if set to True, calls _generate_data)
    :param inputs: pd.DataFrame chosen as input
    :param outputs: pd.DataFrame chosen as output
    :param csv_name: name of csv in case we fetch data from csv
    :param save_model: boolean set to True if the model needs to be saved
    :param meta_algo_params: params of the meta algo
    :return: meta_algo
    :rtype: scikit learn model
    """
# NOTE(review): truncated fragment — cut off right after loading
# meta_params; a second (also truncated) copy appears further below.
@timeit
def _generate_data(self, write_csv=False, validation=False):
    """
    measures training runtimes for a set of distinct parameters
    if specified, saves results in a csv (row by row)
    :param write_csv: set to True in order to write
    outputs in a dedicated csv file
    :param validation: boolean, set true if data is used
    for validation, use only once the model has been trained
    :return: inputs, outputs
    :rtype: pd.DataFrame
    """
    # only announce generation at higher verbosity levels
    if self.verbose >= 2:
        self.logger.info('''Generating dummy training durations to create a training set''')
    # self.params: per-algo parameter config (per _permute's docstring,
    # these are the "params from json file") — confirm against __init__
    meta_params = self.params
# NOTE(review): truncated copy — the trailing return statement is missing
# here; the complete copy further below returns
# (inputs, outputs, estimated_outputs, avg_weighted_error).
@timeit
def model_validate(self):
    """
    measures training runtimes and compares to actual
    runtimes once the model has been trained
    :return: results dataframe and error rate
    :rtype: pd.DataFrame and float
    """
    # validation mode: regenerates data and obtains the fitted model's
    # estimates alongside the measured runtimes
    inputs, outputs, estimated_outputs = \
        self._generate_data(validation=True)
    actual_values = outputs['output']
    estimated_values = estimated_outputs['estimated_outputs']
    # error weighted by actual runtime: dot(y, y - y_hat) / sum(y),
    # so longer-running fits contribute more to the average
    dot_product = np.dot(actual_values, actual_values - estimated_values)
    avg_weighted_error = dot_product / sum(actual_values)
# NOTE(review): truncated fragment — the docstring is cut off mid-way
# (never closed) and the body is missing; a duplicate copy follows below.
@timeit
def _permute(self, concat_dic, parameters_list,
             external_parameters_list, meta_params,
             algo_type, write_csv=False, validation=False):
    """
    performs a for loop over every possible param combination
    to generate data on the specified algo abstracted to support
    any sklearn algo runtime of this function depends on the
    specified drop_rate: the higher it is, the less data
    will be generated a minimum of 4 data points is generated
    :param concat_dic: all params + all values range dictionary
    :param parameters_list: all internal parameters names
    :param external_parameters_list: all external parameters names
    :param meta_params: params from json file (equivalent to self.params)
    :param algo_type: unsupervised / supervised / classification
@timeit
def model_validate(self):
    """
    Compares the fitted meta-model's estimated runtimes against freshly
    measured ones; use only once the model has been trained.

    :return: inputs, outputs, estimated outputs and the weighted error
    :rtype: pd.DataFrame, pd.DataFrame, pd.DataFrame, float
    """
    # Re-run data generation in validation mode so the fitted
    # meta-model produces estimates alongside measured runtimes.
    inputs, outputs, estimated_outputs = self._generate_data(validation=True)

    measured = outputs['output']
    predicted = estimated_outputs['estimated_outputs']

    # Runtime-weighted mean error: dot(y, y - y_hat) / sum(y),
    # so longer-running fits weigh more heavily.
    residuals = measured - predicted
    avg_weighted_error = np.dot(measured, residuals) / sum(measured)

    return inputs, outputs, estimated_outputs, avg_weighted_error
# NOTE(review): second truncated copy of _permute — docstring never
# closed, body missing. Deduplicate with the copy above and restore the
# implementation from the upstream source.
@timeit
def _permute(self, concat_dic, parameters_list, external_parameters_list, meta_params, algo_type, write_csv=False, validation=False):
    """
    performs a for loop over every possible param combination to generate data on the specified algo
    abstracted to support any sklearn algo
    runtime of this function depends on the specified drop_rate: the higher it is, the less data will be generated
    a minimum of 4 data points is generated
    :param concat_dic: all params + all values range dictionary
    :param parameters_list: all internal parameters names
    :param external_parameters_list: all external parameters names
    :param meta_params: params from json file (equivalent to self.params)
    :param algo_type: unsupervised / supervised / classification
    :param write_csv: set to True in order to write outputs in a dedicated csv file
    :param validation: boolean, set true if data is used for validation, use only once the model has been trained
    :return: inputs, outputs
# NOTE(review): truncated fragment — body cut off right after the
# meta-algo guard; a second (also truncated) copy appears further below.
@timeit
def _random_search(self, inputs, outputs, iterations, save_model=False):
    """
    performs a random search on the NN meta algo to find the best params
    :param inputs: pd.DataFrame chosen as input
    :param outputs: pd.DataFrame chosen as output
    :param iterations: Number of parameter settings that are sampled
    :param save_model: boolean set to True if the model needs to be saved
    :return: best meta_algo with parameters
    :rtype: scikit learn RandomizedSearchCV object
    """
    X, y, cols, original_cols = self._transform_data(inputs, outputs)
    # random search is only implemented for the NN meta-model.
    # NOTE(review): KeyError is an odd type for a value guard — ValueError
    # would be conventional; callers may catch KeyError, so left as-is.
    if self.meta_algo != 'NN':
        raise KeyError(f'''meta algo {self.meta_algo} not supported for random search''')
# NOTE(review): second truncated copy of _generate_data — cut off after
# reading the internal/external parameter name lists. Deduplicate with
# the copy above.
@timeit
def _generate_data(self, write_csv=False, validation=False):
    """
    measures training runtimes for a set of distinct parameters
    if specified, saves results in a csv (row by row)
    :param write_csv: set to True in order to write outputs in a dedicated csv file
    :param validation: boolean, set true if data is used for validation, use only once the model has been trained
    :return: inputs, outputs
    :rtype: pd.DataFrame
    """
    # only announce generation at higher verbosity levels
    if self.verbose >= 2:
        self.logger.info('Generating dummy training durations to create a training set')
    meta_params = self.params
    # parameter names come from the json-config dict, split into
    # internal (estimator hyper-params) and external entries
    parameters_list = list(meta_params['internal_params'].keys())
    external_parameters_list = list(meta_params['external_params'].keys())
# NOTE(review): second truncated copy of _random_search — identical to
# the copy above up to the point of truncation. Deduplicate.
@timeit
def _random_search(self, inputs, outputs, iterations, save_model=False):
    """
    performs a random search on the NN meta algo to find the best params
    :param inputs: pd.DataFrame chosen as input
    :param outputs: pd.DataFrame chosen as output
    :param iterations: Number of parameter settings that are sampled
    :param save_model: boolean set to True if the model needs to be saved
    :return: best meta_algo with parameters
    :rtype: scikit learn RandomizedSearchCV object
    """
    X, y, cols, original_cols = self._transform_data(inputs, outputs)
    # only the NN meta-model supports random search
    if self.meta_algo != 'NN':
        raise KeyError(f'meta algo {self.meta_algo} not supported for random search')