def _assert_explanation_equivalence(actual, expected):
# get the non-null properties in the expected explanation
paramkeys = [paramkey for paramkey in ExplainParams.get_serializable()
             if hasattr(expected, getattr(ExplainParams, paramkey))]
for paramkey in paramkeys:
param = getattr(ExplainParams, paramkey)
actual_value = getattr(actual, param, None)
expected_value = getattr(expected, param, None)
if isinstance(actual_value, (DatasetWrapper, DenseData)):
if isinstance(actual_value.original_dataset, np.ndarray):
actual_dataset = actual_value.original_dataset.tolist()
else:
actual_dataset = actual_value.original_dataset
if isinstance(expected_value.original_dataset, np.ndarray):
expected_dataset = expected_value.original_dataset.tolist()
else:
expected_dataset = expected_value.original_dataset
np.testing.assert_array_equal(actual_dataset, expected_dataset)
elif isinstance(actual_value, (np.ndarray, collections.abc.Sequence)):
np.testing.assert_array_equal(actual_value, expected_value)
elif isinstance(actual_value, pd.DataFrame) and isinstance(expected_value, pd.DataFrame):
np.testing.assert_array_equal(actual_value.values, expected_value.values)
else:
assert actual_value == expected_value
from sklearn.cluster import KMeans


def kmeans(X, k, round_values=True):
    """Summarize a dataset with k mean samples weighted by the number of data points they each represent.

    Returns
    -------
    DenseData object.
    """
    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    if round_values:
        # Snap each centroid coordinate to the nearest value actually observed in X,
        # so that discrete features keep valid values in the summary.
        for i in range(k):
            for j in range(X.shape[1]):
                ind = np.argmin(np.abs(X[:, j] - kmeans.cluster_centers_[i, j]))
                kmeans.cluster_centers_[i, j] = X[ind, j]
    return DenseData(kmeans.cluster_centers_, group_names, None, 1.0 * np.bincount(kmeans.labels_))
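A quick usage sketch of the summary step above. The data and column names are illustrative only, and it assumes the DenseData class and the KMeans import from the surrounding module are in scope:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
background = pd.DataFrame({'age': rng.randint(18, 90, size=200),
                           'income': rng.normal(50000, 10000, size=200)})
summary = kmeans(background, k=5, round_values=True)
# 5 centroids, each snapped to real observed values, weighted by cluster size.
print(summary.data.shape)     # (5, 2)
print(summary.weights.sum())  # 1.0 after normalization in DenseData.__init__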
# Helper name assumed for this fragment.
def _explanation_to_json(explanation):
    """Serialize the serializable properties of the given explanation to a JSON string.

    :rtype: str
    """
    paramkeys = list(ExplainParams.get_serializable())
    expldict = {}
    _metadata = {}
    for paramkey in paramkeys:
        param = getattr(ExplainParams, paramkey)
        if hasattr(explanation, param):
            value = getattr(explanation, param)
            # Record the original container type so a loader can rebuild it later.
            if isinstance(value, pd.DataFrame):
                expldict[param] = value.values.tolist()
                _metadata[param] = 'DataFrame'
            elif isinstance(value, DatasetWrapper):
                expldict[param] = value.original_dataset.tolist()
                _metadata[param] = 'DatasetWrapper'
            elif isinstance(value, DenseData):
                expldict[param] = value.original_dataset.tolist()
                _metadata[param] = 'DenseData'
            elif isinstance(value, np.ndarray):
                expldict[param] = value.tolist()
                _metadata[param] = 'ndarray'
            else:
                expldict[param] = value
    return json.dumps({
        '_metadata': _metadata,
        'explanation': expldict
    })
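The '_metadata' map records the original container type of each field so that a loader can rebuild richer objects from the plain lists. A minimal sketch of the reverse direction; the loader name and reconstruction rules here are assumptions, not the library's API:

import json

import numpy as np
import pandas as pd


def _explanation_from_json(json_str):
    # Hypothetical loader: rebuild each value from its recorded container type.
    payload = json.loads(json_str)
    metadata = payload['_metadata']
    restored = {}
    for param, value in payload['explanation'].items():
        kind = metadata.get(param)
        if kind == 'DataFrame':
            restored[param] = pd.DataFrame(value)
        elif kind in ('DatasetWrapper', 'DenseData', 'ndarray'):
            restored[param] = np.array(value)
        else:
            restored[param] = value
    return restored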
class DenseData(Data):
    def __init__(self, data, group_names, *args):
        self.groups = args[0] if len(args) > 0 and args[0] is not None else [np.array([i]) for i in range(len(group_names))]
        # Determine whether the data matrix is transposed relative to the group names.
        l = sum(len(g) for g in self.groups)
        num_samples = data.shape[0]
        t = False
        if l != data.shape[1]:
            t = True
            num_samples = data.shape[1]
        valid = (not t and l == data.shape[1]) or (t and l == data.shape[0])
        assert valid, "# of names must match data matrix!"
        self.weights = args[1] if len(args) > 1 else np.ones(num_samples)
        self.weights /= np.sum(self.weights)
        wl = len(self.weights)
        valid = (not t and wl == data.shape[0]) or (t and wl == data.shape[1])
        assert valid, "# weights must match data matrix!"
        self.transposed = t
        self.group_names = group_names
        self.data = data
        self.groups_size = len(self.groups)
class DenseDataWithIndex(DenseData):
def __init__(self, data, group_names, index, index_name, *args):
DenseData.__init__(self, data, group_names, *args)
self.index_value = index
self.index_name = index_name
def convert_to_df(self):
data = pd.DataFrame(self.data, columns=self.group_names)
index = pd.DataFrame(self.index_value, columns=[self.index_name])
df = pd.concat([index, data], axis=1)
df = df.set_index(self.index_name)
return df
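A small usage example for the index-preserving wrapper above. The column and index names are made up, and it assumes the class above and its pandas import are in scope:

import numpy as np

features = np.array([[1.0, 2.0], [3.0, 4.0]])
wrapped = DenseDataWithIndex(features, ['f0', 'f1'], np.array([10, 11]), 'row_id')
df = wrapped.convert_to_df()
print(df)  # DataFrame indexed by 'row_id' with columns 'f0' and 'f1'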
:type examples: DatasetWrapper
:param model_task: Optional parameter to specify whether the model is a classification or regression model.
    In most cases the type can be inferred from the shape of the output: a classifier has a
    predict_proba method and outputs a 2-dimensional array, while a regressor has a predict
    method and outputs a 1-dimensional array.
:type model_task: str
:param wrapped: Indicates if function has already been wrapped.
:type wrapped: bool
:return: The function chosen from given model and chosen domain.
:rtype: (function, str)
"""
# Try to run the function on a single example - if it doesn't work wrap
# it in a function that converts a 1D array to 2D for those functions
# that only support 2D arrays as input
examples_dataset = examples.dataset
if isinstance(examples_dataset, DenseData):
examples_dataset = examples_dataset.data
try:
result = function(examples.typed_wrapper_func(examples_dataset[0]))
except Exception as ex:
# If function has already been wrapped, re-throw error to prevent stack overflow
if wrapped:
raise ex
def function_input_1D_wrapper(dataset):
if len(dataset.shape) == 1:
dataset = dataset.reshape(1, -1)
return function(dataset)
return _eval_function(function_input_1D_wrapper, examples, model_task, wrapped=True)
if len(result.shape) == 2:
# If the result of evaluating the function is a 2D array of 1 column,
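The try/except wrapping above exists because many prediction functions only accept 2D input, so a single row passed as a 1D array raises an error. A standalone sketch of the same reshape-and-wrap trick, with an illustrative scikit-learn model and data:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = (X[:, 0] > 0.5).astype(int)
model = LogisticRegression().fit(X, y)


def predict_any_dim(dataset):
    # Reshape a single 1D example into the (1, n_features) shape sklearn expects.
    if len(dataset.shape) == 1:
        dataset = dataset.reshape(1, -1)
    return model.predict_proba(dataset)


print(predict_any_dim(X[0]).shape)  # (1, 2) even though X[0] is a 1D array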
def convert_to_data(val, keep_index=False):
if isinstance(val, Data):
return val
elif type(val) == np.ndarray:
return DenseData(val, [str(i) for i in range(val.shape[1])])
elif str(type(val)).endswith("'pandas.core.series.Series'>"):
return DenseData(val.values.reshape((1,len(val))), list(val.index))
elif str(type(val)).endswith("'pandas.core.frame.DataFrame'>"):
if keep_index:
return DenseDataWithIndex(val.values, list(val.columns), val.index.values, val.index.name)
else:
return DenseData(val.values, list(val.columns))
elif sp.sparse.issparse(val):
if not sp.sparse.isspmatrix_csr(val):
val = val.tocsr()
return SparseData(val)
else:
assert False, "Unknown type passed as data object: "+str(type(val))
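A short sketch of how the dispatcher above treats different input types. The values are illustrative, and it assumes the Data/DenseData/DenseDataWithIndex/SparseData classes from this module are in scope:

import numpy as np
import pandas as pd
import scipy.sparse as sparse

arr = np.array([[1.0, 2.0], [3.0, 4.0]])
frame = pd.DataFrame(arr, columns=['a', 'b'], index=pd.Index([10, 11], name='row_id'))

print(type(convert_to_data(arr)).__name__)                     # DenseData
print(type(convert_to_data(frame, keep_index=True)).__name__)  # DenseDataWithIndex
print(type(convert_to_data(sparse.csr_matrix(arr))).__name__)  # SparseData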
feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])]
if features is None:
display_features = ["" for i in range(len(feature_names))]
else:
display_features = features[k, :]
instance = Instance(np.ones((1, len(feature_names))), display_features)
e = AdditiveExplanation(
base_value,
np.sum(shap_values[k, :]) + base_value,
shap_values[k, :],
None,
instance,
link,
Model(None, out_names),
DenseData(np.ones((1, len(feature_names))), list(feature_names))
)
exps.append(e)
return visualize(
exps,
plot_cmap=plot_cmap,
ordering_keys=ordering_keys,
ordering_keys_time_format=ordering_keys_time_format,
text_rotation=text_rotation
)
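The block above mirrors what shap's force-plot machinery assembles before rendering. In practice the public entry point is shap.force_plot, roughly as below; the model and data are illustrative, and expected_value handling can vary by shap version:

import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor

X = np.random.RandomState(0).rand(100, 4)
y = 2 * X[:, 0] + X[:, 1]
model = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Render one example's additive explanation (matplotlib=True renders outside notebooks).
shap.force_plot(explainer.expected_value, shap_values[0, :], X[0, :], matplotlib=True)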
self._column_indexer = initialization_examples.string_index(columns=categorical_features)
self._one_hot_encoder = None
explainable_model_args[LightGBMParams.CATEGORICAL_FEATURE] = categorical_features
else:
# One-hot-encode categoricals for models that don't support categoricals natively
self._column_indexer = initialization_examples.string_index(columns=categorical_features)
self._one_hot_encoder = initialization_examples.one_hot_encode(columns=categorical_features)
self.classes = classes
self.explain_subset = explain_subset
self.transformations = transformations
self._shap_values_output = shap_values_output
# Train the mimic model on the given model
training_data = initialization_examples.dataset
self.initialization_examples = initialization_examples
if isinstance(training_data, DenseData):
training_data = training_data.data
explainable_model_args[ExplainParams.CLASSIFICATION] = self.predict_proba_flag
if self._supports_shap_values_output(explainable_model):
explainable_model_args[ExplainParams.SHAP_VALUES_OUTPUT] = shap_values_output
self.surrogate_model = _model_distill(self.function, explainable_model, training_data,
original_training_data, explainable_model_args)
self._method = self.surrogate_model._method
self._original_eval_examples = None
self._allow_all_transformations = allow_all_transformations
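A minimal sketch, in plain scikit-learn terms, of what training the surrogate (mimic) model amounts to: the student is fit on the teacher's outputs rather than on the labels. The model choices and data are illustrative, and the real _model_distill call above also handles classification outputs and shap-values-output modes:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.rand(300, 5)
y = X[:, 0] ** 2 + np.sin(3 * X[:, 1])

# Teacher: the opaque model we want to explain.
teacher = GradientBoostingRegressor(random_state=0).fit(X, y)

# Surrogate: trained to imitate the teacher's predictions, so it is cheap to explain.
surrogate = LinearRegression().fit(X, teacher.predict(X))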
# Helper name assumed for this fragment.
def _summarize_data(X, k, to_round_values=True):
    """Summarize the given dataset down to a small background set used to initialize the explainer.

    For a sparse dataset, use a single sparse row as the background, holding the
    calculated median for the dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param to_round_values: When using kmeans, round each element of every cluster centroid
        to the nearest value from X in the corresponding dimension. This ensures discrete
        features always get a valid value. Ignored for sparse data.
    :type to_round_values: bool
    :return: DenseData or SparseData object.
    :rtype: iml.datatypes.DenseData or iml.datatypes.SparseData
    """
    is_sparse = sp.sparse.issparse(X)
    if not isinstance(X, DenseData):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate the median of the sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return sp.sparse.csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Creating dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            return shap.kmeans(X, k, to_round_values)
    return X
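A brief usage sketch of the summarization helper for dense and sparse backgrounds. The data is illustrative, and it assumes the helper above is importable under the name used here:

import numpy as np
import scipy.sparse as sparse

dense_X = np.random.RandomState(0).rand(500, 8)
dense_summary = _summarize_data(dense_X, k=10)    # DenseData with 10 rounded centroids

sparse_X = sparse.random(500, 8, density=0.05, format='csr', random_state=0)
sparse_summary = _summarize_data(sparse_X, k=10)  # single csr row of column medians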