Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Parameters
----------
x : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
y : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value substituted for missing values. Only applicable when nan_strategy is set to 'replace'.
"""
# NOTE(review): this is an excerpt of a function body; the enclosing def line
# and the original indentation are not present in this view.
# Missing-value handling is delegated to helpers defined elsewhere. When
# nan_strategy matches neither REPLACE nor DROP, x and y are used unchanged.
if nan_strategy == REPLACE:
x, y = replace_nan_with_value(x, y, nan_replace_value)
elif nan_strategy == DROP:
x, y = remove_incomplete_samples(x, y)
# presumably the conditional entropy of x given y — confirm against conditional_entropy
s_xy = conditional_entropy(x,y)
# Relative frequency of every distinct value observed in x.
x_counter = Counter(x)
total_occurrences = sum(x_counter.values())
p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
# Entropy of that frequency distribution (ss is presumably scipy.stats — confirm).
s_x = ss.entropy(p_x)
# s_x is the divisor below, so the zero-entropy case is answered with 1
# instead of attempting the division.
if s_x == 0:
return 1
else:
# Fraction of s_x that remains after subtracting s_xy.
return (s_x - s_xy) / s_x
Parameters
----------
x : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
y : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value substituted for missing values. Only applicable when nan_strategy is set to 'replace'.
"""
# NOTE(review): this is an excerpt of a function body; the enclosing def line
# and the original indentation are not present in this view.
# Missing-value handling is delegated to helpers defined elsewhere. When
# nan_strategy matches neither REPLACE nor DROP, x and y are used unchanged.
if nan_strategy == REPLACE:
x, y = replace_nan_with_value(x, y, nan_replace_value)
elif nan_strategy == DROP:
x, y = remove_incomplete_samples(x, y)
# Cross-tabulate the two sequences into a table of co-occurrence counts.
confusion_matrix = pd.crosstab(x,y)
# Element [0] of the chi2_contingency result is the chi-squared statistic
# (ss is presumably scipy.stats — confirm).
chi2 = ss.chi2_contingency(confusion_matrix)[0]
# Grand total of all cells in the table.
n = confusion_matrix.sum().sum()
phi2 = chi2/n
# r = number of rows, k = number of columns of the table.
r,k = confusion_matrix.shape
# Corrected phi2, clamped so it never goes below 0.
# NOTE(review): looks like the bias-corrected form of Cramer's V — confirm.
phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
rcorr = r-((r-1)**2)/(n-1)
kcorr = k-((k-1)**2)/(n-1)
# NOTE(review): when r == 1 or k == 1 the matching corrected size equals 1, so
# the divisor below is 0; n == 1 also makes n-1 zero in the lines above.
return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))
Parameters
----------
categories : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
measurements : list / NumPy ndarray / Pandas Series
A sequence of continuous measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value substituted for missing values. Only applicable when nan_strategy is set to 'replace'.
"""
# NOTE(review): this is an excerpt of a function body; the enclosing def line,
# the original indentation and the end of the function are not present in
# this view.
# Missing-value handling is delegated to helpers defined elsewhere. When
# nan_strategy matches neither REPLACE nor DROP, the inputs are used unchanged.
if nan_strategy == REPLACE:
categories, measurements = replace_nan_with_value(categories, measurements, nan_replace_value)
elif nan_strategy == DROP:
categories, measurements = remove_incomplete_samples(categories, measurements)
# presumably converts both inputs to NumPy arrays — confirm against convert()
categories = convert(categories, 'array')
measurements = convert(measurements, 'array')
# Map each category to an integer code; the unique values themselves are discarded.
fcat, _ = pd.factorize(categories)
# Number of codes, taken as the largest code plus one.
cat_num = np.max(fcat)+1
# Per-code mean of the measurements and per-code sample count.
y_avg_array = np.zeros(cat_num)
n_array = np.zeros(cat_num)
for i in range(0,cat_num):
# Measurements whose category code equals i.
cat_measures = measurements[np.argwhere(fcat == i).flatten()]
n_array[i] = len(cat_measures)
y_avg_array[i] = np.average(cat_measures)
# Count-weighted average of the per-code means.
y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
# Count-weighted squared deviation of each per-code mean from y_total_avg.
numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
# Squared deviation of every individual measurement from y_total_avg.
denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
# A zero numerator yields eta = 0.0 without using the denominator.
if numerator == 0:
eta = 0.0
else:
# NOTE(review): the body of this else-branch is not present in this excerpt.
Parameters
----------
x : list / NumPy ndarray / Pandas Series
A sequence of measurements
y : list / NumPy ndarray / Pandas Series
A sequence of measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value substituted for missing values. Only applicable when nan_strategy is set to 'replace'.
"""
# NOTE(review): this is an excerpt of a function body; the enclosing def line
# and the original indentation are not present in this view.
# Missing-value handling is delegated to helpers defined elsewhere. When
# nan_strategy matches neither REPLACE nor DROP, x and y are used unchanged.
if nan_strategy == REPLACE:
x, y = replace_nan_with_value(x, y, nan_replace_value)
elif nan_strategy == DROP:
x, y = remove_incomplete_samples(x, y)
# Occurrence counts of each y value and of each (x, y) pair.
y_counter = Counter(y)
xy_counter = Counter(list(zip(x,y)))
# Total number of counted y values, used to turn counts into frequencies.
total_occurrences = sum(y_counter.values())
entropy = 0.0
for xy in xy_counter.keys():
# Relative frequency of the pair, and of the pair's y component (xy[1]).
p_xy = xy_counter[xy] / total_occurrences
p_y = y_counter[xy[1]] / total_occurrences
# math.log with a single argument is the natural logarithm.
entropy += p_xy * math.log(p_y/p_xy)
return entropy