"""
This module includes the cost-sensitive Bayes minimum risk and thresholding optimization methods.
"""
# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause
import numpy as np
from sklearn.base import BaseEstimator
from ..probcal import ROCConvexHull
from ..metrics import cost_loss
class BayesMinimumRiskClassifier(BaseEstimator):
"""A example-dependent cost-sensitive binary Bayes minimum risk classifier.
Parameters
----------
calibration : bool, optional (default=True)
        Whether or not to calibrate the probabilities.
References
----------
    .. [1] A. Correa Bahnsen, A. Stojanovic, D. Aouada, B. Ottersten,
           `"Improving Credit Card Fraud Detection with Calibrated Probabilities" <http://albahnsen.com/files/%20Improving%20Credit%20Card%20Fraud%20Detection%20by%20using%20Calibrated%20Probabilities%20-%20Publish.pdf>`__, in Proceedings of the fourteenth SIAM International Conference on Data Mining,
           677-685, 2014.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import BayesMinimumRiskClassifier
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> f = RandomForestClassifier(random_state=0).fit(X_train, y_train)
>>> y_prob_test = f.predict_proba(X_test)
>>> y_pred_test_rf = f.predict(X_test)
    >>> f_bmr = BayesMinimumRiskClassifier().fit(y_test, y_prob_test)
>>> y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using RandomForest and Bayes Minimum Risk
>>> print(savings_score(y_test, y_pred_test_bmr, cost_mat_test))
0.413425845555
"""
def __init__(self, calibration=True):
self.calibration = calibration
    def fit(self, y_true_cal=None, y_prob_cal=None):
""" If calibration, then train the calibration of probabilities
Parameters
----------
y_true_cal : array-like of shape = [n_samples], optional default = None
True class to be used for calibrating the probabilities
y_prob_cal : array-like of shape = [n_samples, 2], optional default = None
Predicted probabilities to be used for calibrating the probabilities
Returns
-------
self : object
Returns self.
"""
        if self.calibration:
            self.cal = ROCConvexHull()
            self.cal.fit(y_true_cal, y_prob_cal[:, 1])
        return self
    def predict(self, y_prob, cost_mat):
""" Calculate the prediction using the Bayes minimum risk classifier.
Parameters
----------
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities.
cost_mat : array-like of shape = [n_samples, 4]
Cost matrix of the classification problem
            where the columns represent the costs of false positives, false negatives,
            true positives, and true negatives for each example.
Returns
-------
y_pred : array-like of shape = [n_samples]
Predicted class
"""
if self.calibration:
y_prob[:, 1] = self.cal.predict_proba(y_prob[:, 1])
y_prob[:, 0] = 1 - y_prob[:, 1]
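        # Derivation: given p = P(y=1|x), the expected cost (risk) of each decision is
        #   R(c=0|x) = C_TN * (1 - p) + C_FN * p
        #   R(c=1|x) = C_FP * (1 - p) + C_TP * p
        # Predicting the positive class is cheaper when R(c=1|x) < R(c=0|x), i.e. when
        # p > (C_FP - C_TN) / (C_FN - C_TN - C_TP + C_FP) = t_BMR
        # (ties are broken toward the negative class).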
# t_BMR = (cost_fp - cost_tn) / (cost_fn - cost_tn - cost_tp + cost_fp)
# cost_mat[FP,FN,TP,TN]
t_bmr = (cost_mat[:, 0] - cost_mat[:, 3]) / (cost_mat[:, 1] - cost_mat[:, 3] - cost_mat[:, 2] + cost_mat[:, 0])
        y_pred = np.greater(y_prob[:, 1], t_bmr).astype(float)
return y_pred
    def fit_predict(self, y_prob, cost_mat, y_true_cal=None, y_prob_cal=None):
""" Calculate the prediction using the Bayes minimum risk classifier.
Parameters
----------
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities.
cost_mat : array-like of shape = [n_samples, 4]
Cost matrix of the classification problem
            where the columns represent the costs of false positives, false negatives,
            true positives, and true negatives for each example.
y_true_cal : array-like of shape = [n_samples], optional default = None
True class to be used for calibrating the probabilities
y_prob_cal : array-like of shape = [n_samples, 2], optional default = None
Predicted probabilities to be used for calibrating the probabilities
Returns
-------
y_pred : array-like of shape = [n_samples]
Predicted class
"""
#TODO: Check input
if self.calibration:
self.cal = ROCConvexHull()
if y_prob_cal is None:
y_prob_cal = y_prob
self.cal.fit(y_true_cal, y_prob_cal[:, 1])
y_prob[:, 1] = self.cal.predict_proba(y_prob[:, 1])
y_prob[:, 0] = 1 - y_prob[:, 1]
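        # The decision threshold below follows from the same expected-cost comparison
        # derived in predict().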
# t_BMR = (cost_fp - cost_tn) / (cost_fn - cost_tn - cost_tp + cost_fp)
# cost_mat[FP,FN,TP,TN]
t_bmr = (cost_mat[:, 0] - cost_mat[:, 3]) / (cost_mat[:, 1] - cost_mat[:, 3] - cost_mat[:, 2] + cost_mat[:, 0])
        y_pred = np.greater(y_prob[:, 1], t_bmr).astype(float)
return y_pred
class ThresholdingOptimization():
""" Classifier based on finding the threshold that minimizes the total cost on a given set.
Parameters
----------
calibration : bool, optional (default=True)
        Whether or not to calibrate the probabilities.
Attributes
----------
`threshold_` : float
Selected threshold.
References
----------
.. [1] V. Sheng, C. Ling, "Thresholding for making classifiers cost-sensitive",
in Proceedings of the National Conference on Artificial Intelligence, 2006.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import ThresholdingOptimization
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> f = RandomForestClassifier(random_state=0).fit(X_train, y_train)
>>> y_prob_train = f.predict_proba(X_train)
>>> y_prob_test = f.predict_proba(X_test)
>>> y_pred_test_rf = f.predict(X_test)
>>> f_t = ThresholdingOptimization().fit(y_prob_train, cost_mat_train, y_train)
>>> y_pred_test_rf_t = f_t.predict(y_prob_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using RandomForest and ThresholdingOptimization
>>> print(savings_score(y_test, y_pred_test_rf_t, cost_mat_test))
0.401816361581
"""
def __init__(self, calibration=True):
self.calibration = calibration
    def fit(self, y_prob, cost_mat, y_true):
""" Calculate the optimal threshold using the ThresholdingOptimization.
Parameters
----------
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities.
cost_mat : array-like of shape = [n_samples, 4]
Cost matrix of the classification problem
            where the columns represent the costs of false positives, false negatives,
            true positives, and true negatives for each example.
y_true : array-like of shape = [n_samples]
True class
Returns
-------
self
"""
#TODO: Check input
if self.calibration:
cal = ROCConvexHull()
cal.fit(y_true, y_prob[:, 1])
y_prob[:, 1] = cal.predict_proba(y_prob[:, 1])
y_prob[:, 0] = 1 - y_prob[:, 1]
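        # Brute-force search: every distinct value in y_prob is tried as a candidate
        # threshold, and the one with the lowest total cost (cost_loss) is kept.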
thresholds = np.unique(y_prob)
cost = np.zeros(thresholds.shape)
for i in range(thresholds.shape[0]):
            pred = np.floor(y_prob[:, 1] + (1 - thresholds[i]))  # nonzero (positive class) when p >= threshold
cost[i] = cost_loss(y_true, pred, cost_mat)
self.threshold_ = thresholds[np.argmin(cost)]
return self
    def predict(self, y_prob):
""" Calculate the prediction using the ThresholdingOptimization.
Parameters
----------
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities.
Returns
-------
y_pred : array-like of shape = [n_samples]
Predicted class
"""
y_pred = np.floor(y_prob[:, 1] + (1 - self.threshold_))
return y_pred