# coding=utf-8
"""Methods to calibrate the estimated probabilities.
"""
# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause
from sklearn.metrics import roc_curve
import numpy as np
import copy
#TODO: Add isotonic regression from sklearn
#TODO: Add Platt calibration
# http://ift.tt/XuMk3s
[docs]class ROCConvexHull:
"""Implementation the the calibration method ROCConvexHull
Attributes
----------
`calibration_map` : array-like
calibration map for maping the raw probabilities to the calibrated probabilities.
See also
--------
sklearn.IsotonicRegression
References
----------
.. [1] J. Hernandez-Orallo, P. Flach, C. Ferri, 'A Unified View of Performance Metrics :
Translating Threshold Choice into Expected Classification Loss', Journal of
Machine Learning Research, 13, 2813–2869, 2012.
Examples
--------
>>> from costcla.probcal import ROCConvexHull
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.cross_validation import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.metrics import brier_score_loss
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> f = RandomForestClassifier()
>>> f.fit(X_train, y_train)
>>> y_prob_test = f.predict_proba(X_test)
>>> f_cal = ROCConvexHull()
>>> f_cal.fit(y_test, y_prob_test)
>>> y_prob_test_cal = f_cal.predict_proba(y_prob_test)
>>> # Brier score using only RandomForest
>>> print(brier_score_loss(y_test, y_prob_test[:, 1]))
0.0577615264881
>>> # Brier score using calibrated RandomForest
>>> print(brier_score_loss(y_test, y_prob_test_cal))
0.0553677407894
"""
def __init__(self):
self.calibration_map = []
[docs] def fit(self, y, p):
""" Fit the calibration map
Parameters
----------
y_true : array-like of shape = [n_samples]
True class to be used for calibrating the probabilities
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities to be used for calibrating the probabilities
Returns
-------
self : object
Returns self.
"""
# TODO: Check input
if p.size != p.shape[0]:
p = p[:, 1]
fpr, tpr, thresholds = roc_curve(y, p)
#works with sklearn 0.11
if fpr.min() > 0 or tpr.min() > 0:
fpr = np.hstack((0, fpr))
tpr = np.hstack((0, tpr))
thresholds = np.hstack((1.01, thresholds))
def prob_freq(y, predict_proba):
#calculate distribution and return in inverse order
proba_bins = np.unique(predict_proba)
freq_all = np.bincount(proba_bins.searchsorted(predict_proba))
freq_0_tempa = np.unique(predict_proba[np.nonzero(y == 0)[0]])
freq_0_tempb = np.bincount(freq_0_tempa.searchsorted(predict_proba[np.nonzero(y == 0)[0]]))
freq = np.zeros((proba_bins.shape[0], 3))
freq[:, 0] = proba_bins
for i in range(freq_0_tempa.shape[0]):
freq[np.nonzero(proba_bins == freq_0_tempa[i])[0], 1] = freq_0_tempb[i]
freq[:, 2] = freq_all - freq[:, 1]
freq = freq[proba_bins.argsort()[::-1], :]
pr = freq[:, 2] / freq[:, 1:].sum(axis=1)
pr = pr.reshape(freq.shape[0], 1)
#fix when no negatives in range
pr[pr == 1.0] = 0
freq = np.hstack((freq, pr))
return freq
f = prob_freq(y, p)
temp_hull = []
for i in range(fpr.shape[0]):
temp_hull.append((fpr[i], tpr[i]))
#close the plane
temp_hull.append((1, 0))
rocch_ = _convexhull(temp_hull)
rocch = np.array([(a, b) for (a, b) in rocch_[:-1]])
rocch_find = np.zeros(fpr.shape[0], dtype=np.bool)
for i in range(rocch.shape[0]):
rocch_find[np.intersect1d(np.nonzero(rocch[i, 0] == fpr)[0],
np.nonzero(rocch[i, 1] == tpr)[0])] = True
rocch_thresholds = thresholds[rocch_find]
#calibrated probabilities using ROCCH
f_cal = np.zeros((rocch_thresholds.shape[0] - 1, 5))
for i in range(rocch_thresholds.shape[0] - 1):
f_cal[i, 0] = rocch_thresholds[i]
f_cal[i, 1] = rocch_thresholds[i + 1]
join_elements = np.logical_and(f_cal[i, 1] <= f[:, 0], f_cal[i, 0] > f[:, 0])
f_cal[i, 2] = f[join_elements, 1].sum()
f_cal[i, 3] = f[join_elements, 2].sum()
f_cal[:, 4] = f_cal[:, 3] / f_cal[:, [2, 3]].sum(axis=1)
#fix to add 0
f_cal[-1, 1] = 0
calibrated_map = f_cal[:, [0, 1, 4]]
self.calibration_map = calibrated_map
[docs] def predict_proba(self, p):
""" Calculate the calibrated probabilities
Parameters
----------
y_prob : array-like of shape = [n_samples, 2]
Predicted probabilities to be calibrated using calibration map
Returns
-------
y_prob_cal : array-like of shape = [n_samples, 1]
Predicted calibrated probabilities
"""
# TODO: Check input
if p.size != p.shape[0]:
p = p[:, 1]
calibrated_proba = np.zeros(p.shape[0])
for i in range(self.calibration_map.shape[0]):
calibrated_proba[np.logical_and(self.calibration_map[i, 1] <= p, self.calibration_map[i, 0] > p)] = \
self.calibration_map[i, 2]
# TODO: return 2D and refactor
return calibrated_proba
def _convexhull(P):
""" Private function that calculate the convex hull of a set of points
The algorithm was taken from [1].
http://code.activestate.com/recipes/66527-finding-the-convex-hull-of-a-set-of-2d-points/
References
----------
.. [1] Alex Martelli, Anna Ravenscroft, David Ascher, 'Python Cookbook', O'Reilly Media, Inc., 2005.
"""
def mydet(p, q, r):
"""Calc. determinant of a special matrix with three 2D points.
The sign, "-" or "+", determines the side, right or left,
respectivly, on which the point r lies, when measured against
a directed vector from p to q.
"""
# We use Sarrus' Rule to calculate the determinant.
# (could also use the Numeric package...)
sum1 = q[0] * r[1] + p[0] * q[1] + r[0] * p[1]
sum2 = q[0] * p[1] + r[0] * q[1] + p[0] * r[1]
return sum1 - sum2
def isrightturn(p, q, r):
"Do the vectors pq:qr form a right turn, or not?"
assert p != q and q != r and p != r
if mydet(p, q, r) < 0:
return 1
else:
return 0
# Get a local list copy of the points and sort them lexically.
points = copy.copy(P)
points.sort()
# Build upper half of the hull.
upper = [points[0], points[1]]
for p in points[2:]:
upper.append(p)
while len(upper) > 2 and not isrightturn(upper[-3:][0],upper[-3:][1],upper[-3:][2]):
del upper[-2]
# Build lower half of the hull.
points.reverse()
lower = [points[0], points[1]]
for p in points[2:]:
lower.append(p)
while len(lower) > 2 and not isrightturn(lower[-3:][0],lower[-3:][1],lower[-3:][2]):
del lower[-2]
# Remove duplicates.
del lower[0]
del lower[-1]
# Concatenate both halfs and return.
return tuple(upper + lower)