Source code for costcla.probcal.probcal

# coding=utf-8
"""Methods to calibrate the estimated probabilities.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

from sklearn.metrics import roc_curve
import numpy as np
import copy

#TODO: Add isotonic regression from sklearn
#TODO: Add Platt calibration
# http://ift.tt/XuMk3s

[docs]class ROCConvexHull:
    """Implementation the the calibration method ROCConvexHull

    Attributes
    ----------
    `calibration_map` : array-like
        calibration map for maping the raw probabilities to the calibrated probabilities.

    See also
    --------
    sklearn.IsotonicRegression

    References
    ----------

    .. [1] J. Hernandez-Orallo, P. Flach, C. Ferri, 'A Unified View of Performance Metrics :
           Translating Threshold Choice into Expected Classification Loss', Journal of
           Machine Learning Research, 13, 2813–2869, 2012.

    Examples
    --------
    >>> from costcla.probcal import ROCConvexHull
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.cross_validation import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.metrics import brier_score_loss
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> f = RandomForestClassifier()
    >>> f.fit(X_train, y_train)
    >>> y_prob_test = f.predict_proba(X_test)
    >>> f_cal = ROCConvexHull()
    >>> f_cal.fit(y_test, y_prob_test)
    >>> y_prob_test_cal = f_cal.predict_proba(y_prob_test)
    >>> # Brier score using only RandomForest
    >>> print(brier_score_loss(y_test, y_prob_test[:, 1]))
    0.0577615264881
    >>> # Brier score using calibrated RandomForest
    >>> print(brier_score_loss(y_test, y_prob_test_cal))
    0.0553677407894
    """

    def __init__(self):
        self.calibration_map = []

[docs]    def fit(self, y, p):
        """ Fit the calibration map

        Parameters
        ----------
        y_true : array-like of shape = [n_samples]
            True class to be used for calibrating the probabilities

        y_prob : array-like of shape = [n_samples, 2]
            Predicted probabilities to be used for calibrating the probabilities

        Returns
        -------
        self : object
            Returns self.
        """

        # TODO: Check input
        if p.size != p.shape[0]:
            p = p[:, 1]

        fpr, tpr, thresholds = roc_curve(y, p)
        #works with sklearn 0.11
        if fpr.min() > 0 or tpr.min() > 0:
            fpr = np.hstack((0, fpr))
            tpr = np.hstack((0, tpr))
            thresholds = np.hstack((1.01, thresholds))

        def prob_freq(y, predict_proba):
            #calculate distribution and return in inverse order
            proba_bins = np.unique(predict_proba)
            freq_all = np.bincount(proba_bins.searchsorted(predict_proba))
            freq_0_tempa = np.unique(predict_proba[np.nonzero(y == 0)[0]])
            freq_0_tempb = np.bincount(freq_0_tempa.searchsorted(predict_proba[np.nonzero(y == 0)[0]]))
            freq = np.zeros((proba_bins.shape[0], 3))
            freq[:, 0] = proba_bins
            for i in range(freq_0_tempa.shape[0]):
                freq[np.nonzero(proba_bins == freq_0_tempa[i])[0], 1] = freq_0_tempb[i]
            freq[:, 2] = freq_all - freq[:, 1]
            freq = freq[proba_bins.argsort()[::-1], :]
            pr = freq[:, 2] / freq[:, 1:].sum(axis=1)
            pr = pr.reshape(freq.shape[0], 1)
            #fix when no negatives in range
            pr[pr == 1.0] = 0
            freq = np.hstack((freq, pr))
            return freq

        f = prob_freq(y, p)
        temp_hull = []
        for i in range(fpr.shape[0]):
            temp_hull.append((fpr[i], tpr[i]))
        #close the plane
        temp_hull.append((1, 0))
        rocch_ = _convexhull(temp_hull)
        rocch = np.array([(a, b) for (a, b) in rocch_[:-1]])
        rocch_find = np.zeros(fpr.shape[0], dtype=np.bool)
        for i in range(rocch.shape[0]):
            rocch_find[np.intersect1d(np.nonzero(rocch[i, 0] == fpr)[0],
                                      np.nonzero(rocch[i, 1] == tpr)[0])] = True
        rocch_thresholds = thresholds[rocch_find]
        #calibrated probabilities using ROCCH
        f_cal = np.zeros((rocch_thresholds.shape[0] - 1, 5))
        for i in range(rocch_thresholds.shape[0] - 1):
            f_cal[i, 0] = rocch_thresholds[i]
            f_cal[i, 1] = rocch_thresholds[i + 1]
            join_elements = np.logical_and(f_cal[i, 1] <= f[:, 0], f_cal[i, 0] > f[:, 0])
            f_cal[i, 2] = f[join_elements, 1].sum()
            f_cal[i, 3] = f[join_elements, 2].sum()
        f_cal[:, 4] = f_cal[:, 3] / f_cal[:, [2, 3]].sum(axis=1)
        #fix to add 0
        f_cal[-1, 1] = 0
        calibrated_map = f_cal[:, [0, 1, 4]]

        self.calibration_map = calibrated_map

[docs]    def predict_proba(self, p):
        """ Calculate the calibrated probabilities

        Parameters
        ----------
        y_prob : array-like of shape = [n_samples, 2]
            Predicted probabilities to be calibrated using calibration map

        Returns
        -------
        y_prob_cal : array-like of shape = [n_samples, 1]
            Predicted calibrated probabilities
        """

        # TODO: Check input
        if p.size != p.shape[0]:
            p = p[:, 1]

        calibrated_proba = np.zeros(p.shape[0])
        for i in range(self.calibration_map.shape[0]):
            calibrated_proba[np.logical_and(self.calibration_map[i, 1] <= p, self.calibration_map[i, 0] > p)] = \
                self.calibration_map[i, 2]

        # TODO: return 2D and refactor
        return calibrated_proba


def _convexhull(P):
    """ Private function that calculate the convex hull of a set of points
    The algorithm was taken from [1].
    http://code.activestate.com/recipes/66527-finding-the-convex-hull-of-a-set-of-2d-points/

    References
    ----------

    .. [1] Alex Martelli, Anna Ravenscroft, David Ascher, 'Python Cookbook', O'Reilly Media, Inc., 2005.
    """

    def mydet(p, q, r):
        """Calc. determinant of a special matrix with three 2D points.

        The sign, "-" or "+", determines the side, right or left,
        respectivly, on which the point r lies, when measured against
        a directed vector from p to q.
        """

        # We use Sarrus' Rule to calculate the determinant.
        # (could also use the Numeric package...)
        sum1 = q[0] * r[1] + p[0] * q[1] + r[0] * p[1]
        sum2 = q[0] * p[1] + r[0] * q[1] + p[0] * r[1]

        return sum1 - sum2


    def isrightturn(p, q, r):
        "Do the vectors pq:qr form a right turn, or not?"

        assert p != q and q != r and p != r

        if mydet(p, q, r) < 0:
            return 1
        else:
            return 0

    # Get a local list copy of the points and sort them lexically.
    points = copy.copy(P)
    points.sort()

    # Build upper half of the hull.
    upper = [points[0], points[1]]
    for p in points[2:]:
        upper.append(p)
        while len(upper) > 2 and not isrightturn(upper[-3:][0],upper[-3:][1],upper[-3:][2]):
            del upper[-2]

    # Build lower half of the hull.
    points.reverse()
    lower = [points[0], points[1]]
    for p in points[2:]:
        lower.append(p)
        while len(lower) > 2 and not isrightturn(lower[-3:][0],lower[-3:][1],lower[-3:][2]):
            del lower[-2]

    # Remove duplicates.
    del lower[0]
    del lower[-1]

    # Concatenate both halfs and return.
    return tuple(upper + lower)