# costcla.models.regression

"""
This module includes the cost-sensitive logistic regression method.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np
import math
from scipy.optimize import minimize
from sklearn.base import BaseEstimator
# from sklearn.linear_model.logistic import _intercept_dot
from pyea import GeneticAlgorithmOptimizer
from ..metrics import cost_loss

# Not in sklearn 0.15; it is in 0.16-git.
#TODO: replace once sklearn 0.16 is released
# The one in sklearn 0.16 returns yz instead of z; therefore,
# the impact on the code should be addressed before making the change.
def _intercept_dot(w, X):
    """Computes y * np.dot(X, w).

    It takes into consideration if the intercept should be fit or not.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    """
    c = 0.
    if w.size == X.shape[1] + 1:
        c = w[-1]
        w = w[:-1]

    z = np.dot(X, w) + c
    return w, c, z
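
# Illustrative sketch (not part of the original module): with two features, a
# 3-element w is split into coefficients plus an intercept, while a 2-element w
# leaves the intercept at 0. The values below are synthetic.
#
#   >>> import numpy as np
#   >>> X = np.array([[1., 2.], [3., 4.]])
#   >>> _intercept_dot(np.array([0.5, -0.5, 1.0]), X)  # returns (w, c, z) with c == 1.0
#   >>> _intercept_dot(np.array([0.5, -0.5]), X)       # returns (w, c, z) with c == 0.0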


def _sigmoid(z):
    """ Private function that calculate the sigmoid function """
    return 1 / (1 + np.exp(-z))


def _logistic_cost_loss_i(w, X, y, cost_mat, alpha):
    """Computes the regularized cost-sensitive logistic loss for a single coefficient vector w."""
    n_samples = X.shape[0]
    w, c, z = _intercept_dot(w, X)
    y_prob = _sigmoid(z)

    out = cost_loss(y, y_prob, cost_mat) / n_samples
    out += .5 * alpha * np.dot(w, w)
    return out
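
# Note (illustrative sketch, not part of the original module): the quantity
# minimized above is the average example-dependent cost of reference [1] in the
# class docstring below, plus an L2 penalty. Assuming cost_loss(y, y_prob, cost_mat)
# reduces to the expected cost under the [fp, fn, tp, tn] column order
# documented in _logistic_cost_loss, an equivalent vectorized form would be:
#
#   >>> expected_cost = (y * (y_prob * cost_mat[:, 2] + (1 - y_prob) * cost_mat[:, 1])
#   ...                  + (1 - y) * (y_prob * cost_mat[:, 0] + (1 - y_prob) * cost_mat[:, 3]))
#   >>> out = expected_cost.sum() / n_samples + .5 * alpha * np.dot(w, w)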


def _logistic_cost_loss(w, X, y, cost_mat, alpha):
    """Computes the logistic loss.

    Parameters
    ----------
    w : array-like, shape (n_w, n_features,) or (n_w, n_features + 1,)
        Coefficient vector or matrix of coefficient.

    X : array-like, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem,
        where the columns represent the costs of: false positives, false negatives,
        true positives and true negatives, for each example.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    Returns
    -------
    out : float or ndarray, shape (n_w,)
        Cost-sensitive logistic loss; one value per evaluated coefficient vector.
    """

    if w.shape[0] == w.size:
        # Only evaluating one w
        return _logistic_cost_loss_i(w, X, y, cost_mat, alpha)

    else:
        # Evaluating a set of w
        n_w = w.shape[0]
        out = np.zeros(n_w)

        for i in range(n_w):
            out[i] = _logistic_cost_loss_i(w[i], X, y, cost_mat, alpha)

        return out
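
# Illustrative sketch (synthetic data, not part of the original module): the
# dispatch above lets the same objective score a single coefficient vector
# (as BFGS requires) or a whole population of candidate vectors at once
# (as the 'ga' solver requires).
#
#   >>> import numpy as np
#   >>> X = np.array([[1., 0.], [0., 1.], [1., 1.]])
#   >>> y = np.array([1, 0, 1])
#   >>> cost_mat = np.tile([1., 2., 0., 0.], (3, 1))  # columns: [fp, fn, tp, tn]
#   >>> _logistic_cost_loss(np.zeros(3), X, y, cost_mat, alpha=1.0)       # single w -> scalar
#   >>> _logistic_cost_loss(np.zeros((5, 3)), X, y, cost_mat, alpha=1.0)  # 5 candidates -> shape (5,)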


class CostSensitiveLogisticRegression(BaseEstimator):
    """An example-dependent cost-sensitive Logistic Regression classifier.

    Parameters
    ----------
    C : float, optional (default=1.0)
        Inverse of regularization strength; must be a positive float.
        Like in support vector machines, smaller values specify stronger
        regularization.

    fit_intercept : bool, default: True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.

    max_iter : int
        Useful only for the ga and bfgs solvers. Maximum number of
        iterations taken for the solvers to converge.

    random_state : int seed, RandomState instance, or None (default)
        The seed of the pseudo random number generator to use when
        shuffling the data.

    solver : {'ga', 'bfgs'}
        Algorithm to use in the optimization problem.

    tol : float, optional
        Tolerance for stopping criteria.

    verbose : int, optional (default=0)
        Controls the verbosity of the optimization process.

    Attributes
    ----------
    `coef_` : array, shape (n_features,)
        Coefficient of the features in the decision function.

    `intercept_` : float
        Intercept (a.k.a. bias) added to the decision function.
        If `fit_intercept` is set to False, the intercept is set to zero.

    See also
    --------
    sklearn.tree.DecisionTreeClassifier

    References
    ----------

    .. [1] A. Correa Bahnsen, D. Aouada, B. Ottersten, `"Example-Dependent Cost-Sensitive Logistic Regression
           for Credit Scoring" <http://albahnsen.com/files/Example-Dependent%20Cost-Sensitive%20Logistic%20Regression%20for%20Credit%20Scoring_publish.pdf>`__,
           in Proceedings of the International Conference on Machine Learning and Applications, 2014.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.cross_validation import train_test_split
    >>> from costcla.datasets import load_creditscoring2
    >>> from costcla.models import CostSensitiveLogisticRegression
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring2()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_lr = LogisticRegression(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitiveLogisticRegression()
    >>> f.fit(X_train, y_train, cost_mat_train)
    >>> y_pred_test_cslr = f.predict(X_test)
    >>> # Savings using Logistic Regression
    >>> print(savings_score(y_test, y_pred_test_lr, cost_mat_test))
    0.00283419465107
    >>> # Savings using Cost Sensitive Logistic Regression
    >>> print(savings_score(y_test, y_pred_test_cslr, cost_mat_test))
    0.142872237978
    """

    def __init__(self,
                 C=1.0,
                 fit_intercept=True,
                 max_iter=100,
                 random_state=None,
                 solver='ga',
                 tol=1e-4,
                 verbose=0):

        self.C = C
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.random_state = random_state
        self.solver = solver
        self.tol = tol
        self.coef_ = None
        self.intercept_ = 0.
        self.verbose = verbose

    def fit(self, X, y, cost_mat):
        """Build an example-dependent cost-sensitive logistic regression from the training set (X, y, cost_mat).

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        y : array-like of shape = [n_samples]
            Ground truth (correct) labels.

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem,
            where the columns represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        Returns
        -------
        self : object
            Returns self.
        """
        #TODO: Check input

        n_features = X.shape[1]
        if self.fit_intercept:
            w0 = np.zeros(n_features + 1)
        else:
            w0 = np.zeros(n_features)

        if self.solver == 'ga':
            #TODO: add n_jobs
            res = GeneticAlgorithmOptimizer(_logistic_cost_loss,
                                            w0.shape[0],
                                            iters=self.max_iter,
                                            type_='cont',
                                            n_chromosomes=100,
                                            per_mutations=0.25,
                                            n_elite=10,
                                            fargs=(X, y, cost_mat, 1. / self.C),
                                            range_=(-5, 5),
                                            n_jobs=1,
                                            verbose=self.verbose)
            res.fit()

        elif self.solver == 'bfgs':
            if self.verbose > 0:
                disp = True
            else:
                disp = False

            res = minimize(_logistic_cost_loss,
                           w0,
                           method='BFGS',
                           args=(X, y, cost_mat, 1. / self.C),
                           tol=self.tol,
                           options={'maxiter': self.max_iter, 'disp': disp})

        if self.fit_intercept:
            self.coef_ = res.x[:-1]
            self.intercept_ = res.x[-1]
        else:
            self.coef_ = res.x

        return self

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for both classes are ordered by the label of classes.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        T : array-like, shape = [n_samples, 2]
            Returns the probability of the sample for each class in the model.
        """
        y_prob = np.zeros((X.shape[0], 2))
        y_prob[:, 1] = _sigmoid(np.dot(X, self.coef_) + self.intercept_)
        y_prob[:, 0] = 1 - y_prob[:, 1]
        return y_prob

    def predict(self, X, cut_point=0.5):
        """Predicted class.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        cut_point : float, optional (default=0.5)
            Decision threshold applied to the predicted probability of the positive class.

        Returns
        -------
        T : array-like, shape = [n_samples]
            Returns the prediction of the sample.
        """
        return np.floor(self.predict_proba(X)[:, 1] + (1 - cut_point))
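
# Illustrative usage sketch (synthetic data, not part of the original module):
# a lower cut_point makes predict flag more positives, which can pay off when
# false negatives carry the larger cost in cost_mat.
#
#   >>> import numpy as np
#   >>> rng = np.random.RandomState(0)
#   >>> X = rng.rand(20, 3)
#   >>> y = (X[:, 0] > 0.5).astype(int)
#   >>> cost_mat = np.tile([1., 5., 0., 0.], (20, 1))  # columns: [fp, fn, tp, tn]
#   >>> f = CostSensitiveLogisticRegression(solver='bfgs', max_iter=10)
#   >>> f.fit(X, y, cost_mat)
#   >>> f.predict(X, cut_point=0.3)  # flags more positives than the default 0.5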