# costcla.models.regression

"""
This module includes the cost-sensitive logistic regression method.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np
import math
from scipy.optimize import minimize
from sklearn.base import BaseEstimator
# from sklearn.linear_model.logistic import _intercept_dot
from pyea import GeneticAlgorithmOptimizer
from ..metrics import cost_loss

# Not in sklearn 0.15; it is in 0.16-git.
#TODO: replace once sklearn 0.16 is released
# The one in sklearn 0.16 returns yz instead of z; therefore,
# the impact on the code should be addressed before making the change.
def _intercept_dot(w, X):
    """Computes y * np.dot(X, w).

    It takes into consideration if the intercept should be fit or not.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    """
    c = 0.
    if w.size == X.shape[1] + 1:
        c = w[-1]
        w = w[:-1]

    z = np.dot(X, w) + c
    return w, c, z
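
# Illustrative sketch (not part of the original module): with two features, a
# 3-element w is split into coefficients plus an intercept, while a 2-element w
# leaves the intercept at 0. The values below are synthetic.
#
#   >>> import numpy as np
#   >>> X = np.array([[1., 2.], [3., 4.]])
#   >>> _intercept_dot(np.array([0.5, -0.5, 1.0]), X)  # returns (w, c, z) with c == 1.0
#   >>> _intercept_dot(np.array([0.5, -0.5]), X)       # returns (w, c, z) with c == 0.0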


def _sigmoid(z):
    """ Private function that calculate the sigmoid function """
    return 1 / (1 + np.exp(-z))


def _logistic_cost_loss_i(w, X, y, cost_mat, alpha):
    """Computes the regularized cost-sensitive logistic loss for a single coefficient vector w."""
    n_samples = X.shape[0]
    w, c, z = _intercept_dot(w, X)
    y_prob = _sigmoid(z)

    out = cost_loss(y, y_prob, cost_mat) / n_samples
    out += .5 * alpha * np.dot(w, w)
    return out
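
# Note (illustrative sketch, not part of the original module): the quantity
# minimized above is the average example-dependent cost of reference [1] in the
# class docstring below, plus an L2 penalty. Assuming cost_loss(y, y_prob, cost_mat)
# reduces to the expected cost under the [fp, fn, tp, tn] column order
# documented in _logistic_cost_loss, an equivalent vectorized form would be:
#
#   >>> expected_cost = (y * (y_prob * cost_mat[:, 2] + (1 - y_prob) * cost_mat[:, 1])
#   ...                  + (1 - y) * (y_prob * cost_mat[:, 0] + (1 - y_prob) * cost_mat[:, 3]))
#   >>> out = expected_cost.sum() / n_samples + .5 * alpha * np.dot(w, w)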


def _logistic_cost_loss(w, X, y, cost_mat, alpha):
    """Computes the logistic loss.

    Parameters
    ----------
    w : array-like, shape (n_w, n_features,) or (n_w, n_features + 1,)
        Coefficient vector or matrix of coefficient.

    X : array-like, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem,
        where the columns represent the costs of: false positives, false negatives,
        true positives and true negatives, for each example.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    Returns
    -------
    out : float or ndarray, shape (n_w,)
        Cost-sensitive logistic loss; one value per evaluated coefficient vector.
    """

    if w.shape[0] == w.size:
        # Only evaluating one w
        return _logistic_cost_loss_i(w, X, y, cost_mat, alpha)

    else:
        # Evaluating a set of w
        n_w = w.shape[0]
        out = np.zeros(n_w)

        for i in range(n_w):
            out[i] = _logistic_cost_loss_i(w[i], X, y, cost_mat, alpha)

        return out
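
# Illustrative sketch (synthetic data, not part of the original module): the
# dispatch above lets the same objective score a single coefficient vector
# (as BFGS requires) or a whole population of candidate vectors at once
# (as the 'ga' solver requires).
#
#   >>> import numpy as np
#   >>> X = np.array([[1., 0.], [0., 1.], [1., 1.]])
#   >>> y = np.array([1, 0, 1])
#   >>> cost_mat = np.tile([1., 2., 0., 0.], (3, 1))  # columns: [fp, fn, tp, tn]
#   >>> _logistic_cost_loss(np.zeros(3), X, y, cost_mat, alpha=1.0)       # single w -> scalar
#   >>> _logistic_cost_loss(np.zeros((5, 3)), X, y, cost_mat, alpha=1.0)  # 5 candidates -> shape (5,)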


class CostSensitiveLogisticRegression(BaseEstimator):
    """An example-dependent cost-sensitive Logistic Regression classifier.

    Parameters
    ----------
    C : float, optional (default=1.0)
        Inverse of regularization strength; must be a positive float.
        Like in support vector machines, smaller values specify stronger
        regularization.

    fit_intercept : bool, default: True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.

    max_iter : int
        Useful only for the ga and bfgs solvers. Maximum number of
        iterations taken for the solvers to converge.

    random_state : int seed, RandomState instance, or None (default)
        The seed of the pseudo random number generator to use when
        shuffling the data.

    solver : {'ga', 'bfgs'}
        Algorithm to use in the optimization problem.

    tol : float, optional
        Tolerance for stopping criteria.

    verbose : int, optional (default=0)
        Controls the verbosity of the optimization process.

    Attributes
    ----------
    `coef_` : array, shape (n_features,)
        Coefficient of the features in the decision function.

    `intercept_` : float
        Intercept (a.k.a. bias) added to the decision function.
        If `fit_intercept` is set to False, the intercept is set to zero.

    See also
    --------
    sklearn.tree.DecisionTreeClassifier

    References
    ----------

    .. [1] A. Correa Bahnsen, D. Aouada, B. Ottersten, `"Example-Dependent Cost-Sensitive Logistic Regression
           for Credit Scoring" <http://albahnsen.com/files/Example-Dependent%20Cost-Sensitive%20Logistic%20Regression%20for%20Credit%20Scoring_publish.pdf>`__,
           in Proceedings of the International Conference on Machine Learning and Applications, 2014.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.cross_validation import train_test_split
    >>> from costcla.datasets import load_creditscoring2
    >>> from costcla.models import CostSensitiveLogisticRegression
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring2()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_lr = LogisticRegression(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitiveLogisticRegression()
    >>> f.fit(X_train, y_train, cost_mat_train)
    >>> y_pred_test_cslr = f.predict(X_test)
    >>> # Savings using Logistic Regression
    >>> print(savings_score(y_test, y_pred_test_lr, cost_mat_test))
    0.00283419465107
    >>> # Savings using Cost Sensitive Logistic Regression
    >>> print(savings_score(y_test, y_pred_test_cslr, cost_mat_test))
    0.142872237978
    """

    def __init__(self,
                 C=1.0,
                 fit_intercept=True,
                 max_iter=100,
                 random_state=None,
                 solver='ga',
                 tol=1e-4,
                 verbose=0):

        self.C = C
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.random_state = random_state
        self.solver = solver
        self.tol = tol
        self.coef_ = None
        self.intercept_ = 0.
        self.verbose = verbose

    def fit(self, X, y, cost_mat):
        """Build an example-dependent cost-sensitive logistic regression from the training set (X, y, cost_mat).

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        y : array-like of shape = [n_samples]
            Ground truth (correct) labels.

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem,
            where the columns represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        Returns
        -------
        self : object
            Returns self.
        """
        #TODO: Check input

        n_features = X.shape[1]
        if self.fit_intercept:
            w0 = np.zeros(n_features + 1)
        else:
            w0 = np.zeros(n_features)

        if self.solver == 'ga':
            #TODO: add n_jobs
            res = GeneticAlgorithmOptimizer(_logistic_cost_loss,
                                            w0.shape[0],
                                            iters=self.max_iter,
                                            type_='cont',
                                            n_chromosomes=100,
                                            per_mutations=0.25,
                                            n_elite=10,
                                            fargs=(X, y, cost_mat, 1. / self.C),
                                            range_=(-5, 5),
                                            n_jobs=1,
                                            verbose=self.verbose)
            res.fit()

        elif self.solver == 'bfgs':
            if self.verbose > 0:
                disp = True
            else:
                disp = False

            res = minimize(_logistic_cost_loss,
                           w0,
                           method='BFGS',
                           args=(X, y, cost_mat, 1. / self.C),
                           tol=self.tol,
                           options={'maxiter': self.max_iter, 'disp': disp})

        if self.fit_intercept:
            self.coef_ = res.x[:-1]
            self.intercept_ = res.x[-1]
        else:
            self.coef_ = res.x

        return self

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for both classes are ordered by the label of classes.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        T : array-like, shape = [n_samples, 2]
            Returns the probability of the sample for each class in the model.
        """
        y_prob = np.zeros((X.shape[0], 2))
        y_prob[:, 1] = _sigmoid(np.dot(X, self.coef_) + self.intercept_)
        y_prob[:, 0] = 1 - y_prob[:, 1]
        return y_prob

    def predict(self, X, cut_point=0.5):
        """Predicted class.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        cut_point : float, optional (default=0.5)
            Decision threshold applied to the predicted probability of the positive class.

        Returns
        -------
        T : array-like, shape = [n_samples]
            Returns the prediction of the sample.
        """
        return np.floor(self.predict_proba(X)[:, 1] + (1 - cut_point))
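
# Illustrative usage sketch (synthetic data, not part of the original module):
# a lower cut_point makes predict flag more positives, which can pay off when
# false negatives carry the larger cost in cost_mat.
#
#   >>> import numpy as np
#   >>> rng = np.random.RandomState(0)
#   >>> X = rng.rand(20, 3)
#   >>> y = (X[:, 0] > 0.5).astype(int)
#   >>> cost_mat = np.tile([1., 5., 0., 0.], (20, 1))  # columns: [fp, fn, tp, tn]
#   >>> f = CostSensitiveLogisticRegression(solver='bfgs', max_iter=10)
#   >>> f.fit(X, y, cost_mat)
#   >>> f.predict(X, cut_point=0.3)  # flags more positives than the default 0.5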