Source code for costcla.sampling.cost_sampling

"""
This module includes the cost-proportionate sampling methods.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np


def cost_sampling(X, y, cost_mat, method='RejectionSampling', oversampling_norm=0.1, max_wc=97.5,
                  random_state=None):
    """Cost-proportionate sampling.

    Each example is weighted by its misclassification cost (the false
    positive cost for negatives, the false negative cost for positives),
    normalized by the ``max_wc`` percentile of those costs and capped at 1.
    The weights are then used either to reject examples at random
    (under-sampling) or to replicate them (over-sampling).

    The inputs are never modified; the returned arrays are new arrays
    built by indexing the inputs.

    Parameters
    ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        y : array-like of shape = [n_samples]
            Ground truth (correct) labels.

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem
            Where the columns represents the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        method : str, optional (default = RejectionSampling)
            Method to perform the cost-proportionate sampling,
            either 'RejectionSampling' or 'OverSampling'.
            Any other value returns all the samples unchanged.

        oversampling_norm: float, optional (default = 0.1)
            Normalization value of wc; the smaller the value, the bigger
            the over-sampled data set. Only used by 'OverSampling'.

        max_wc: float, optional (default = 97.5)
            Percentile of the misclassification costs used as the
            normalization constant (outlier adjustment for the cost).

        random_state : int, numpy.random.RandomState or None, optional (default = None)
            Source of randomness for 'RejectionSampling'. If None, the
            global ``numpy.random`` state is used.

    Returns
    -------
        x_cps : numpy.ndarray
            The sampled input samples.

        y_cps : numpy.ndarray
            The labels of the sampled examples.

        cost_mat_cps : numpy.ndarray
            The rows of ``cost_mat`` of the sampled examples.

    References
    ----------

    .. [1] B. Zadrozny, J. Langford, N. Naoki, "Cost-sensitive learning by
           cost-proportionate example weighting", in Proceedings of the
           Third IEEE International Conference on Data Mining, 435-442, 2003.

    .. [2] C. Elkan, "The foundations of Cost-Sensitive Learning",
           in Seventeenth International Joint Conference on Artificial Intelligence,
           973-978, 2001.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.sampling import cost_sampling, undersampling
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> X_cps_o, y_cps_o, cost_mat_cps_o =  cost_sampling(X_train, y_train, cost_mat_train, method='OverSampling')
    >>> X_cps_r, y_cps_r, cost_mat_cps_r =  cost_sampling(X_train, y_train, cost_mat_train, method='RejectionSampling')
    >>> X_u, y_u, cost_mat_u = undersampling(X_train, y_train, cost_mat_train)
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> y_pred_test_rf_cps_o = RandomForestClassifier(random_state=0).fit(X_cps_o, y_cps_o).predict(X_test)
    >>> y_pred_test_rf_cps_r = RandomForestClassifier(random_state=0).fit(X_cps_r, y_cps_r).predict(X_test)
    >>> y_pred_test_rf_u = RandomForestClassifier(random_state=0).fit(X_u, y_u).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using RandomForest with cost-proportionate over-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_cps_o, cost_mat_test))
    0.192480226286
    >>> # Savings using RandomForest with cost-proportionate rejection-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_cps_r, cost_mat_test))
    0.465830173459
    >>> # Savings using RandomForest with under-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_u, cost_mat_test))
    0.466630646543
    >>> # Size of each training set
    >>> print(X_train.shape[0], X_cps_o.shape[0], X_cps_r.shape[0], X_u.shape[0])
    75653 109975 8690 10191
    >>> # Percentage of positives in each training set
    >>> print(y_train.mean(), y_cps_o.mean(), y_cps_r.mean(), y_u.mean())
    0.0668182358928 0.358054103205 0.436939010357 0.49602590521
    """

    # Accept any array-like (lists, tuples, ...) as documented.
    X = np.asarray(X)
    y = np.asarray(y)
    cost_mat = np.asarray(cost_mat)

    # The methods are constructed only for the misclassification costs, not
    # the full cost matrix: false-negative cost for positives, false-positive
    # cost otherwise. np.where builds a NEW array, so the caller's cost_mat
    # is left untouched (slicing a column and assigning into it would write
    # through the view into cost_mat).
    cost_mis = np.where(y == 1, cost_mat[:, 1], cost_mat[:, 0])

    # Normalize by a high percentile instead of the maximum so that a few
    # extreme costs do not squash every other weight towards zero.
    wc = np.minimum(cost_mis / np.percentile(cost_mis, max_wc), 1)

    n_samples = X.shape[0]

    if method == 'RejectionSampling':
        # under-sampling by rejection [1]: keep example i with probability wc[i]
        if random_state is None:
            rng = np.random
        elif isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)

        filter_ = rng.rand(n_samples) <= wc

    elif method == 'OverSampling':
        # over-sampling with normalized wn [2]: example i is replicated
        # ceil(wc[i] / oversampling_norm) times (zero-cost examples vanish).
        wc_n = np.ceil(wc / oversampling_norm).astype(int)

        filter_ = np.repeat(np.arange(n_samples), wc_n)

    else:
        # Unrecognised method: every sample is kept exactly once.
        filter_ = np.arange(n_samples)

    x_cps = X[filter_]
    y_cps = y[filter_]
    cost_mat_cps = cost_mat[filter_]

    return x_cps, y_cps, cost_mat_cps