# Source code for costcla.sampling.sampling

"""
This module includes the sampling methods.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np
from ._smote import _SMOTE

def undersampling(X, y, cost_mat=None, per=0.5):
    """Under-sampling.

    Keeps every example of the minority class and a random subset of the
    majority class, chosen so that the minority class makes up roughly
    ``per`` of the returned data. Each majority example is kept
    independently with the same probability, so the resulting proportion
    is only approximate. When the two classes have the same size, class 0
    is treated as the minority.

    Parameters
    ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        y : array-like of shape = [n_samples]
            Ground truth (correct) labels. Only labels in (0, 1) are
            supported.

        cost_mat : array-like of shape = [n_samples, 4], optional (default=None)
            Cost matrix of the classification problem
            where the columns represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.

        per: float, optional (default = 0.5)
            Percentage of the minority class in the under-sampled data.
            Must satisfy 0 < per <= 1.

    Returns
    -------
        X_u : array
            The rows of X that were kept.

        y_u : array
            The labels of the kept rows.

        cost_mat_u : array
            The rows of cost_mat that were kept. Only returned when
            cost_mat is not None.

    Raises
    ------
        ValueError
            If per is not in the interval (0, 1].
    """
    if not 0 < per <= 1:
        raise ValueError("per must be in the interval (0, 1]; got %r" % (per,))

    # Plain Python sequences do not support ``.shape``, ``.sum()`` or
    # fancy indexing; other containers are left untouched so that callers
    # get back the same kind of object they passed in.
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)
    if isinstance(cost_mat, (list, tuple)):
        cost_mat = np.asarray(cost_mat)

    n_samples = X.shape[0]
    # TODO: allow y different from (0, 1)
    num_y1 = y.sum()
    num_y0 = n_samples - num_y1

    # One uniform draw per example; only the draws of majority-class
    # examples influence the outcome.
    filter_rand = np.random.rand(n_samples)

    if num_y1 < num_y0:
        minority_label, n_minority, n_majority = 1, num_y1, num_y0
    else:
        minority_label, n_minority, n_majority = 0, num_y0, num_y1
    majority_label = 1 - minority_label

    # Number of majority examples needed for the minority class to
    # represent ``per`` of the output, expressed as a keep-probability.
    n_majority_new = n_minority * 1.0 / per - n_minority
    # n_majority is only zero for empty input; avoid 0/0 in that case.
    keep_fraction = n_majority_new / n_majority if n_majority else 0.0

    keep_majority = np.logical_and(y == majority_label,
                                   filter_rand <= keep_fraction)
    filter_ = np.nonzero(np.logical_or(y == minority_label, keep_majority))[0]

    X_u = X[filter_, :]
    y_u = y[filter_]

    if cost_mat is not None:
        cost_mat_u = cost_mat[filter_, :]
        return X_u, y_u, cost_mat_u

    return X_u, y_u
    

def smote(X, y, cost_mat=None, per=0.5):
    """SMOTE: synthetic minority over-sampling technique

    Builds a new data set made of the majority-class rows of X followed by
    the rows produced by ``_SMOTE`` from the minority-class rows. When the
    two classes have the same size, class 0 is treated as the minority.

    Parameters
    ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        y : array-like of shape = [n_samples]
            Ground truth (correct) labels. Only labels in (0, 1) are
            supported.

        cost_mat : array-like of shape = [n_samples, 4], optional (default=None)
            Cost matrix of the classification problem
            where the columns represent the costs of: false positives, false negatives,
            true positives and true negatives, for each example.
            Currently ignored: no cost matrix is returned.

        per: float, optional (default = 0.5)
            Percentage of the minority class in the over-sampled data.
            Must satisfy 0 < per < 1.

    Returns
    -------
        X_s : array
            Majority-class rows of X stacked on top of the rows returned
            by ``_SMOTE``.

        y_s : array of float
            Labels for X_s: the majority label for the first rows, the
            minority label for the rows appended after them.

    Raises
    ------
        ValueError
            If per is not in the interval (0, 1), or if one of the two
            classes has no examples.

    References
    ----------

    .. [1] N. Chawla, K. Bowyer, L. Hall, W. Kegelmeyer, "SMOTE: Synthetic Minority Over-sampling Technique",
           Journal of Artificial Intelligence Research, 16, 321-357, 2002.

    Examples
    --------
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.sampling import smote
    >>> data = load_creditscoring1()
    >>> data_smote, target_smote = smote(data.data, data.target, per=0.7)
    >>> # Size of each training set
    >>> print(data.data.shape[0], data_smote.shape[0])
    112915 204307
    >>> # Percentage of positives in each training set
    >>> print(data.target.mean(), target_smote.mean())
    0.0674489660364 0.484604051746
    """
    # TODO: Add random state
    # TODO: Check input
    if not 0 < per < 1:
        raise ValueError("per must be in the interval (0, 1); got %r" % (per,))

    # Plain Python sequences do not support ``.shape``, ``.sum()`` or
    # boolean-mask indexing.
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)
    if isinstance(y, (list, tuple)):
        y = np.asarray(y)

    n_samples = X.shape[0]
    # TODO: allow y different from (0, 1)
    # Cast to int so the count can be used as a slice index below even
    # when y has a floating-point dtype.
    num_y1 = int(y.sum())
    num_y0 = n_samples - num_y1

    if num_y1 < num_y0:
        minority, n_minority, n_majority = 1, num_y1, num_y0
    else:
        minority, n_minority, n_majority = 0, num_y0, num_y1
    majority = 1 - minority

    if n_minority == 0:
        raise ValueError("smote requires at least one example of each class")

    # Amount of over-sampling handed to _SMOTE: the whole-number multiple
    # of the minority size needed to reach ``per``, times 100.
    N = int((n_majority * 1.0 / (1 - per) - n_majority) / n_minority) * 100
    X_m = X[y == minority]
    X_majority = X[y == majority]

    X_m_o = _SMOTE(X_m, N, k=3)

    X_s = np.vstack((X_majority, X_m_o))

    # The first n_majority rows are the untouched majority examples;
    # everything stacked after them is labelled as the minority class.
    y_s = np.ones(X_s.shape[0]) * majority
    y_s[n_majority:] = minority

    # TODO: Include cost_mat

    return X_s, y_s