# Source code for costcla.sampling.sampling

"""
This module include the sampling methods
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np
from ._smote import _SMOTE

def undersampling(X, y, cost_mat=None, per=0.5):
    """Under-sampling.

    Randomly drop examples of the majority class so that the minority class
    makes up (approximately) a fraction ``per`` of the returned sample.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    y : array-like of shape = [n_samples]
        Ground truth (correct) labels, encoded as 0/1.

    cost_mat : array-like of shape = [n_samples, 4], optional (default=None)
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false
        negatives, true positives and true negatives, for each example.

    per: float, optional (default = 0.5)
        Percentage of the minority class in the under-sampled data

    Returns
    -------
    X_u : array-like
        Under-sampled input samples.

    y_u : array-like
        Labels of the kept examples.

    cost_mat_u : array-like
        Rows of ``cost_mat`` for the kept examples.
        Only returned when ``cost_mat`` is not None.
    """
    n_samples = X.shape[0]

    #TODO: allow y different from (0, 1)
    num_y1 = y.sum()
    num_y0 = n_samples - num_y1

    # One uniform draw per example; a majority-class example is kept when
    # its draw falls at or below the computed keep-probability.
    # (num_y1 + num_y0 is simply n_samples, so draw n_samples values.)
    filter_rand = np.random.rand(n_samples)

    #TODO: rewrite in a more readable way
    if num_y1 < num_y0:
        # Class 1 is the minority: keep every class-1 example and a random
        # subset of class 0 sized so class 1 is a fraction `per` of the result.
        num_y0_new = num_y1 * 1.0 / per - num_y1
        num_y0_new_per = num_y0_new * 1.0 / num_y0
        filter_0 = np.logical_and(y == 0, filter_rand <= num_y0_new_per)
        filter_ = np.nonzero(np.logical_or(y == 1, filter_0))[0]
    else:
        # Class 0 is the minority (or classes are balanced): mirrored logic.
        num_y1_new = num_y0 * 1.0 / per - num_y0
        num_y1_new_per = num_y1_new * 1.0 / num_y1
        filter_1 = np.logical_and(y == 1, filter_rand <= num_y1_new_per)
        filter_ = np.nonzero(np.logical_or(y == 0, filter_1))[0]

    X_u = X[filter_, :]
    y_u = y[filter_]

    if cost_mat is not None:
        cost_mat_u = cost_mat[filter_, :]
        return X_u, y_u, cost_mat_u
    else:
        return X_u, y_u
def smote(X, y, cost_mat=None, per=0.5):
    """SMOTE: synthetic minority over-sampling technique

    Over-sample the minority class with synthetic examples (via ``_SMOTE``)
    until it represents a fraction ``per`` of the returned data.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    y : array-like of shape = [n_samples]
        Ground truth (correct) labels, encoded as 0/1.

    cost_mat : array-like of shape = [n_samples, 4], optional (default=None)
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false
        negatives, true positives and true negatives, for each example.

    per: float, optional (default = 0.5)
        Percentage of the minority class in the over-sampled data

    References
    ----------
    .. [1] N. Chawla, K. Bowyer, L. Hall, W. Kegelmeyer,
           "SMOTE: Synthetic Minority Over-sampling Technique", Journal of
           Artificial Intelligence Research, 16, 321-357, 2002.

    Examples
    --------
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.sampling import smote
    >>> data = load_creditscoring1()
    >>> data_smote, target_smote = smote(data.data, data.target, per=0.7)
    >>> # Size of each training set
    >>> print(data.data.shape[0], data_smote.shape[0])
    112915 204307
    >>> # Percentage of positives in each training set
    >>> print(data.target.mean(), target_smote.mean())
    0.0674489660364 0.484604051746
    """
    #TODO: Add random state
    #TODO: Check input
    n_samples = X.shape[0]

    #TODO: allow y different from (0, 1)
    num_y1 = y.sum()
    num_y0 = n_samples - num_y1

    #TODO: rewrite in a more readable way
    # Identify the minority class and the SMOTE amount N (a percentage in
    # multiples of 100) needed for the minority to reach fraction `per`.
    if num_y1 < num_y0:
        minority = 1
        X_m, X_majority = X[y == 1], X[y == 0]
        N = 100 * int((num_y0 * 1.0 / (1 - per) - num_y0) / num_y1)
    else:
        minority = 0
        X_m, X_majority = X[y == 0], X[y == 1]
        N = 100 * int((num_y1 * 1.0 / (1 - per) - num_y1) / num_y0)

    # Create the synthetic minority examples and stack them after the
    # untouched majority examples.
    X_m_o = _SMOTE(X_m, N, k=3)
    X_s = np.vstack((X_majority, X_m_o))

    # First max(num_y1, num_y0) rows are the majority class; the remainder
    # are the (original + synthetic) minority.  For labels in {0, 1} the
    # majority label is simply 1 - minority.
    y_s = np.full(X_s.shape[0], 1 - minority, dtype=float)
    y_s[max(num_y1, num_y0):] = minority

    #TODO: Include cost_mat
    return X_s, y_s