Source code for costcla.sampling.cost_sampling

"""
This module includes the cost-proportionate sampling methods.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

import numpy as np


def cost_sampling(X, y, cost_mat, method='RejectionSampling',
                  oversampling_norm=0.1, max_wc=97.5):
    """Cost-proportionate sampling.

    Each example gets a weight ``wc`` equal to its misclassification cost
    (false-negative cost for positives, false-positive cost for the rest)
    divided by the ``max_wc``-th percentile of those costs and clipped to 1.
    The weights then drive either rejection sampling or over-sampling.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    y : array-like of shape = [n_samples]
        Ground truth (correct) labels.

    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives,
        false negatives, true positives and true negatives, for each example.
        The array is only read; it is not modified by this function.

    method : str, optional (default = RejectionSampling)
        Method to perform the cost-proportionate sampling,
        either 'RejectionSampling' or 'OverSampling'. Any other value
        returns all the samples unchanged.

    oversampling_norm : float, optional (default = 0.1)
        Normalization value of wc used by 'OverSampling': example ``i`` is
        repeated ``ceil(wc[i] / oversampling_norm)`` times, so the smaller
        the value, the larger the resulting data set.

    max_wc : float, optional (default = 97.5)
        Percentile of the misclassification costs used to normalize them
        (outlier adjustment for the cost).

    Returns
    -------
    x_cps : array of shape = [n_samples_new, n_features]
        The sampled input samples.

    y_cps : array of shape = [n_samples_new]
        The labels of the sampled examples.

    cost_mat_cps : array of shape = [n_samples_new, 4]
        The cost matrix rows of the sampled examples.

    References
    ----------
    .. [1] B. Zadrozny, J. Langford, N. Abe, "Cost-sensitive learning by
           cost-proportionate example weighting", in Proceedings of the
           Third IEEE International Conference on Data Mining, 435-442, 2003.

    .. [2] C. Elkan, "The foundations of Cost-Sensitive Learning",
           in Seventeenth International Joint Conference on Artificial
           Intelligence, 973-978, 2001.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.cross_validation import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.sampling import cost_sampling, undersampling
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> X_cps_o, y_cps_o, cost_mat_cps_o = cost_sampling(X_train, y_train, cost_mat_train, method='OverSampling')
    >>> X_cps_r, y_cps_r, cost_mat_cps_r = cost_sampling(X_train, y_train, cost_mat_train, method='RejectionSampling')
    >>> X_u, y_u, cost_mat_u = undersampling(X_train, y_train, cost_mat_train)
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> y_pred_test_rf_cps_o = RandomForestClassifier(random_state=0).fit(X_cps_o, y_cps_o).predict(X_test)
    >>> y_pred_test_rf_cps_r = RandomForestClassifier(random_state=0).fit(X_cps_r, y_cps_r).predict(X_test)
    >>> y_pred_test_rf_u = RandomForestClassifier(random_state=0).fit(X_u, y_u).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using RandomForest with cost-proportionate over-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_cps_o, cost_mat_test))
    0.192480226286
    >>> # Savings using RandomForest with cost-proportionate rejection-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_cps_r, cost_mat_test))
    0.465830173459
    >>> # Savings using RandomForest with under-sampling
    >>> print(savings_score(y_test, y_pred_test_rf_u, cost_mat_test))
    0.466630646543
    >>> # Size of each training set
    >>> print(X_train.shape[0], X_cps_o.shape[0], X_cps_r.shape[0], X_u.shape[0])
    75653 109975 8690 10191
    >>> # Percentage of positives in each training set
    >>> print(y_train.mean(), y_cps_o.mean(), y_cps_r.mean(), y_u.mean())
    0.0668182358928 0.358054103205 0.436939010357 0.49602590521
    """
    # TODO: Check consistency of input

    # The methods are constructed only for the misclassification costs,
    # not the full cost matrix: false-negative cost for the positives,
    # false-positive cost for everything else.
    # np.where builds a new array, so the caller's cost_mat is left intact
    # (assigning into the cost_mat[:, 0] view would overwrite its column 0).
    is_positive = np.asarray(y) == 1
    cost_mis = np.where(is_positive, cost_mat[:, 1], cost_mat[:, 0])

    # Normalize by a high percentile instead of the maximum so a few outlier
    # costs do not squash every other weight towards zero; clip at 1.
    wc = np.minimum(cost_mis / np.percentile(cost_mis, max_wc), 1)

    n_samples = X.shape[0]

    if method == 'RejectionSampling':
        # under-sampling by rejection [1]: keep example i with probability
        # wc[i].
        # TODO: Add random state
        rej_rand = np.random.rand(n_samples)
        filter_ = rej_rand <= wc
    elif method == 'OverSampling':
        # over-sampling with normalized wc [2]: repeat the index of example i
        # ceil(wc[i] / oversampling_norm) times.
        # The builtin int replaces the np.int alias removed from NumPy.
        wc_n = np.ceil(wc / oversampling_norm).astype(int)
        filter_ = np.repeat(np.arange(n_samples), wc_n)
    else:
        # Unrecognized method: keep every sample once, in order.
        filter_ = np.arange(n_samples)

    x_cps = X[filter_]
    y_cps = y[filter_]
    cost_mat_cps = cost_mat[filter_]

    return x_cps, y_cps, cost_mat_cps