
"""
This module includes the cost-sensitive ensemble methods.
"""

# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause

from sklearn.model_selection import train_test_split
from ..models import CostSensitiveDecisionTreeClassifier
from ..models.bagging import BaggingClassifier


class CostSensitiveRandomForestClassifier(BaggingClassifier):
    """An example-dependent cost-sensitive random forest classifier.

    Parameters
    ----------
    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    combination : string, optional (default="majority_voting")
        Which combination method to use:
          - If "majority_voting" then combine by majority voting
          - If "weighted_voting" then combine by weighted voting using the
            out of bag savings as the weight for each estimator.
          - If "stacking" then a Cost Sensitive Logistic Regression is used
            to learn the combination.
          - If "stacking_proba" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the combination.
          - If "stacking_bmr" then a Cost Sensitive Logistic Regression is used
            to learn the probabilities and a BayesMinimumRisk for the prediction.
          - If "stacking_proba_bmr" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the probabilities,
            and a BayesMinimumRisk for the prediction.
          - If "majority_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of majority_voting.
          - If "weighted_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of weighted_voting.

    max_features : int, float, string or None, optional (default=None)
        The number of features to consider when looking for the best split in each tree:
          - If int, then consider `max_features` features at each split.
          - If float, then `max_features` is a percentage and
            `int(max_features * n_features)` features are considered at each split.
          - If "auto", then `max_features=sqrt(n_features)`.
          - If "sqrt", then `max_features=sqrt(n_features)`.
          - If "log2", then `max_features=log2(n_features)`.
          - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires
        effectively inspecting more than ``max_features`` features.

    pruned : bool, optional (default=False)
        Whether or not to prune the decision tree using cost-based pruning.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    `base_estimator_` : estimator
        The base estimator from which the ensemble is grown.

    `estimators_` : list of estimators
        The collection of fitted base estimators.

    `estimators_samples_` : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base estimator.

    `estimators_features_` : list of arrays
        The subset of drawn features for each base estimator.

    See also
    --------
    costcla.models.CostSensitiveDecisionTreeClassifier

    References
    ----------
    .. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
           `"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
           2015, http://arxiv.org/abs/1505.04637.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.models import CostSensitiveRandomForestClassifier
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitiveRandomForestClassifier()
    >>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using CostSensitiveRandomForestClassifier
    >>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
    0.499390945808
    """
    def __init__(self,
                 n_estimators=10,
                 combination='majority_voting',
                 max_features='auto',
                 n_jobs=1,
                 verbose=False,
                 pruned=False):
        super(CostSensitiveRandomForestClassifier, self).__init__(
            base_estimator=CostSensitiveDecisionTreeClassifier(max_features=max_features, pruned=pruned),
            n_estimators=n_estimators,
            max_samples=1.0,
            max_features=1.0,
            bootstrap=True,
            bootstrap_features=False,
            combination=combination,
            n_jobs=n_jobs,
            random_state=None,
            verbose=verbose)
        self.pruned = pruned
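
# A minimal usage sketch, added here for illustration (it is not part of the
# original costcla module): it shows how the non-default ``combination``
# strategies documented above are selected. The data loading and split mirror
# the docstring example; comparing 'weighted_voting' against 'stacking' is an
# arbitrary choice, any of the documented options fits the same call.
def _demo_combination_methods():
    from sklearn.model_selection import train_test_split
    from costcla.datasets import load_creditscoring1
    from costcla.metrics import savings_score

    data = load_creditscoring1()
    X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = train_test_split(
        data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)

    # Fit the same ensemble under two of the documented combination methods
    # and report the savings of each on the held-out fold.
    for combination in ('weighted_voting', 'stacking'):
        f = CostSensitiveRandomForestClassifier(combination=combination)
        y_pred = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
        print(combination, savings_score(y_test, y_pred, cost_mat_test))
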
class CostSensitiveBaggingClassifier(BaggingClassifier):
    """An example-dependent cost-sensitive bagging classifier.

    Parameters
    ----------
    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=0.5)
        The number of samples to draw from X to train each base estimator.
          - If int, then draw `max_samples` samples.
          - If float, then draw `max_samples * X.shape[0]` samples.

    combination : string, optional (default="majority_voting")
        Which combination method to use:
          - If "majority_voting" then combine by majority voting
          - If "weighted_voting" then combine by weighted voting using the
            out of bag savings as the weight for each estimator.
          - If "stacking" then a Cost Sensitive Logistic Regression is used
            to learn the combination.
          - If "stacking_proba" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the combination.
          - If "stacking_bmr" then a Cost Sensitive Logistic Regression is used
            to learn the probabilities and a BayesMinimumRisk for the prediction.
          - If "stacking_proba_bmr" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the probabilities,
            and a BayesMinimumRisk for the prediction.
          - If "majority_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of majority_voting.
          - If "weighted_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of weighted_voting.

    pruned : bool, optional (default=False)
        Whether or not to prune the decision tree using cost-based pruning.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    `base_estimator_` : estimator
        The base estimator from which the ensemble is grown.

    `estimators_` : list of estimators
        The collection of fitted base estimators.

    `estimators_samples_` : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base estimator.

    `estimators_features_` : list of arrays
        The subset of drawn features for each base estimator.

    See also
    --------
    costcla.models.CostSensitiveDecisionTreeClassifier

    References
    ----------
    .. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
           `"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
           2015, http://arxiv.org/abs/1505.04637.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.models import CostSensitiveBaggingClassifier
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitiveBaggingClassifier()
    >>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using CostSensitiveBaggingClassifier
    >>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
    0.478964004931
    """
    def __init__(self,
                 n_estimators=10,
                 max_samples=0.5,
                 combination='majority_voting',
                 n_jobs=1,
                 verbose=False,
                 pruned=False):
        super(CostSensitiveBaggingClassifier, self).__init__(
            base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=1.0,
            bootstrap=True,
            bootstrap_features=False,
            combination=combination,
            n_jobs=n_jobs,
            random_state=None,
            verbose=verbose)
        self.pruned = pruned
class CostSensitivePastingClassifier(BaggingClassifier):
    """An example-dependent cost-sensitive pasting classifier.

    Parameters
    ----------
    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=0.5)
        The number of samples to draw from X to train each base estimator.
          - If int, then draw `max_samples` samples.
          - If float, then draw `max_samples * X.shape[0]` samples.

    combination : string, optional (default="majority_voting")
        Which combination method to use:
          - If "majority_voting" then combine by majority voting
          - If "weighted_voting" then combine by weighted voting using the
            out of bag savings as the weight for each estimator.
          - If "stacking" then a Cost Sensitive Logistic Regression is used
            to learn the combination.
          - If "stacking_proba" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the combination.
          - If "stacking_bmr" then a Cost Sensitive Logistic Regression is used
            to learn the probabilities and a BayesMinimumRisk for the prediction.
          - If "stacking_proba_bmr" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the probabilities,
            and a BayesMinimumRisk for the prediction.
          - If "majority_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of majority_voting.
          - If "weighted_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of weighted_voting.

    pruned : bool, optional (default=False)
        Whether or not to prune the decision tree using cost-based pruning.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    `base_estimator_` : estimator
        The base estimator from which the ensemble is grown.

    `estimators_` : list of estimators
        The collection of fitted base estimators.

    `estimators_samples_` : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base estimator.

    `estimators_features_` : list of arrays
        The subset of drawn features for each base estimator.

    See also
    --------
    costcla.models.CostSensitiveDecisionTreeClassifier

    References
    ----------
    .. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
           `"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
           2015, http://arxiv.org/abs/1505.04637.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.models import CostSensitivePastingClassifier
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitivePastingClassifier()
    >>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using CostSensitivePastingClassifier
    >>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
    0.479633754848
    """
    def __init__(self,
                 n_estimators=10,
                 max_samples=0.5,
                 combination='majority_voting',
                 n_jobs=1,
                 verbose=False,
                 pruned=False):
        super(CostSensitivePastingClassifier, self).__init__(
            base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=1.0,
            bootstrap=False,
            bootstrap_features=False,
            combination=combination,
            n_jobs=n_jobs,
            random_state=None,
            verbose=verbose)
        self.pruned = pruned
class CostSensitiveRandomPatchesClassifier(BaggingClassifier):
    """An example-dependent cost-sensitive random patches classifier.

    Parameters
    ----------
    n_estimators : int, optional (default=10)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default=0.5)
        The number of samples to draw from X to train each base estimator.
          - If int, then draw `max_samples` samples.
          - If float, then draw `max_samples * X.shape[0]` samples.

    max_features : int or float, optional (default=0.5)
        The number of features to draw from X to train each base estimator.
          - If int, then draw `max_features` features.
          - If float, then draw `max_features * X.shape[1]` features.

    combination : string, optional (default="majority_voting")
        Which combination method to use:
          - If "majority_voting" then combine by majority voting
          - If "weighted_voting" then combine by weighted voting using the
            out of bag savings as the weight for each estimator.
          - If "stacking" then a Cost Sensitive Logistic Regression is used
            to learn the combination.
          - If "stacking_proba" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the combination.
          - If "stacking_bmr" then a Cost Sensitive Logistic Regression is used
            to learn the probabilities and a BayesMinimumRisk for the prediction.
          - If "stacking_proba_bmr" then a Cost Sensitive Logistic Regression trained
            with the estimated probabilities is used to learn the probabilities,
            and a BayesMinimumRisk for the prediction.
          - If "majority_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of majority_voting.
          - If "weighted_bmr" then the BayesMinimumRisk algorithm is used to make the
            prediction using the predicted probabilities of weighted_voting.

    pruned : bool, optional (default=False)
        Whether or not to prune the decision tree using cost-based pruning.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    `base_estimator_` : estimator
        The base estimator from which the ensemble is grown.

    `estimators_` : list of estimators
        The collection of fitted base estimators.

    `estimators_samples_` : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base estimator.

    `estimators_features_` : list of arrays
        The subset of drawn features for each base estimator.

    See also
    --------
    costcla.models.CostSensitiveDecisionTreeClassifier

    References
    ----------
    .. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
           `"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
           2015, http://arxiv.org/abs/1505.04637.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import train_test_split
    >>> from costcla.datasets import load_creditscoring1
    >>> from costcla.models import CostSensitiveRandomPatchesClassifier
    >>> from costcla.metrics import savings_score
    >>> data = load_creditscoring1()
    >>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
    >>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
    >>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
    >>> f = CostSensitiveRandomPatchesClassifier(combination='weighted_voting')
    >>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
    >>> # Savings using only RandomForest
    >>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
    0.12454256594
    >>> # Savings using CostSensitiveRandomPatchesClassifier
    >>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
    0.499548618518
    """
    def __init__(self,
                 n_estimators=10,
                 max_samples=0.5,
                 max_features=0.5,
                 combination='majority_voting',
                 n_jobs=1,
                 verbose=False,
                 pruned=False):
        super(CostSensitiveRandomPatchesClassifier, self).__init__(
            base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            bootstrap=False,
            bootstrap_features=False,
            combination=combination,
            n_jobs=n_jobs,
            random_state=None,
            verbose=verbose)
        self.pruned = pruned
# TODO: not working in parallel, and no error is raised
# from costcla.datasets import load_creditscoring1
# data = load_creditscoring1()
# x = data.data
# y = data.target
# c = data.cost_mat
#
# print('start')
# f = BaggingClassifier(n_estimators=10, verbose=100, n_jobs=2)
# f.fit(x[0:1000], y[0:1000], c[0:1000])
# print('predict proba')
# f.__setattr__('n_jobs', 4)
# f.predict(x)
# print('predict END')
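
# A small smoke test for the parallel issue noted in the TODO above; this is
# an added sketch, not part of the original module. It assumes that, for a
# deterministic combination method, predictions from the parallel and
# sequential paths of the same fitted ensemble should match, so a
# disagreement would surface the silent n_jobs failure.
if __name__ == "__main__":
    import numpy as np
    from costcla.datasets import load_creditscoring1

    data = load_creditscoring1()
    x, y, c = data.data[:1000], data.target[:1000], data.cost_mat[:1000]

    f = CostSensitiveBaggingClassifier(n_estimators=10, n_jobs=2)
    f.fit(x, y, c)
    y_parallel = f.predict(x)

    # Re-predict with a single job on the same fitted ensemble.
    f.n_jobs = 1
    y_sequential = f.predict(x)
    print("parallel/sequential agree:", np.array_equal(y_parallel, y_sequential))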