"""
This module includes the cost-sensitive ensemble methods.
"""
# Authors: Alejandro Correa Bahnsen <al.bahnsen@gmail.com>
# License: BSD 3 clause
from sklearn.model_selection import train_test_split
from ..models import CostSensitiveDecisionTreeClassifier
from ..models.bagging import BaggingClassifier
class CostSensitiveRandomForestClassifier(BaggingClassifier):
"""A example-dependent cost-sensitive random forest classifier.
Parameters
----------
n_estimators : int, optional (default=10)
The number of base estimators in the ensemble.
combination : string, optional (default="majority_voting")
Which combination method to use:
- If "majority_voting", then combine by majority voting.
- If "weighted_voting", then combine by weighted voting, using the
out-of-bag savings as the weight for each estimator.
- If "stacking", then a Cost Sensitive Logistic Regression is used
to learn the combination.
- If "stacking_proba", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the combination.
- If "stacking_bmr", then a Cost Sensitive Logistic Regression is used
to learn the probabilities, and a BayesMinimumRisk for the prediction.
- If "stacking_proba_bmr", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the probabilities,
and a BayesMinimumRisk for the prediction.
- If "majority_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of majority_voting.
- If "weighted_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of weighted_voting.
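For example, a minimal sketch (assuming the arrays ``X_train``, ``y_train``,
``X_test`` and ``cost_mat_train`` built as in the Examples section below)
that combines the trees by weighted voting:
>>> f = CostSensitiveRandomForestClassifier(combination='weighted_voting')
>>> y_pred = f.fit(X_train, y_train, cost_mat_train).predict(X_test)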
max_features : int, float, string or None, optional (default="auto")
The number of features to consider when looking for the best split in each tree:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires
effectively inspecting more than ``max_features`` features.
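For example, a minimal sketch of setting this parameter (hypothetical values):
>>> # consider about sqrt(n_features) candidate features at each split
>>> f = CostSensitiveRandomForestClassifier(max_features='sqrt')
>>> # or a fixed fraction, i.e. int(0.5 * n_features) features per split
>>> f = CostSensitiveRandomForestClassifier(max_features=0.5)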
pruned : bool, optional (default=False)
Whether or not to prune the decision tree using cost-based pruning.
n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
Attributes
----------
`base_estimator_` : estimator
The base estimator from which the ensemble is grown.
`estimators_`: list of estimators
The collection of fitted base estimators.
`estimators_samples_`: list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
`estimators_features_`: list of arrays
The subset of drawn features for each base estimator.
See also
--------
costcla.models.CostSensitiveDecisionTreeClassifier
References
----------
.. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
`"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
2015, http://arxiv.org/abs/1505.04637.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import CostSensitiveRandomForestClassifier
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
>>> f = CostSensitiveRandomForestClassifier()
>>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using CostSensitiveRandomForestClassifier
>>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
0.499390945808
"""
def __init__(self,
n_estimators=10,
combination='majority_voting',
max_features='auto',
n_jobs=1,
verbose=0,
pruned=False):
super(CostSensitiveRandomForestClassifier, self).__init__(
base_estimator=CostSensitiveDecisionTreeClassifier(max_features=max_features, pruned=pruned),
n_estimators=n_estimators,
max_samples=1.0,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
combination=combination,
n_jobs=n_jobs,
random_state=None,
verbose=verbose)
self.pruned = pruned
class CostSensitiveBaggingClassifier(BaggingClassifier):
"""A example-dependent cost-sensitive bagging classifier.
Parameters
----------
n_estimators : int, optional (default=10)
The number of base estimators in the ensemble.
max_samples : int or float, optional (default=0.5)
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
combination : string, optional (default="majority_voting")
Which combination method to use:
- If "majority_voting", then combine by majority voting.
- If "weighted_voting", then combine by weighted voting, using the
out-of-bag savings as the weight for each estimator.
- If "stacking", then a Cost Sensitive Logistic Regression is used
to learn the combination.
- If "stacking_proba", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the combination.
- If "stacking_bmr", then a Cost Sensitive Logistic Regression is used
to learn the probabilities, and a BayesMinimumRisk for the prediction.
- If "stacking_proba_bmr", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the probabilities,
and a BayesMinimumRisk for the prediction.
- If "majority_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of majority_voting.
- If "weighted_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of weighted_voting.
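For example, a minimal sketch (assuming the arrays ``X_train``, ``y_train``,
``X_test`` and ``cost_mat_train`` built as in the Examples section below)
that learns the combination by stacking:
>>> f = CostSensitiveBaggingClassifier(combination='stacking')
>>> y_pred = f.fit(X_train, y_train, cost_mat_train).predict(X_test)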
pruned : bool, optional (default=False)
Whether or not to prune the decision tree using cost-based pruning.
n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
Attributes
----------
`base_estimator_` : estimator
The base estimator from which the ensemble is grown.
`estimators_`: list of estimators
The collection of fitted base estimators.
`estimators_samples_`: list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
`estimators_features_`: list of arrays
The subset of drawn features for each base estimator.
See also
--------
costcla.models.CostSensitiveDecisionTreeClassifier
References
----------
.. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
`"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
2015, http://arxiv.org/abs/1505.04637.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import CostSensitiveBaggingClassifier
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
>>> f = CostSensitiveBaggingClassifier()
>>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using CostSensitiveBaggingClassifier
>>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
0.478964004931
"""
def __init__(self,
n_estimators=10,
max_samples=0.5,
combination='majority_voting',
n_jobs=1,
verbose=0,
pruned=False):
super(CostSensitiveBaggingClassifier, self).__init__(
base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
n_estimators=n_estimators,
max_samples=max_samples,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
combination=combination,
n_jobs=n_jobs,
random_state=None,
verbose=verbose)
self.pruned = pruned
class CostSensitivePastingClassifier(BaggingClassifier):
"""A example-dependent cost-sensitive pasting classifier.
Parameters
----------
n_estimators : int, optional (default=10)
The number of base estimators in the ensemble.
max_samples : int or float, optional (default=0.5)
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
combination : string, optional (default="majority_voting")
Which combination method to use:
- If "majority_voting", then combine by majority voting.
- If "weighted_voting", then combine by weighted voting, using the
out-of-bag savings as the weight for each estimator.
- If "stacking", then a Cost Sensitive Logistic Regression is used
to learn the combination.
- If "stacking_proba", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the combination.
- If "stacking_bmr", then a Cost Sensitive Logistic Regression is used
to learn the probabilities, and a BayesMinimumRisk for the prediction.
- If "stacking_proba_bmr", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the probabilities,
and a BayesMinimumRisk for the prediction.
- If "majority_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of majority_voting.
- If "weighted_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of weighted_voting.
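For example, a minimal sketch (assuming the arrays ``X_train``, ``y_train``,
``X_test`` and ``cost_mat_train`` built as in the Examples section below)
that stacks on the estimated probabilities:
>>> f = CostSensitivePastingClassifier(combination='stacking_proba')
>>> y_pred = f.fit(X_train, y_train, cost_mat_train).predict(X_test)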
pruned : bool, optional (default=False)
Whether or not to prune the decision tree using cost-based pruning.
n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
Attributes
----------
`base_estimator_` : estimator
The base estimator from which the ensemble is grown.
`estimators_`: list of estimators
The collection of fitted base estimators.
`estimators_samples_`: list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
`estimators_features_`: list of arrays
The subset of drawn features for each base estimator.
See also
--------
costcla.models.CostSensitiveDecisionTreeClassifier
References
----------
.. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
`"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
2015, http://arxiv.org/abs/1505.04637.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import CostSensitivePastingClassifier
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
>>> f = CostSensitivePastingClassifier()
>>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using CostSensitivePastingClassifier
>>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
0.479633754848
"""
def __init__(self,
n_estimators=10,
max_samples=0.5,
combination='majority_voting',
n_jobs=1,
verbose=0,
pruned=False):
super(CostSensitivePastingClassifier, self).__init__(
base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
n_estimators=n_estimators,
max_samples=max_samples,
max_features=1.0,
bootstrap=False,
bootstrap_features=False,
combination=combination,
n_jobs=n_jobs,
random_state=None,
verbose=verbose)
self.pruned = pruned
class CostSensitiveRandomPatchesClassifier(BaggingClassifier):
"""A example-dependent cost-sensitive pasting classifier.
Parameters
----------
n_estimators : int, optional (default=10)
The number of base estimators in the ensemble.
max_samples : int or float, optional (default=0.5)
The number of samples to draw from X to train each base estimator.
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
max_features : int or float, optional (default=0.5)
The number of features to draw from X to train each base estimator.
- If int, then draw `max_features` features.
- If float, then draw `max_features * X.shape[1]` features.
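For instance (hypothetical shapes), with an ``X`` of shape (1000, 20),
``max_samples=0.5`` draws 500 samples and ``max_features=0.5`` draws 10
features for each base estimator:
>>> f = CostSensitiveRandomPatchesClassifier(max_samples=0.5, max_features=0.5)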
combination : string, optional (default="majority_voting")
Which combination method to use:
- If "majority_voting", then combine by majority voting.
- If "weighted_voting", then combine by weighted voting, using the
out-of-bag savings as the weight for each estimator.
- If "stacking", then a Cost Sensitive Logistic Regression is used
to learn the combination.
- If "stacking_proba", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the combination.
- If "stacking_bmr", then a Cost Sensitive Logistic Regression is used
to learn the probabilities, and a BayesMinimumRisk for the prediction.
- If "stacking_proba_bmr", then a Cost Sensitive Logistic Regression trained
with the estimated probabilities is used to learn the probabilities,
and a BayesMinimumRisk for the prediction.
- If "majority_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of majority_voting.
- If "weighted_bmr", then the BayesMinimumRisk algorithm is used to make the
prediction, using the predicted probabilities of weighted_voting.
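For example, a minimal sketch (assuming the arrays ``X_train``, ``y_train``,
``X_test`` and ``cost_mat_train`` built as in the Examples section below)
that learns the combination by stacking:
>>> f = CostSensitiveRandomPatchesClassifier(combination='stacking')
>>> y_pred = f.fit(X_train, y_train, cost_mat_train).predict(X_test)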
pruned : bool, optional (default=False)
Whether or not to prune the decision tree using cost-based pruning.
n_jobs : int, optional (default=1)
The number of jobs to run in parallel for both `fit` and `predict`.
If -1, then the number of jobs is set to the number of cores.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
Attributes
----------
`base_estimator_` : estimator
The base estimator from which the ensemble is grown.
`estimators_`: list of estimators
The collection of fitted base estimators.
`estimators_samples_`: list of arrays
The subset of drawn samples (i.e., the in-bag samples) for each base
estimator.
`estimators_features_`: list of arrays
The subset of drawn features for each base estimator.
See also
--------
costcla.models.CostSensitiveDecisionTreeClassifier
References
----------
.. [1] Correa Bahnsen, A., Aouada, D., & Ottersten, B.
`"Ensemble of Example-Dependent Cost-Sensitive Decision Trees" <http://arxiv.org/abs/1505.04637>`__,
2015, http://arxiv.org/abs/1505.04637.
Examples
--------
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import train_test_split
>>> from costcla.datasets import load_creditscoring1
>>> from costcla.models import CostSensitiveRandomPatchesClassifier
>>> from costcla.metrics import savings_score
>>> data = load_creditscoring1()
>>> sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=0)
>>> X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
>>> y_pred_test_rf = RandomForestClassifier(random_state=0).fit(X_train, y_train).predict(X_test)
>>> f = CostSensitiveRandomPatchesClassifier(combination='weighted_voting')
>>> y_pred_test_csdt = f.fit(X_train, y_train, cost_mat_train).predict(X_test)
>>> # Savings using only RandomForest
>>> print(savings_score(y_test, y_pred_test_rf, cost_mat_test))
0.12454256594
>>> # Savings using CostSensitiveRandomPatchesClassifier
>>> print(savings_score(y_test, y_pred_test_csdt, cost_mat_test))
0.499548618518
"""
def __init__(self,
n_estimators=10,
max_samples=0.5,
max_features=0.5,
combination='majority_voting',
n_jobs=1,
verbose=0,
pruned=False):
super(CostSensitiveRandomPatchesClassifier, self).__init__(
base_estimator=CostSensitiveDecisionTreeClassifier(pruned=pruned),
n_estimators=n_estimators,
max_samples=max_samples,
max_features=max_features,
bootstrap=False,
bootstrap_features=False,
combination=combination,
n_jobs=n_jobs,
random_state=None,
verbose=verbose)
self.pruned = pruned
# TODO: parallel execution (n_jobs > 1) is not working, and it fails without raising an error.
# from costcla.datasets import load_creditscoring1
# data = load_creditscoring1()
# x = data.data
# y = data.target
# c = data.cost_mat
#
# print('start')
# f = BaggingClassifier(n_estimators=10, verbose=100, n_jobs=2)
# f.fit(x[0:1000], y[0:1000], c[0:1000])
# print('predict proba')
# f.n_jobs = 4
# f.predict(x)
# print('predict END')