# Source code for costcla.datasets.base

"""
Base IO code for all datasets
https://github.com/scikit-learn/scikit-learn/blob/56057c9630dd13f3c61fbb4c7debdff6ba8e9e8c/sklearn/datasets/base.py
"""

# Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
#               2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
#               2010 Olivier Grisel <olivier.grisel@ensta.org>
#               2014 Alejandro CORREA BAHNSEN <al.bahnsen@gmail.com>
# License: BSD 3 clause

from os.path import dirname
from os.path import join
import numpy as np
import pandas as pd


class Bunch(dict):
    """Dictionary subclass whose keys are also reachable as attributes.

    Lightweight container returned by the dataset loaders in this module:
    ``bunch.data`` and ``bunch['data']`` refer to the same object.
    """

    def __init__(self, **kwargs):
        super().__init__(kwargs)
        # Point the attribute namespace at the dict itself so attribute
        # access and item access share one storage.
        self.__dict__ = self


def load_bankmarketing(cost_mat_parameters=None):
    """Load and return the bank marketing dataset (classification).

    The bank marketing is an easily transformable example-dependent
    cost-sensitive classification dataset.

    Parameters
    ----------
    cost_mat_parameters : Dictionary-like object, optional (default=None)
        If not None, must include 'per_balance', 'ca', and 'int_r'

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'cost_mat', the cost matrix of each example,
        'target_names', the meaning of the labels,
        'feature_names', the meaning of the features,
        and 'DESCR', the full description of the dataset.

    References
    ----------
    .. [1] A. Correa Bahnsen, A. Stojanovic, D. Aouada, B. Ottersten,
           "Improving Credit Card Fraud Detection with Calibrated Probabilities",
           in Proceedings of the fourteenth SIAM International Conference on
           Data Mining, 677-685, 2014.

    Examples
    --------
    Let's say you are interested in the samples 10, 25, and 319

    >>> from costcla.datasets import load_bankmarketing
    >>> data = load_bankmarketing()
    >>> data.target[[10, 25, 319]]
    array([0, 0, 1])
    >>> data.cost_mat[[10, 25, 319]]
    array([[ 1.        ,  1.66274977,  1.        ,  0.        ],
           [ 1.        ,  1.63195811,  1.        ,  0.        ],
           [ 1.        ,  5.11141597,  1.        ,  0.        ]])
    """
    module_path = dirname(__file__)
    raw_data = pd.read_csv(join(module_path, 'data', 'bankmarketing.csv.gz'),
                           delimiter=';', compression='gzip')
    # Fix: read the description through a context manager instead of
    # leaking the file handle.
    with open(join(module_path, 'descr', 'bankmarketing.rst')) as descr_file:
        descr = descr_file.read()

    # Only features known before the contact are used:
    #  1 - age (numeric)
    #  2 - job : type of job (categorical)
    #  3 - marital status (categorical; "divorced" means divorced or widowed)
    #  4 - education (categorical)
    #  5 - default: has credit in default? (binary)
    #  6 - balance: average yearly balance, in euros (numeric)
    #  7 - housing: has housing loan? (binary)
    #  8 - loan: has personal loan? (binary)
    # 15 - previous: number of contacts performed before this campaign (numeric)
    # 16 - poutcome: outcome of the previous marketing campaign (categorical)
    # The following features are excluded because they are collected after
    # the contact event: contact, day, month, duration, campaign, pdays.

    # Keep only clients with a positive average balance.
    raw_data = raw_data.loc[raw_data['balance'] > 0]

    n_samples = raw_data.shape[0]
    # Fix: use the builtin int; the np.int alias was removed in NumPy >= 1.24.
    target = np.zeros((n_samples,), dtype=int)
    target[raw_data['y'].values == 'yes'] = 1

    # Numeric features plus one-hot encoded categorical features.
    data = raw_data[['age', 'balance', 'previous']]
    cols_dummies = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'poutcome']
    for col_ in cols_dummies:
        temp_ = pd.get_dummies(raw_data[col_], prefix=col_)
        data = data.join(temp_)

    # Calculate cost_mat (see [1])
    if cost_mat_parameters is None:
        cost_mat_parameters = {'per_balance': 0.25, 'ca': 1, 'int_r': 0.02463333}

    per_balance = cost_mat_parameters['per_balance']
    ca = cost_mat_parameters['ca']
    int_r = cost_mat_parameters['int_r']

    cost_mat = np.zeros((n_samples, 4))  # cost_mat[FP,FN,TP,TN]
    cost_mat[:, 0] = ca
    # C_FN >= C_TN (Elkan): losing a client costs at least the
    # administrative cost of one contact.
    cost_mat[:, 1] = np.maximum(data['balance'].values * int_r * per_balance, ca)
    cost_mat[:, 2] = ca
    cost_mat[:, 3] = 0.0

    return Bunch(data=data.values, target=target, cost_mat=cost_mat,
                 target_names=['no', 'yes'], DESCR=descr,
                 feature_names=data.columns.values, name='DirectMarketing')
def load_creditscoring1(cost_mat_parameters=None):
    """Load and return the credit scoring Kaggle Credit competition dataset (classification).

    The credit scoring is an easily transformable example-dependent
    cost-sensitive classification dataset.

    Parameters
    ----------
    cost_mat_parameters : Dictionary-like object, optional (default=None)
        If not None, must include 'int_r', 'int_cf', 'cl_max', 'n_term',
        'k' and 'lgd'

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'cost_mat', the cost matrix of each example,
        'target_names', the meaning of the labels,
        'feature_names', the meaning of the features,
        and 'DESCR', the full description of the dataset.

    References
    ----------
    .. [1] A. Correa Bahnsen, D. Aouada, B. Ottersten,
           "Example-Dependent Cost-Sensitive Logistic Regression for Credit
           Scoring", in Proceedings of the International Conference on Machine
           Learning and Applications, 2014.

    Examples
    --------
    Let's say you are interested in the samples 10, 17, and 400

    >>> from costcla.datasets import load_creditscoring1
    >>> data = load_creditscoring1()
    >>> data.target[[10, 17, 400]]
    array([0, 1, 0])
    >>> data.cost_mat[[10, 17, 400]]
    array([[ 1023.73054104,  18750.        ,  0.        ,  0.        ],
           [  717.25781516,   6749.25      ,  0.        ,  0.        ],
           [ 1004.32819923,  17990.25      ,  0.        ,  0.        ]])
    """
    module_path = dirname(__file__)
    raw_data = pd.read_csv(join(module_path, 'data', 'creditscoring1.csv.gz'),
                           delimiter=',', compression='gzip')
    # Fix: read the description through a context manager instead of
    # leaking the file handle.
    with open(join(module_path, 'descr', 'creditscoring1.rst')) as descr_file:
        descr = descr_file.read()

    # Exclude rows with missing values, MonthlyIncome <= 0, or DebtRatio >= 1.
    raw_data = raw_data.dropna()
    raw_data = raw_data.loc[(raw_data['MonthlyIncome'] > 0)]
    raw_data = raw_data.loc[(raw_data['DebtRatio'] < 1)]

    # Fix: use the builtin int; the np.int alias was removed in NumPy >= 1.24.
    target = raw_data['SeriousDlqin2yrs'].values.astype(int)
    # Fix: pass axis as a keyword; positional axis was removed in pandas 2.0.
    data = raw_data.drop(['SeriousDlqin2yrs', 'id'], axis=1)

    # Calculate cost_mat (see [1])
    if cost_mat_parameters is None:
        cost_mat_parameters = {'int_r': 0.0479 / 12,
                               'int_cf': 0.0294 / 12,
                               'cl_max': 25000,
                               'n_term': 24,
                               'k': 3,
                               'lgd': .75}

    pi_1 = target.mean()
    cost_mat = _creditscoring_costmat(data['MonthlyIncome'].values,
                                      data['DebtRatio'].values,
                                      pi_1, cost_mat_parameters)

    return Bunch(data=data.values, target=target, cost_mat=cost_mat,
                 target_names=['no', 'yes'], DESCR=descr,
                 feature_names=data.columns.values,
                 name='CreditScoring_Kaggle2011')
def load_creditscoring2(cost_mat_parameters=None):
    """Load and return the credit scoring PAKDD 2009 competition dataset (classification).

    The credit scoring is an easily transformable example-dependent
    cost-sensitive classification dataset.

    Parameters
    ----------
    cost_mat_parameters : Dictionary-like object, optional (default=None)
        If not None, must include 'int_r', 'int_cf', 'cl_max', 'n_term',
        'k' and 'lgd'

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'data', the data to learn, 'target', the classification labels,
        'cost_mat', the cost matrix of each example,
        'target_names', the meaning of the labels,
        'feature_names', the meaning of the features,
        and 'DESCR', the full description of the dataset.

    References
    ----------
    .. [1] A. Correa Bahnsen, D. Aouada, B. Ottersten,
           "Example-Dependent Cost-Sensitive Logistic Regression for Credit
           Scoring", in Proceedings of the International Conference on Machine
           Learning and Applications, 2014.

    Examples
    --------
    Let's say you are interested in the samples 10, 17, and 50

    >>> from costcla.datasets import load_creditscoring2
    >>> data = load_creditscoring2()
    >>> data.target[[10, 17, 50]]
    array([1, 0, 0])
    >>> data.cost_mat[[10, 17, 50]]
    array([[ 209.   ,  547.965,    0.   ,    0.   ],
           [  24.   ,  274.725,    0.   ,    0.   ],
           [  89.   ,  371.25 ,    0.   ,    0.   ]])
    """
    module_path = dirname(__file__)
    raw_data = pd.read_csv(join(module_path, 'data', 'creditscoring2.csv.gz'),
                           delimiter='\t', compression='gzip')
    # Fix: read the description through a context manager instead of
    # leaking the file handle.
    with open(join(module_path, 'descr', 'creditscoring2.rst')) as descr_file:
        descr = descr_file.read()

    # Exclude rows whose label column holds the placeholder 'N'.
    raw_data = raw_data.loc[raw_data['TARGET_LABEL_BAD=1'] != 'N']

    # Keep only 100 < PERSONAL_NET_INCOME < 10000.
    # Fix: use the builtin float; the np.float alias was removed in
    # NumPy >= 1.24 (same for np.int below).
    raw_data = raw_data.loc[(raw_data['PERSONAL_NET_INCOME'].values.astype(float) > 100)]
    raw_data = raw_data.loc[(raw_data['PERSONAL_NET_INCOME'].values.astype(float) < 10000)]

    target = raw_data['TARGET_LABEL_BAD=1'].values.astype(int)

    # Continuous features
    cols_con = ['ID_SHOP', 'AGE', 'AREA_CODE_RESIDENCIAL_PHONE', 'PAYMENT_DAY',
                'SHOP_RANK', 'MONTHS_IN_RESIDENCE', 'MONTHS_IN_THE_JOB',
                'PROFESSION_CODE', 'MATE_INCOME',
                'QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION',
                'PERSONAL_NET_INCOME']
    data = raw_data[cols_con].astype(float)

    # One-hot encode the categorical features.
    cols_dummies = ['SEX', 'MARITAL_STATUS', 'FLAG_RESIDENCIAL_PHONE',
                    'RESIDENCE_TYPE', 'FLAG_MOTHERS_NAME', 'FLAG_FATHERS_NAME',
                    'FLAG_RESIDENCE_TOWN_eq_WORKING_TOWN',
                    'FLAG_RESIDENCE_STATE_eq_WORKING_STATE',
                    'FLAG_RESIDENCIAL_ADDRESS_eq_POSTAL_ADDRESS']
    for col_ in cols_dummies:
        temp_ = pd.get_dummies(raw_data[col_], prefix=col_)
        data = data.join(temp_)

    # Calculate cost_mat (see [1])
    if cost_mat_parameters is None:
        cost_mat_parameters = {'int_r': 0.63 / 12,
                               'int_cf': 0.165 / 12,
                               'cl_max': 25000 * 0.33,
                               'n_term': 24,
                               'k': 3,
                               'lgd': .75}

    n_samples = data.shape[0]
    pi_1 = target.mean()
    # Only a fraction (0.33) of the personal income is assumed available
    # to service the credit.
    monthly_income = data['PERSONAL_NET_INCOME'].values * 0.33
    cost_mat = _creditscoring_costmat(monthly_income, np.zeros(n_samples),
                                      pi_1, cost_mat_parameters)

    return Bunch(data=data.values, target=target, cost_mat=cost_mat,
                 target_names=['no', 'yes'], DESCR=descr,
                 feature_names=data.columns.values,
                 name='CreditScoring_PAKDD2009')
def _creditscoring_costmat(income, debt, pi_1, cost_mat_parameters):
    """Private function to calculate the cost matrix of credit scoring models.

    Parameters
    ----------
    income : array of shape = [n_samples]
        Monthly income of each example
    debt : array of shape = [n_samples]
        Debt ratio of each example
    pi_1 : float
        Percentage of positives in the training set
    cost_mat_parameters : dict
        Must contain 'k', 'int_r', 'n_term', 'int_cf', 'lgd' and 'cl_max'.

    Returns
    -------
    cost_mat : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem. The columns are the
        costs of: false positives, false negatives, true positives and
        true negatives, for each example.

    References
    ----------
    .. [1] A. Correa Bahnsen, D. Aouada, B. Ottersten,
           "Example-Dependent Cost-Sensitive Logistic Regression for Credit
           Scoring", in Proceedings of the International Conference on Machine
           Learning and Applications, 2014.
    """
    def _annuity(principal, rate, terms):
        # Periodic payment of an annuity of `principal` at `rate` over `terms`.
        return principal * ((rate * (1 + rate) ** terms) / ((1 + rate) ** terms - 1))

    def _present_value(payment, rate, terms):
        # Present value of `terms` payments of `payment` discounted at `rate`.
        return payment / rate * (1 - 1 / (1 + rate) ** terms)

    def _credit_line(k, inc_i, cl_max, debt_i, int_r, n_term):
        # Credit line Cl: capped by k times the income, the global maximum,
        # and the amount serviceable given the client's current debt ratio.
        cl_k = k * inc_i
        a = _annuity(cl_k, int_r, n_term)
        cl_debt = _present_value(inc_i * min(a / inc_i, 1 - debt_i), int_r, n_term)
        return min(cl_k, cl_max, cl_debt)

    def _cost_fn(cl_i, lgd):
        # Missing a defaulter costs the exposed credit line times the
        # loss given default.
        return cl_i * lgd

    def _cost_fp(cl_i, int_r, n_term, int_cf, pi_1, lgd, cl_avg):
        # Rejecting a good client costs the forgone profit on their own
        # line, net of the expected result of an average replacement client.
        r_i = _present_value(_annuity(cl_i, int_r, n_term), int_cf, n_term) - cl_i
        r_avg = _present_value(_annuity(cl_avg, int_r, n_term), int_cf, n_term) - cl_avg
        return max(0, r_i - (1 - pi_1) * r_avg + pi_1 * _cost_fn(cl_avg, lgd))

    # Parameters
    k = cost_mat_parameters['k']
    int_r = cost_mat_parameters['int_r']
    n_term = cost_mat_parameters['n_term']
    int_cf = cost_mat_parameters['int_cf']
    lgd = cost_mat_parameters['lgd']
    cl_max = cost_mat_parameters['cl_max']

    # Per-example credit line and its portfolio average.
    cl = np.vectorize(_credit_line)(k, income, cl_max, debt, int_r, n_term)
    cl_avg = cl.mean()

    n_samples = income.shape[0]
    cost_mat = np.zeros((n_samples, 4))  # cost_mat[FP,FN,TP,TN]
    cost_mat[:, 0] = np.vectorize(_cost_fp)(cl, int_r, n_term, int_cf,
                                            pi_1, lgd, cl_avg)
    cost_mat[:, 1] = np.vectorize(_cost_fn)(cl, lgd)
    cost_mat[:, 2] = 0.0
    cost_mat[:, 3] = 0.0
    return cost_mat