import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, Normalizer, scale
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix, log_loss, auc, roc_curve, roc_auc_score, recall_score, precision_recall_curve
from sklearn.metrics import make_scorer, precision_score, fbeta_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
import datetime
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
seed = 999
creditcard = pd.read_csv('creditcard.csv')
creditcard.columns = [x.lower() for x in creditcard.columns]
creditcard.rename(columns = {'class': 'fraud'}, inplace = True)
0. Data Preparation
Read in the data. Since the features come from a PCA transformation, there is no missing-data issue. Then we normalize the amount column.
# 1. Split Test Data Out
creditcard.drop(columns = 'time', inplace = True)
# Normalize the 'amount' column
scaler = StandardScaler()
creditcard['amount'] = scaler.fit_transform(creditcard['amount'].values.reshape(-1, 1))
# creditcard.drop(columns = 'amount', inplace = True)
X = creditcard.iloc[:, :-1]
y = creditcard.iloc[:, -1]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .33, stratify = y, random_state = seed)
Modeling Part 3: Ensembling (Stacking) Models
In Part I and Part II, we have tested the Logistic Regression and Random Forest models on this imbalanced data.
Here we will try ensembling models, which combine the predictions of multiple machine learning models. The basic idea is to fit several base models on the data and then train another model on their predictions. Since the base model outputs become the input of the new model, they are most useful when they are both strong and not highly correlated: if the base predictions are highly correlated, combining them will not improve much, because the inputs carry largely the same information (a quick way to check this is sketched after the step list below). Ensembling / stacking models have these advantages:
1. They can often beat state-of-the-art single models.
2. Each base model can be simple and built quickly.
Here we will do these things:
1. Prepare a series of base models: RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, and LinearSVC models.
2. Use GridSearchCV to select the hyperparameters for these candidate models
3. Use the output from the base models as input to build a new model
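As a quick check on the "less correlated" point, once the base models in section 1.2 are fit, their test-set predictions can be collected into a DataFrame and their pairwise correlation inspected. A minimal sketch, assuming the grid_searches dict built later in this notebook and the existing Xtest:

# assumes grid_searches (section 1.2) and Xtest already exist
base_preds = pd.DataFrame({key: gs.predict(Xtest) for key, gs in grid_searches.items()})
# pairwise correlation of the base predictions; values near 1 mean the models
# make almost the same calls, so stacking them will add little
print(base_preds.corr())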
1. More models with GridSearchCV to Select Hyperparameters
1.1. Prepare base models
We will use these models: RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, and LinearSVC.
seed = 999
# how many times to repeat the positive (fraud) class so it roughly matches the negative class
oversample_ratio = int(sum(ytrain == 0) / sum(ytrain == 1))
# repeat the positive rows of X and y
ytrain_pos_oversample = pd.concat([ytrain[ytrain == 1]] * oversample_ratio, axis = 0)
Xtrain_pos_oversample = pd.concat([Xtrain.loc[ytrain == 1, :]] * oversample_ratio, axis = 0)
# concat the repeated data with the original data
ytrain_oversample = pd.concat([ytrain, ytrain_pos_oversample], axis = 0).reset_index(drop = True)
Xtrain_oversample = pd.concat([Xtrain, Xtrain_pos_oversample], axis = 0).reset_index(drop = True)
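The same oversampling can also be written with sklearn.utils.resample. A minimal sketch under the same idea of resampling the positive class up to the size of the negative class (the _alt names are used here only to avoid overwriting the variables above):

from sklearn.utils import resample

n_neg = int((ytrain == 0).sum())
# resample the positive rows with replacement until they match the negative count
Xtrain_pos_rs, ytrain_pos_rs = resample(
    Xtrain.loc[ytrain == 1, :], ytrain[ytrain == 1],
    replace = True, n_samples = n_neg, random_state = seed)
Xtrain_oversample_alt = pd.concat([Xtrain, Xtrain_pos_rs], axis = 0).reset_index(drop = True)
ytrain_oversample_alt = pd.concat([ytrain, ytrain_pos_rs], axis = 0).reset_index(drop = True)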
models = {
    'rf': RandomForestClassifier(random_state = seed, warm_start = True),     # 0:17:24.925482
    'et': ExtraTreesClassifier(random_state = seed, warm_start = True),       # 0:02:45.797856
    'ada': AdaBoostClassifier(random_state = seed),                           # 0:05:17.973671
    'gb': GradientBoostingClassifier(random_state = seed, warm_start = True), # 0:54:11.773175
    'svc': LinearSVC(random_state = seed) }                                   # 0:02:01.656640
gv_parameters = {
    'rf': {'n_estimators': [50, 100, 200, 500], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf': [10, 20, 50]},
    'et': {'n_estimators': [50, 100, 200, 500], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf': [10, 20, 50]},
    'ada': {'n_estimators': [50, 100, 200, 500], 'learning_rate': [0.75, 1, 1.5]},
    'gb': {'n_estimators': [50, 100, 200, 500], 'max_depth': [10, 20, 50, 100], 'min_samples_leaf': [10, 20, 50]},
    'svc': {'C': np.power(5.0, np.arange(-3, 3))} }
1.2. GridSearch to find the best hyperparameters
For each model, GridSearchCV will loop through all the hyperparameter combinations and pick the one that gives the best performance metric. This can be time consuming if the data is big or if the model is slow (like SVM) or complicated.
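The loop below scores each grid search with wt_loss_score, a custom scorer carried over from the earlier parts of this series; its original definition is not repeated here. A plausible stand-in that likewise favours recall over precision is an F2 scorer built with make_scorer (an assumption, not the original definition):

# stand-in for the custom wt_loss_score used below; the original definition
# comes from Part I / Part II of this series
wt_loss_score = make_scorer(fbeta_score, beta = 2)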
grid_searches = {}
for key in models.keys():
    print("---------------------------------Running GridSearchCV for %s. ---------------" % key)
    model = models[key]
    params = gv_parameters[key]
    gs = GridSearchCV(model, params, cv = 3, n_jobs = 70, verbose = 3, scoring = wt_loss_score)
    ts = datetime.datetime.now()
    gs.fit(Xtrain_oversample, ytrain_oversample)
    te = datetime.datetime.now()
    print("run time for " + str(key) + " is: " + str(te - ts))  # end minus start
    grid_searches[key] = gs
# to get the summary information of the model results from GridSearchCV on all parameter combinations
def grid_cv_summary(grid_searches):
    cv_results = pd.DataFrame()
    for key in grid_searches.keys():
        model_result = pd.DataFrame(grid_searches[key].cv_results_)
        model_result['model'] = key
        cv_results = pd.concat([cv_results, model_result], axis = 0)
    return cv_results
cv_sum = grid_cv_summary(grid_searches)
cv_grp = cv_sum.groupby('model')
# for each model, get the params of the row with the highest mean_test_score
best_params = cv_grp.apply(lambda x: x[x.mean_test_score == x.mean_test_score.max()]).params
best_params = dict(zip(best_params.index.get_level_values(0), best_params.values))
1.3. Base models performance
For each base model, we print its recall, precision, and roc_auc scores on the test data together with the confusion matrix. If we only look at ROC AUC, LinearSVC gives the best single model, although its precision is much lower than the other models'.
for key in grid_searches.keys():
    pred_test = grid_searches[key].predict(Xtest)
    print("\n\n\n----------- For %s, the metrics on TEST data is: ---------- \n" % key)
    print("recall score on test data is %s" % str(recall_score(ytest, pred_test)))
    print("precision score on test data is %s" % str(precision_score(ytest, pred_test)))
    print("roc_auc score on test data is %s" % str(roc_auc_score(ytest, pred_test)))
    print("confusion matrix on the test data is: \n")
    print(confusion_matrix(ytest, pred_test))
----------- For et, the metrics on TEST data is: ----------
recall score on test data is 0.845679012345679
precision score on test data is 0.7696629213483146
roc_auc score on test data is 0.9226210142996714
confusion matrix on the test data is:
[[93784 41]
[ 25 137]]
----------- For rf, the metrics on TEST data is: ----------
recall score on test data is 0.8271604938271605
precision score on test data is 0.8481012658227848
roc_auc score on test data is 0.9134523492317259
confusion matrix on the test data is:
[[93801 24]
[ 28 134]]
----------- For ada, the metrics on TEST data is: ----------
recall score on test data is 0.845679012345679
precision score on test data is 0.7828571428571428
roc_auc score on test data is 0.9226370015099032
confusion matrix on the test data is:
[[93787 38]
[ 25 137]]
----------- For svc, the metrics on TEST data is: ----------
recall score on test data is 0.9320987654320988
precision score on test data is 0.06425531914893617
roc_auc score on test data is 0.9543307576161294
confusion matrix on the test data is:
[[91626 2199]
[ 11 151]]
----------- For gb, the metrics on TEST data is: ----------
recall score on test data is 0.8271604938271605
precision score on test data is 0.8535031847133758
roc_auc score on test data is 0.9134576783018031
confusion matrix on the test data is:
[[93802 23]
[ 28 134]]
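One caveat: the roc_auc scores above are computed on hard 0/1 predictions. Scoring on continuous outputs usually gives a more faithful ROC AUC; a minimal sketch (LinearSVC has no predict_proba, so it falls back to decision_function):

for key, gs in grid_searches.items():
    est = gs.best_estimator_
    if hasattr(est, 'predict_proba'):
        scores = est.predict_proba(Xtest)[:, 1]   # probability of the fraud class
    else:
        scores = est.decision_function(Xtest)     # margin, for LinearSVC
    print("%s roc_auc on continuous scores: %s" % (key, roc_auc_score(ytest, scores)))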
1.4. Stacking
We collect the outputs from the base models and use them as input to a new model. Here the base model outputs are highly correlated, since almost all predictions are 0, so the stacked output does not improve much over the single models. We can also grid-search the meta-model to see if it can be improved; a sketch follows the stacking results below.
def multiple_pred(grid_searches, Xtrain, Xtest, ytrain, ytest):
    train_pred = {}
    test_pred = {}
    for key in grid_searches.keys():
        train_pred[key] = grid_searches[key].predict(Xtrain)
        test_pred[key] = grid_searches[key].predict(Xtest)
    return ((pd.DataFrame(train_pred), ytrain), (pd.DataFrame(test_pred), ytest))
ensemble_data = multiple_pred(grid_searches, Xtrain, Xtest, ytrain, ytest)
# ensembling (meta) model, change to xgb later
gbc = GradientBoostingClassifier()
gbc.fit(ensemble_data[0][0], ensemble_data[0][1])
pred = gbc.predict(ensemble_data[1][0])
print(recall_score(ensemble_data[1][1], pred))
print(confusion_matrix(ensemble_data[1][1], pred))
print(classification_report(ensemble_data[1][1], pred))
0.827160493827
[[93802 23]
[ 28 134]]
precision recall f1-score support
0 1.00 1.00 1.00 93825
1 0.85 0.83 0.84 162
avg / total 1.00 1.00 1.00 93987
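As mentioned above, the meta-model itself can also be tuned. A minimal sketch grid-searching the GradientBoostingClassifier on the stacked features; the parameter grid here is illustrative, not tuned:

meta_params = {'n_estimators': [50, 100, 200],
               'max_depth': [2, 3, 5],
               'learning_rate': [0.05, 0.1]}
meta_gs = GridSearchCV(GradientBoostingClassifier(random_state = seed),
                       meta_params, cv = 3, scoring = 'recall', n_jobs = -1)
meta_gs.fit(ensemble_data[0][0], ensemble_data[0][1])
meta_pred = meta_gs.predict(ensemble_data[1][0])
print(meta_gs.best_params_)
print(recall_score(ensemble_data[1][1], meta_pred))
print(confusion_matrix(ensemble_data[1][1], meta_pred))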
1.5. Single base model parameters and results
for key in grid_searches.keys():
    print(key)
    print(grid_searches[key].best_params_)
    pred_grid = grid_searches[key].predict(Xtest)
    print(roc_auc_score(ytest, pred_grid))
    print(recall_score(ytest, pred_grid))
    print(confusion_matrix(ytest, pred_grid))
    print('\n')
et
{'n_estimators': 500, 'max_depth': 50, 'min_samples_leaf': 10}
0.9226210143
0.845679012346
[[93784 41]
[ 25 137]]
rf
{'n_estimators': 200, 'max_depth': 50, 'min_samples_leaf': 10}
0.913452349232
0.827160493827
[[93801 24]
[ 28 134]]
ada
{'n_estimators': 500, 'learning_rate': 1.5}
0.92263700151
0.845679012346
[[93787 38]
[ 25 137]]
svc
{'C': 1.0}
0.954330757616
0.932098765432
[[91626 2199]
[ 11 151]]
gb
{'n_estimators': 200, 'max_depth': 50, 'min_samples_leaf': 20}
0.913457678302
0.827160493827
[[93802 23]
[ 28 134]]