Credit Card Fraud Detection Using Different Models

In this blog we are going to hands on the Credit Card Fraud Detection Prediction using different models Like (KNN, Decesion Tree Classifier, AdaBoost Classifier , Naive Bayes. We will check which model is giving us high accuracy and which model will fit in with database.

Here is the code

Importing Liabraries

# Your code here - remember to use markdown cells for comments as well!

import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_curve, auc
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

# Extracting Rows and columns of Database 
data = pd.read_csv('Data/creditcard.csv', sep=',')
data.head()

# Extracting Information and shape of database
display(data.describe())
display(data.info())
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time      284807 non-null float64
V1        284807 non-null float64
V2        284807 non-null float64
V3        284807 non-null float64
V4        284807 non-null float64
V5        284807 non-null float64
V6        284807 non-null float64
V7        284807 non-null float64
V8        284807 non-null float64
V9        284807 non-null float64
V10       284807 non-null float64
V11       284807 non-null float64
V12       284807 non-null float64
V13       284807 non-null float64
V14       284807 non-null float64
V15       284807 non-null float64
V16       284807 non-null float64
V17       284807 non-null float64
V18       284807 non-null float64
V19       284807 non-null float64
V20       284807 non-null float64
V21       284807 non-null float64
V22       284807 non-null float64
V23       284807 non-null float64
V24       284807 non-null float64
V25       284807 non-null float64
V26       284807 non-null float64
V27       284807 non-null float64
V28       284807 non-null float64
Amount    284807 non-null float64
Class     284807 non-null int64
dtypes: float64(30), int64(1)

(284807, 31)

data.hist(figsize=(20,20))
plt.show()

Exploratory Data Analysis

# Checking if is there any Null Values or Missing Values in Database
data.isnull().values.any()

False

count_classes = pd.value_counts(data['Class'], sort = True)

count_classes.plot(kind = 'bar', rot=0)

plt.title("Transaction Class Distribution")

plt.xticks(range(2), LABELS)

plt.xlabel("Class")

plt.ylabel("Frequency")

Text(0, 0.5, ‘Frequency’)

# Get the Fraud and Normal Dataset

fraud = data[data['Class'] == 1]

normal = data[data['Class'] == 0]

# Extracting Fraud Transactions Only
fraud.count()
print(fraud.shape)

# Extracting Normal Transactions Only
print(normal.shape)

(284315, 31)

## We need to analyze more amount of information from the transaction data
#How different are the amount of money used in different transaction classes?
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

normal.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle("Amount per transaction by class")
bins = 50
ax1.hist(fraud.Amount, bins = bins)
ax1.set_title('Fraud')
ax2.hist(normal.Amount, bins = bins)
ax2.set_title('Normal')
plt.xlabel('Amount')
plt.ylabel('Number of Transaction')
plt.xlim((0, 20000))
plt.yscale('log')
plt.show();

f,(ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Time of Transactions Vs Amount by class')
ax1.scatter(fraud.Time, fraud.Amount)
ax1.set_title('Fraud')
ax2.scatter(normal.Time, normal.Amount)
ax2.set_title('Normal')
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

data1 = data.sample(frac=0.1, random_state=1)
data1.shape

(28481, 31)

data.shape

(284807, 31)

fraud = data1[data1['Class'] == 1]
valid = data1[data1['Class'] == 0]

outlier_fraction = len(fraud)/float(len(valid))

print(outlier_fraction)

print("Fraud Class : {}".format(len(fraud)))
print("Valid Class : {}".format(len(valid)))

0.0017234102419808666 Fraud Class : 49 Valid Class : 28432

# Correlation

cormat = data1.corr()
top_corr_feature = cormat.index
plt.figure(figsize=(20,20))

#plt heatmap
g=sns.heatmap(data[top_corr_feature].corr(), annot=True, cmap="RdYlGn")

Classification

# Spliting the data in Train and Test Method

X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Classification Models
clf = linear_model.LogisticRegression(C=1e5)

clf.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

y_pred_logistic = np.array(clf.predict(X_test))
y_pred_logistic_decision = clf.decision_function(X_test)
y = np.array(y_test)

# Confusion Matrix
print(confusion_matrix(y_test, y_pred_logistic))

[[56853 11] [ 46 52]]

print(accuracy_score(y_test, y_pred_logistic))

0.9989993328885924

print(classification_report(y_test, y_pred_logistic))

precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.53      0.65        98

    accuracy                           1.00     56962
   macro avg       0.91      0.77      0.82     56962
weighted avg       1.00      1.00      1.00     56962

KNN

#K Nearest Neighbors
#P=2 - Euclidean distance formula
#N_neigbors = 2 gets the highest precision for correctly classifying non - functional which I think is most important to prevent
#sickness
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
kn_clf = KNeighborsClassifier(n_neighbors=2, p=2)
kn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

test_preds = kn_clf.predict(X_test)
test_preds_decision = kn_clf.predict_proba(X_test)

print(classification_report(y_test, test_preds))

precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       1.00      0.08      0.15        98

    accuracy                           1.00     56962
   macro avg       1.00      0.54      0.58     56962
weighted avg       1.00      1.00      1.00     56962

print(confusion_matrix(y_test, test_preds))

[[56864     0]
 [   90     8]]

accuracy_score(y_test, test_preds)

0.9984199992977775

Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

#Gini - measurement of inequility among values for each variable - the higher the gini score, the data is more dispearsed
#Entropy - measurement of uncertainty 
#RandomizedSearchCV- combinatorial grid search - with combinations order does not matter
#Parameters are not independent of eachother
#Combines cross validation with grid search of best parameters
#GridSearchCV - search over all specified parameter values

from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [3,5,7,20]}

gs_inst = GridSearchCV(dtc, param_grid=param_grid,cv=5)
gs_inst.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

from sklearn.metrics import accuracy_score
y_pred_gs = gs_inst.predict(X_test)
accuracy_score(y_test, y_pred_gs)

0.9994733330992591

gs_inst.best_params_

{‘criterion’: ‘entropy’, ‘max_depth’: 7}

param_grid = {'criterion': ['entropy'], 'max_depth': [5]}
gs_inst2 = GridSearchCV(dtc, param_grid=param_grid,cv=5)
gs_inst2.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['entropy'], 'max_depth': [5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

y_pred_gs2 = gs_inst2.predict(X_test)
accuracy_score(y_test, y_pred_gs2)

0.9993679997191109

print(classification_report(y_test, y_pred_gs))

     precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.90      0.79      0.84        98

    accuracy                           1.00     56962
   macro avg       0.95      0.89      0.92     56962
weighted avg       1.00      1.00      1.00     56962

print(confusion_matrix(y_test, y_pred_gs))

Naive Bayes

from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB().fit(X_train, y_train)

nb.classes_

array([0, 1])

#Number of training samples in each class
nb.class_count_

array([227451., 394.])

accuracy_score(y_test, nb.predict(X_test))

0.9991046662687406

nb_y_pred = nb.predict(X_test)

print(classification_report(y_test,  nb.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.81      0.63      0.71        98

    accuracy                           1.00     56962
   macro avg       0.90      0.82      0.85     56962
weighted avg       1.00      1.00      1.00     56962

print(confusion_matrix(y_test, nb.predict(X_test)))

[[56849 15] [ 36 62]]

 
 ### Plot ROC and compare AUC
 
 # Logistic Regression
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic_decision)
auc_logistic = auc(logistic_fpr, logistic_tpr)

# KNN 
knn_fpr, knn_tpr, threshold = roc_curve(y_test, test_preds_decision[:, 1])
auc_knn = auc(knn_fpr, knn_tpr)

# Dicision Tree Classifier
dt_fpr, dt_tpr, threshold = roc_curve(y_test, y_pred_gs2)
auc_dt = auc(dt_fpr, dt_tpr)

# AdaBooster Classifier
ab_fpr, ab_tpr, threshold = roc_curve(y_test, y_pred_gs_decision)
auc_ab = auc(ab_fpr, ab_tpr)

#SGD
sgd_fpr, sgd_tpr, threshold = roc_curve(y_test, sgd_pred_decision)
auc_sgd = auc(sgd_fpr, sgd_tpr)

# Naive Bayes

nb_fpr, nb_tpr, threshold = roc_curve(y_test, nb_y_pred)
auc_nb = auc(nb_fpr, nb_tpr)

plt.figure(figsize=(8,8), dpi=100)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic(auc = %0.3f)' % auc_logistic)
plt.plot(knn_fpr, knn_tpr, linestyle='--', label='KNN(auc = %0.2f)' % auc_knn)
plt.plot(dt_fpr, dt_tpr, linestyle='--', label='Decicion Tree(auc = %0.3f)' % auc_dt)
plt.plot(ab_fpr, ab_tpr, marker='.', label='AdaBooster(auc = %0.3f)' % auc_ab)
plt.plot(sgd_fpr, sgd_tpr, marker='.', label='Stochastic Gradient Descent(auc = %0.3f)' % auc_sgd)
plt.plot(nb_fpr, nb_tpr, marker='.', label='Naive Bayes(auc = %0.3f)' % auc_nb)
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')

plt.legend()

plt.show()

Conclusion In above figure we can clearly state that AdaBooster Classifier has performed extremly well with having 97% of Area under curve. LogisticRegression Classifier also performed well but still slightly below from AdaBooster Classifier