In this blog post we will work hands-on through credit card fraud detection using several different models, such as KNN, Decision Tree Classifier, AdaBoost Classifier, and Naive Bayes. We will check which model gives the highest accuracy and which model fits the dataset best.
Here is the code
Importing Libraries
# Import the libraries used throughout this analysis.
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_curve, auc
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]
# Load the credit card transactions into a DataFrame and preview the first rows.
data = pd.read_csv('Data/creditcard.csv', sep=',')
data.head()
# Summary statistics, per-column dtype / non-null counts, and the (rows, columns) shape.
display(data.describe())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() would only add a stray "None" to the output.
data.info()
print(data.shape)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
Time 284807 non-null float64
V1 284807 non-null float64
V2 284807 non-null float64
V3 284807 non-null float64
V4 284807 non-null float64
V5 284807 non-null float64
V6 284807 non-null float64
V7 284807 non-null float64
V8 284807 non-null float64
V9 284807 non-null float64
V10 284807 non-null float64
V11 284807 non-null float64
V12 284807 non-null float64
V13 284807 non-null float64
V14 284807 non-null float64
V15 284807 non-null float64
V16 284807 non-null float64
V17 284807 non-null float64
V18 284807 non-null float64
V19 284807 non-null float64
V20 284807 non-null float64
V21 284807 non-null float64
V22 284807 non-null float64
V23 284807 non-null float64
V24 284807 non-null float64
V25 284807 non-null float64
V26 284807 non-null float64
V27 284807 non-null float64
V28 284807 non-null float64
Amount 284807 non-null float64
Class 284807 non-null int64
dtypes: float64(30), int64(1)
(284807, 31)
# Draw one histogram per numeric column for a first look at the distributions.
data.hist(figsize=(20,20))
plt.show()
Exploratory Data Analysis
# Check whether any cell in the dataset is null / missing.
data.isna().to_numpy().any()
False
# Count the transactions in each class. The counts are ordered by class value
# (0, 1) rather than by frequency so that every bar lines up with the matching
# entry in LABELS no matter which class is more common.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(len(LABELS)), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# Split the transactions by label: rows with Class == 1 go to `fraud`,
# rows with Class == 0 go to `normal`.
fraud = data.loc[data['Class'] == 1]
normal = data.loc[data['Class'] == 0]
# Non-null count per column and overall shape of the fraud subset
fraud.count()
print(fraud.shape)
# Shape of the normal subset
print(normal.shape)
(492, 31)
(284315, 31)
## Take a closer look at the information in the transaction data.
# How do the transaction amounts differ between the two classes?
fraud['Amount'].describe()
count 492.000000
mean 122.211321
std 256.683288
min 0.000000
25% 1.000000
50% 9.250000
75% 105.890000
max 2125.870000
Name: Amount, dtype: float64
normal.Amount.describe()
count 284315.000000
mean 88.291022
std 250.105092
min 0.000000
25% 5.650000
50% 22.000000
75% 77.050000
max 25691.160000
Name: Amount, dtype: float64
# Two stacked histograms of the transaction amount, fraud on top and normal
# below, sharing one x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle("Amount per transaction by class")
bins = 50
ax1.hist(fraud['Amount'], bins=bins)
ax1.set_title('Fraud')
ax2.hist(normal['Amount'], bins=bins)
ax2.set_title('Normal')
# Labels, x-limit and log scale are set on the bottom panel (the current axes
# after plt.subplots); the x-limit also reaches the top panel through sharex.
ax2.set_xlabel('Amount')
ax2.set_ylabel('Number of Transaction')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show()
# Scatter plot of amount against time, one panel per class on a shared x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of Transactions Vs Amount by class')
ax1.scatter(fraud['Time'], fraud['Amount'])
ax1.set_title('Fraud')
ax2.scatter(normal['Time'], normal['Amount'])
ax2.set_title('Normal')
# Axis labels go on the bottom panel, which is the current axes after plt.subplots.
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()
# Draw a 10% random sample of the rows; the fixed random_state keeps the
# sample the same on every run.
data1 = data.sample(frac=0.1, random_state=1)
data1.shape
(28481, 31)
data.shape
(284807, 31)
# Split the sample by label and report how rare fraud is: the ratio of fraud
# rows to valid rows, followed by the size of each class.
fraud = data1.loc[data1['Class'] == 1]
valid = data1.loc[data1['Class'] == 0]
outlier_fraction = len(fraud) / len(valid)
print(outlier_fraction)
print(f"Fraud Class : {len(fraud)}")
print(f"Valid Class : {len(valid)}")
0.0017234102419808666
Fraud Class : 49
Valid Class : 28432
# Correlation matrix of every column in the sampled frame `data1`
cormat = data1.corr()
top_corr_feature = cormat.index
plt.figure(figsize=(20, 20))
# Plot the matrix that was just computed. Recomputing the correlation on the
# full `data` frame here would ignore `cormat` and show a different matrix
# from the one derived above.
g = sns.heatmap(cormat, annot=True, cmap="RdYlGn")
Classification
# Separate the features from the target, then hold out 20% of the rows for testing.
y = data['Class']
X = data.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
# Classification models: logistic regression first. C is the inverse of the
# regularisation strength, so C=1e5 leaves the model almost unregularised.
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X_train, y_train)
LogisticRegression(C=100000.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)
# Hard class predictions plus signed decision scores; the scores are what the
# ROC curve is built from. predict() already returns a NumPy array, so no
# np.array() wrapper is needed.
y_pred_logistic = clf.predict(X_test)
y_pred_logistic_decision = clf.decision_function(X_test)
# `y` (the full label vector) is deliberately left untouched: rebinding it to
# the test labels served no purpose and would break any later re-split.
# Confusion Matrix
print(confusion_matrix(y_test, y_pred_logistic))
[[56853 11]
[ 46 52]]
print(accuracy_score(y_test, y_pred_logistic))
0.9989993328885924
print(classification_report(y_test, y_pred_logistic))
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.83 0.53 0.65 98
accuracy 1.00 56962
macro avg 0.91 0.77 0.82 56962
weighted avg 1.00 1.00 1.00 56962
KNN
#K Nearest Neighbors
#p=2 selects the Euclidean distance formula
#n_neighbors=2 - each prediction is decided by the two closest training samples
# NOTE(review): the features are not scaled before fitting, so large-valued
# columns such as Time and Amount presumably dominate the distance -- confirm
# whether the already-imported StandardScaler should be applied first.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
kn_clf = KNeighborsClassifier(n_neighbors=2, p=2)
kn_clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=2, p=2,
weights='uniform')
# Per-class probabilities (column 1 is later used as the ROC score) and the
# hard class labels used for the report below.
test_preds_decision = kn_clf.predict_proba(X_test)
test_preds = kn_clf.predict(X_test)
print(classification_report(y_test, test_preds))
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 1.00 0.08 0.15 98
accuracy 1.00 56962
macro avg 1.00 0.54 0.58 56962
weighted avg 1.00 1.00 1.00 56962
print(confusion_matrix(y_test, test_preds))
[[56864 0]
[ 90 8]]
accuracy_score(y_test, test_preds)
0.9984199992977775
Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without a fixed random_state the fitted tree (and therefore the selected
# grid-search parameters and scores) can change from run to run.
dtc = DecisionTreeClassifier(random_state=RANDOM_SEED)
# Gini - impurity of a node: 0 when all samples share one class, higher when the classes are mixed
# Entropy - measurement of uncertainty in a node's class distribution
# GridSearchCV - exhaustive search over all specified parameter values,
#   scoring every combination with cross validation
# RandomizedSearchCV - samples a fixed number of parameter combinations
#   instead of trying every one
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, 20]}
gs_inst = GridSearchCV(dtc, param_grid=param_grid, cv=5)
gs_inst.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort=False, random_state=None,
splitter='best'),
iid='warn', n_jobs=None,
param_grid={'criterion': ['gini', 'entropy'],
'max_depth': [3, 5, 7, 20]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
from sklearn.metrics import accuracy_score
y_pred_gs = gs_inst.predict(X_test)
accuracy_score(y_test, y_pred_gs)
0.9994733330992591
gs_inst.best_params_
{'criterion': 'entropy', 'max_depth': 7}
# Second search restricted to a single candidate: entropy criterion, depth 5.
# NOTE(review): a one-candidate grid still runs the full 5-fold CV before the
# final refit -- confirm that a plain DecisionTreeClassifier fit would not do.
param_grid = dict(criterion=['entropy'], max_depth=[5])
gs_inst2 = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
gs_inst2.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=DecisionTreeClassifier(class_weight=None,
criterion='gini', max_depth=None,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
presort=False, random_state=None,
splitter='best'),
iid='warn', n_jobs=None,
param_grid={'criterion': ['entropy'], 'max_depth': [5]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
y_pred_gs2 = gs_inst2.predict(X_test)
accuracy_score(y_test, y_pred_gs2)
0.9993679997191109
print(classification_report(y_test, y_pred_gs))
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.90 0.79 0.84 98
accuracy 1.00 56962
macro avg 0.95 0.89 0.92 56962
weighted avg 1.00 1.00 1.00 56962
print(confusion_matrix(y_test, y_pred_gs))
Naive Bayes
from sklearn.naive_bayes import BernoulliNB
# Build the Bernoulli Naive Bayes model, fit it on the training split, then
# show the class labels it learned.
# NOTE(review): BernoulliNB binarises every feature before fitting; the inputs
# here are continuous, so GaussianNB may be a better match -- confirm.
nb = BernoulliNB()
nb.fit(X_train, y_train)
nb.classes_
array([0, 1])
#Number of training samples in each class
nb.class_count_
array([227451., 394.])
accuracy_score(y_test, nb.predict(X_test))
0.9991046662687406
# Predict once and reuse the labels for the report instead of scoring the
# whole test set a second time.
nb_y_pred = nb.predict(X_test)
print(classification_report(y_test, nb_y_pred))
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.81 0.63 0.71 98
accuracy 1.00 56962
macro avg 0.90 0.82 0.85 56962
weighted avg 1.00 1.00 1.00 56962
# Reuse the stored predictions rather than running nb.predict on the test set again.
print(confusion_matrix(y_test, nb_y_pred))
[[56849 15]
[ 36 62]]
### Plot ROC and compare AUC
# roc_curve needs a continuous score per sample (a decision_function value or
# the positive-class probability). Hard 0/1 predictions collapse the curve to
# a single operating point and give a misleading AUC, so every model below is
# scored with decision_function or predict_proba.

# Logistic Regression
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic_decision)
auc_logistic = auc(logistic_fpr, logistic_tpr)

# KNN
knn_fpr, knn_tpr, threshold = roc_curve(y_test, test_preds_decision[:, 1])
auc_knn = auc(knn_fpr, knn_tpr)

# Decision Tree Classifier: positive-class probability from the refit search
dt_scores = gs_inst2.predict_proba(X_test)[:, 1]
dt_fpr, dt_tpr, threshold = roc_curve(y_test, dt_scores)
auc_dt = auc(dt_fpr, dt_tpr)

# AdaBoost Classifier
# No earlier step produces the scores for this curve, so the model is fitted
# here to keep the comparison runnable.
# NOTE(review): default hyper-parameters are used because the original
# settings are not shown -- confirm against the intended configuration.
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier(random_state=RANDOM_SEED)
ab_clf.fit(X_train, y_train)
y_pred_gs_decision = ab_clf.decision_function(X_test)
ab_fpr, ab_tpr, threshold = roc_curve(y_test, y_pred_gs_decision)
auc_ab = auc(ab_fpr, ab_tpr)

# SGD
# Same situation as AdaBoost: fitted here with default settings.
# NOTE(review): confirm the intended loss / hyper-parameters.
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=RANDOM_SEED)
sgd_clf.fit(X_train, y_train)
sgd_pred_decision = sgd_clf.decision_function(X_test)
sgd_fpr, sgd_tpr, threshold = roc_curve(y_test, sgd_pred_decision)
auc_sgd = auc(sgd_fpr, sgd_tpr)

# Naive Bayes: positive-class probability
nb_scores = nb.predict_proba(X_test)[:, 1]
nb_fpr, nb_tpr, threshold = roc_curve(y_test, nb_scores)
auc_nb = auc(nb_fpr, nb_tpr)

# One curve per model, each labelled with its AUC
plt.figure(figsize=(8, 8), dpi=100)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic(auc = %0.3f)' % auc_logistic)
plt.plot(knn_fpr, knn_tpr, linestyle='--', label='KNN(auc = %0.2f)' % auc_knn)
plt.plot(dt_fpr, dt_tpr, linestyle='--', label='Decicion Tree(auc = %0.3f)' % auc_dt)
plt.plot(ab_fpr, ab_tpr, marker='.', label='AdaBooster(auc = %0.3f)' % auc_ab)
plt.plot(sgd_fpr, sgd_tpr, marker='.', label='Stochastic Gradient Descent(auc = %0.3f)' % auc_sgd)
plt.plot(nb_fpr, nb_tpr, marker='.', label='Naive Bayes(auc = %0.3f)' % auc_nb)
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.legend()
plt.show()
Conclusion: From the figure above, we can clearly see that the AdaBoost classifier performed extremely well, with an area under the curve (AUC) of 97%. The logistic regression classifier also performed well, but still slightly below the AdaBoost classifier.