Credit Card Fraud Detection

Use K-Means clustering to detect irregular credit card transactions.

Posted by Xinyao Wu on June 22, 2020

Goal:

Identify unusual credit card transactions that have a high chance of being fraudulent.

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

1: Customer incentive policy

1-1. Aim

Increase transaction volume by automatically raising the credit limit for reliable card holders.

1-2. Method

Identify target users who never went above their monthly credit card limit (per calendar month).



general_info = pd.read_csv('./cc_info.csv')
transaction = pd.read_csv('./transactions.csv',parse_dates=['date'])

general_info.head()

# 'date' was already parsed by read_csv, so extract the calendar month directly
transaction['month'] = transaction['date'].dt.month

transaction.head()

## Collect the monthly transaction amount for each card holder

credit = pd.merge(general_info,transaction,on = ['credit_card'])

credit.head()

task1 = credit.groupby(['month','credit_card']).agg({'credit_card_limit':'min','transaction_dollar_amount':'sum'})

task1.reset_index(inplace = True)

# card holders who exceeded their limit in at least one month
over = task1.loc[task1['transaction_dollar_amount'] > task1['credit_card_limit'], 'credit_card'].unique()

# monthly records that stayed within the limit
task1 = task1.loc[(task1['transaction_dollar_amount']<=task1['credit_card_limit'])]

# target users: never went above the limit in any calendar month
target_user = task1.loc[~task1['credit_card'].isin(over), 'credit_card'].unique()

### Ratio of identified card holders

target_user.size/credit['credit_card'].unique().size

### Monthly transaction records that stayed within the limit

print(task1)
      month       credit_card  credit_card_limit  transaction_dollar_amount
0         7  1003715054175576              20000                     162.56
1         7  1013870087888817              15000                     281.43
2         7  1023820165155391              28000                     943.26
3         7  1073931538936472              10000                     220.07
4         7  1087468642191606               3000                     241.60
...     ...               ...                ...                        ...
3791     10  9946917923319410              15000                   10174.10
3792     10  9958678964376192              10000                    5623.22
3793     10  9961694231875562              10000                    5442.72
3794     10  9981251982982618              18000                   12959.72
3795     10  9986135779184360              14000                   11796.95

[3571 rows x 4 columns]

2: Increase retention

2-1. Aim:

Reduce late-payment penalties by reminding users when they go above their monthly limit.

2-2. Method:

Build a function that, for a given day, returns the list of users who went above their monthly credit card limit in that calendar month (a day-aware variant is sketched after the output below).


def credit_alarm(date):
    month = date.month
    print(month)  # show which calendar month is being checked
    # total monthly spending per card, compared against the card limit
    task2 = credit.groupby(['month','credit_card']).agg({'credit_card_limit':'min','transaction_dollar_amount':'sum'})
    task2.reset_index(inplace = True)
    users = task2.loc[(task2['month'] == month) & (task2['transaction_dollar_amount']>task2['credit_card_limit'])]['credit_card']
    return users.unique()

# test with example: 2015-10-29 18:23:04
date = pd.to_datetime(['2015-10-29 18:23:04'])
credit_alarm(date[0])
10





array([1106824181265726, 1460880989446247, 1749458277555747,
       1934150487562155, 2245942585429940, 2302576486327459,
       2366928097135853, 2505223645294729, 2610112472096585,
       2891791194252089, 2980539633198204, 3138132199016625,
       3264419298955673, 3276369883343727, 3355576223096097,
       3369600965634913, 3370960377586437, 3546693056773873,
       3676109815092640, 3797102737432115, 3929517687134990,
       3936887050542904, 4052848131106690, 4118286032166087,
       4298557099672376, 4318352196714983, 4462290711836916,
       4973517790485920, 5199442973583621, 5257380962581683,
       5488856737032471, 5612235316109460, 5723635641134781,
       5795626689544539, 5899644472359642, 5915891114492596,
       5975270769354417, 5996982621454469, 6174559182308122,
       6198761755487915, 6292410823269309, 6358192544004241,
       6497866359354370, 6766253113444560, 6984795534098127,
       7107467078128879, 7198750113791865, 7214837915436490,
       7238936669483666, 7266500047328736, 7280963829231048,
       7299183791723634, 7324887971716592, 7338934618553557,
       7492940622489570, 7499289351166761, 7545819552904208,
       7556827548313098, 7762807525339038, 7850942767136368,
       7922818627489943, 7924297455503050, 7943675133681182,
       8117664962797683, 8138690656185482, 8522875529951473,
       8660372645853870, 8766575362057055, 8896425420278012,
       8972201384562696, 9143914562725960, 9213346056999744,
       9484591448272784, 9577424157559810, 9632319271199136,
       9727202337611852, 9999757432802760])
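Note that credit_alarm flags anyone whose full-month total exceeds the limit, even when the given date falls mid-month. A minimal day-aware sketch, assuming the alarm should only count transactions up to the given timestamp (credit_alarm_daily is a name introduced here, not in the original post):

def credit_alarm_daily(date):
    # only count transactions up to (and including) the given timestamp
    sofar = credit.loc[(credit['date'].dt.month == date.month) & (credit['date'] <= date)]
    totals = sofar.groupby('credit_card').agg({'credit_card_limit':'min','transaction_dollar_amount':'sum'}).reset_index()
    over = totals.loc[totals['transaction_dollar_amount'] > totals['credit_card_limit'], 'credit_card']
    return over.unique()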

3: Fraud Detection

3-1: Aim

Detect all transactions that seem unusual and are worth investigating further.

3-2: Method

Implement an unsupervised algorithm (PCA + K-Means):

Feature engineering + PCA + building the model



### 1. Transform the registered city/state into [latitude, longitude]

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

credit['register_place'] = credit['city']+','+credit['state']

credit['trans_place'] = credit[['Lat','Long']].values.tolist()

# add latitude/longitude information to each city:
# we have over 290,000 records but only 124 distinct cities,
# so geocode each distinct city once instead of every row

cities = credit['register_place'].unique()

df = pd.DataFrame({'name': cities})

geolocator = Nominatim(user_agent="clustering")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
df['location'] = df['name'].apply(geocode)

df['point'] = df['location'].apply(lambda x:[x.latitude,x.longitude] if x else None)

df.head()

# merge back into credit and drop duplicate geographic columns
df = pd.merge(credit[['credit_card','credit_card_limit','transaction_dollar_amount','register_place','trans_place']],
              df[['name','point']], left_on=['register_place'], right_on=['name'])

df['register_place'] = df['point']

df

### 2. Calculate the distance between the registered place and the transaction place

from geopy.distance import geodesic

# drop rows where geocoding failed (point is None) before computing distances
df = df.dropna(subset=['register_place'])
df['distance'] = df.apply(lambda x: geodesic(x['register_place'], x['trans_place']).km, axis=1)

df.head()

def stat_percentile(x):
    # 25th/50th/75th percentiles of a card holder's transaction amounts
    ps = [25,50,75]
    res = np.percentile(x,ps)
    return pd.Series(res,index = ['{}%'.format(p) for p in ps])

percent = credit.groupby(['credit_card'])['transaction_dollar_amount'].apply(stat_percentile).unstack()

percent.head()
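As a quick illustration of what stat_percentile returns (made-up numbers, not from the dataset):

s = pd.Series([10, 20, 30, 40, 50])
print(stat_percentile(s))
# 25%    20.0
# 50%    30.0
# 75%    40.0
# dtype: float64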

credit_df = pd.merge(df,percent,on= ['credit_card'])

credit_df.to_csv('./credit_df.csv',index = False)



### 3. Clustering: PCA + K-Means

credit_df.head()

from sklearn.preprocessing import scale
# standardization: zero mean, unit variance per feature
feature = ['credit_card_limit','transaction_dollar_amount','distance','25%','50%','75%']
X = credit_df[feature]
X = scale(X)

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
PrincipleComp = pca.fit_transform(X)
x2d = pd.DataFrame(PrincipleComp,columns=['pc1','pc2'])
plt.scatter(x2d.pc1,x2d.pc2,alpha = 0.1)
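To see how much of the scaled feature variance the 2-D projection actually retains (a quick check, not part of the original post):

# fraction of total variance carried by each of the two components
print(pca.explained_variance_ratio_)
# cumulative share retained by the 2-D projection
print(pca.explained_variance_ratio_.sum())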

from mpl_toolkits.mplot3d import Axes3D
pca = PCA(n_components=3)
PrincipleComp = pca.fit_transform(X)
x3d = pd.DataFrame(PrincipleComp,columns=['pc1','pc2','pc3'])
fig = plt.figure()
ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=20)
# use ax.scatter so pc3 is the z-coordinate; plt.scatter would read the
# third positional argument as marker sizes and warn on negative values
ax.scatter(x3d['pc1'], x3d['pc2'], x3d['pc3'], marker='o')

Tuning the hyperparameter k

#hyperparameter k selection
inertia = []
for k in range(1, 8):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(X)
    inertia.append(np.sqrt(kmeans.inertia_))

plt.plot(range(1, 8), inertia, marker='s');
plt.xlabel('$k$')
plt.ylabel('$J(C_k)$');
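The elbow curve is one heuristic; as a cross-check (not part of the original analysis), silhouette scores on a random subsample can also guide the choice of k (the subsample size of 10,000 is an arbitrary choice to keep the quadratic-cost score affordable):

from sklearn.metrics import silhouette_score

# score each k on a 10,000-row subsample; higher silhouette is better
rng = np.random.RandomState(1)
idx = rng.choice(len(X), size=10000, replace=False)
for k in range(2, 8):
    labels = KMeans(n_clusters=k, random_state=1).fit_predict(X[idx])
    print(k, silhouette_score(X[idx], labels))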

## K-Means clustering


n_clusters = 6
# cluster in the 3-D PCA space; k chosen after inspecting the elbow plot above
kmeans = KMeans(n_clusters = n_clusters).fit(x3d)

x3d['label'] = kmeans.labels_


import itertools
colors = itertools.cycle( ['r','g','b','c','m','y','k'] )

fig = plt.figure(figsize = (12,10) )
ax = Axes3D(fig, rect=[0, 0, 1, 1], elev=30, azim=20)
ax.set_xlim3d(-5, 10)
for label in range(n_clusters) :
    temp = x3d.loc[x3d.label == label,:]
    # ax.scatter keeps pc3 as the z-coordinate (plt.scatter would read it as sizes)
    ax.scatter(temp.pc1, temp.pc2, temp.pc3, c=next(colors), label=label, alpha=0.3, marker='o')

ax.legend(loc='best')
plt.show()

## Check the distribution of cluster labels

x3d.label.value_counts()
5    113127
2     73124
0     70056
1     27449
4      5445
3      5387
Name: label, dtype: int64

Comparing the distributions of two core features between the suspect cluster and the remaining transactions



suspicious_label = 4  # the small cluster treated as suspicious

suspect = credit_df.loc[x3d.label == suspicious_label]

suspect.to_csv('./suspect.csv',index = False)

cols = ["transaction_dollar_amount",'75%']
plt.hist(suspect.loc[:,cols].values,bins=50,label=cols)
plt.legend(loc='best')
plt.show()

unsuspect = credit_df.loc[x3d.label != suspicious_label]

plt.hist(suspect.loc[:,['distance']].values,bins=50,density = True,label = 'suspect')
plt.hist(unsuspect.loc[:,['distance']].values,bins=50,density = True,label = 'unsuspect')
plt.legend(loc='best')
plt.show()

### Drop near-home transactions (distance ≤ 100 km) from the unsuspect group and replot

unsuspect = unsuspect.loc[unsuspect.distance>100]

plt.hist(suspect.loc[:,['distance']].values,bins=50,density = True,label = 'suspect')
plt.hist(unsuspect.loc[:,['distance']].values,bins=50,density = True,label = 'unsuspect')
plt.legend(loc='best')
plt.show()


Quantify the impact of the suspected fraud transactions

# percentage of transactions flagged as suspicious
suspect.shape[0]/credit_df.shape[0]*100

1.8483441280703898

# total dollar amount of the suspicious transactions
suspect['transaction_dollar_amount'].sum()
444493.55000000005

Non-parametric & parametric tests

Aim: check whether the suspect and non-suspect groups genuinely differ (z-test for means, Mann-Whitney U as a distribution-free check).


#for diatance
dis_x = suspect['distance']
dis_y = unsuspect['distance']
#z-test
from statsmodels.stats.weightstats import ztest
r1 = ztest(dis_x,dis_y)
print(r1)
#Mann Whitney U Test
from scipy.stats import mannwhitneyu
r2 = mannwhitneyu(dis_x,dis_y)
print(r2)

(184.11775527497144, 0.0)
MannwhitneyuResult(statistic=14921897.5, pvalue=0.0)

# for transaction amount
dis_x = suspect['transaction_dollar_amount']
dis_y = unsuspect['transaction_dollar_amount']
#z-test
from statsmodels.stats.weightstats import ztest
r1 = ztest(dis_x,dis_y)
print(r1)
#Mann Whitney U Test
from scipy.stats import mannwhitneyu
r2= mannwhitneyu(dis_x,dis_y)
print(r2)

(11.961471627328077, 5.655073132421734e-33)
MannwhitneyuResult(statistic=137735388.0, pvalue=1.5637055034033401e-15)
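With sample sizes this large, p-values saturate near zero, so a standardized effect size says more about how far apart the groups really are. A small add-on (Cohen's d with a pooled standard deviation; cohens_d is a helper defined here, not part of the original post):

def cohens_d(x, y):
    # Cohen's d with pooled sample standard deviation
    nx, ny = len(x), len(y)
    pooled_var = ((nx - 1)*x.var(ddof=1) + (ny - 1)*y.var(ddof=1)) / (nx + ny - 2)
    return (x.mean() - y.mean()) / np.sqrt(pooled_var)

print(cohens_d(suspect['distance'], unsuspect['distance']))
print(cohens_d(suspect['transaction_dollar_amount'], unsuspect['transaction_dollar_amount']))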