Credit Card Fraud Prediction (Handling Imbalanced Data)


Author: 紫雨

Contents

  1. Problem statement
  2. Understanding the data
  • Importing the data
  • Inspecting the dataset
  3. Building classification models
  • Logistic regression
  • Random forest
  4. Handling imbalanced data (SMOTE)
  5. Rebuilding the models
  • SMOTE + logistic regression
  • SMOTE + random forest
  6. Conclusion

1. Problem statement

Under what circumstances is a bank card likely to be used fraudulently?

2. Understanding the data

2.1 Importing the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec   # for laying out multiple subplots
%matplotlib inline
df=pd.read_csv('creditcard.csv')
2.2 Inspecting the dataset
df.head()
df.info()           # data types and non-null counts
df.isnull().sum()   # per-column null counts, to confirm there are no missing values
2.2.1 Class distribution: normal vs. fraud
# Inspect the class distribution
print('Class distribution: normal (0) vs. fraud (1)')
print(df['Class'].value_counts())        # counts per class
plt.style.use('ggplot')                  # alternatively: sns.set_style('darkgrid')
df['Class'].value_counts().plot.bar()    # alternatively: sns.countplot(df['Class'])
plt.title('Distribution')
plt.xlabel('Class')
plt.ylabel('Transactions')
plt.xticks(ticks=[0,1], labels=['normal (0)','fraud (1)'], rotation=0)
plt.show()

2.2.2 Transaction amounts
normal=df[df['Class']==0]    # normal transactions
fraud=df[df['Class']==1]     # fraudulent transactions
print('Normal transaction amounts\n', normal['Amount'].describe())    # keep the \n inside the quotes together with the text, not outside them
print('\nFraudulent transaction amounts\n', fraud['Amount'].describe())

plt.figure(figsize=(10,5))    # set the figure size
ax1=plt.subplot(121)
sns.boxplot(x='Class', y='Amount', data=df, ax=ax1)
plt.title('Amount distribution')
ax2=plt.subplot(122)
df['Amount_log']=np.log(df['Amount']+0.01)    # the minimum amount is 0, which cannot be logged directly, so add a small constant; np.log1p(df['Amount']) also works
sns.boxplot(x='Class', y='Amount_log', data=df, ax=ax2)
plt.title('Distribution in log transformation')
plt.show()

2.2.3从正常和盗刷其他特征分布观察
list=df.iloc[:,1:29].columns                #选出第2列至第第29列的列名并制成列表grid=gridspec.GridSpec(14,2)           #制作14*2的画布
plt.figure(figsize=(15,40))
for i,col in enumerate(df[list]):          #形成索引(i)和列表名(col)一一对应的关系,0对应V1()第2列列名,1对应V2(第3列列名),以此类推ax=plt.subplot(grid[i])                 #在画布上定位sns.distplot(df[df['Class']==1][col],bins=50,color='r')sns.distplot(df[df['Class']==0][col],bins=50,color='g')plt.title(str(col))plt.ylabel('Density')plt.xlabel('')
plt.show()





The 28 plots above show that V2, V3, V4, V9, V10, V11, V12, V14, V16, V17, V18, V19, and V27 differ substantially in distribution between the two classes, so they discriminate well between normal and fraudulent transactions. Only these 13 features will be used for training later.
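
Eyeballing 28 density plots is subjective. As a numerical cross-check (an addition to the original analysis, assuming scipy is available), a two-sample Kolmogorov-Smirnov statistic per feature gives the same kind of separability ranking, reusing the normal and fraud frames defined above:

# Hedged cross-check (not part of the original post): rank features by the
# two-sample KS statistic between the fraud and normal distributions.
from scipy.stats import ks_2samp
ks_scores = {col: ks_2samp(fraud[col], normal[col]).statistic
             for col in df.iloc[:,1:29].columns}
print(pd.Series(ks_scores).sort_values(ascending=False).head(13))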

timedelta=pd.to_timedelta(df['Time'], unit='s')                   # convert the elapsed seconds into a ?d ?h ?m timedelta
df['Time_hour']=timedelta.dt.components.hours.astype(int)         # extract the hour
df['Time_min']=timedelta.dt.components.minutes.astype(int)        # extract the minute
sns.distplot(df[df['Class']==0]['Time_hour'], color='g')
sns.distplot(df[df['Class']==1]['Time_hour'], color='r')
plt.xlim([-1,25])    # restrict the x axis to the range -1 through 25
plt.show()

3. Building classification models

from imblearn.over_sampling import SMOTE    # oversampling to counter the class imbalance
from sklearn.model_selection import train_test_split
from collections import Counter             # count how many samples fall in each class
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,fbeta_score,precision_recall_curve    # confusion matrix, accuracy, precision, recall, F-score, precision-recall curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV    # automated grid search over hyperparameters
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)    # silence deprecation warnings
# Define a scoring function
def print_results(headline, true_value, pred):
    print(headline)
    print('accuracy %f' % accuracy_score(true_value, pred))
    print('precision %f' % precision_score(true_value, pred))
    print('recall %f' % recall_score(true_value, pred))
    print('f2 %f' % fbeta_score(true_value, pred, beta=2))    # beta=1 gives the F1 score, weighting recall and precision equally; beta < 1 favors precision, beta > 1 (e.g. the F2 score used here) favors recall
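
As a quick illustration of the beta weighting (a toy example added here, not drawn from the credit card data), fbeta_score can be checked against the textbook formula F_beta = (1 + beta^2) * P * R / (beta^2 * P + R):

y_true_toy = [0, 0, 0, 1, 1, 1, 1, 0]    # hypothetical labels, for illustration only
y_hat_toy  = [0, 0, 1, 1, 1, 0, 1, 0]
p = precision_score(y_true_toy, y_hat_toy)    # 3 TP / 4 predicted positives = 0.75
r = recall_score(y_true_toy, y_hat_toy)       # 3 TP / 4 actual positives = 0.75
print((1 + 2**2) * p * r / (2**2 * p + r))           # 0.75 by the formula
print(fbeta_score(y_true_toy, y_hat_toy, beta=2))    # matches
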
df=df[['Time_hour','Time_min','V2','V3','V4','V9','V10','V11','V12','V14','V16','V17','V18','V19','V27','Amount','Class']]
# Log-transform Amount so its skewed distribution is easier for the models to separate
df.Amount=np.log(df.Amount+0.01)
df.head()

(Table output omitted here.)

x=df.drop(['Class'],axis=1).values
y=df.Class.values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2)
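
One caveat not addressed in the original code: with only about 0.17% positive samples, a plain random split can leave the test set with an unrepresentative number of fraud cases. An optional variant (a sketch, not what the rest of this post uses) passes stratify=y so the class ratio is preserved in both folds:

# Optional stratified split: keeps the fraud ratio identical in train and test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2, stratify=y)
print(Counter(y_train), Counter(y_test))
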
3.1 Logistic regression
param_grid={'C':[0.01,0.1,1,10],'penalty':['l1','l2']}    # penalty: the regularization term, l1 or l2, which constrains the weights to reduce overfitting; C is the inverse of the regularization strength lambda, default 1
logreg=LogisticRegression(random_state=2, solver='liblinear')    # liblinear supports both l1 and l2; newer sklearn defaults to lbfgs, which rejects l1
grid_search_lr=GridSearchCV(logreg, param_grid=param_grid, scoring='recall', cv=5)
grid_search_lr.fit(x_train, y_train)
print(grid_search_lr.best_score_)     # best recall found by the grid search
print(grid_search_lr.best_params_)    # the corresponding parameters

logreg=LogisticRegression(C=1, penalty='l2', random_state=2)    # refit the logistic regression with the best parameters
logreg.fit(x_train, y_train)                                    # train the model
print('cross validation of x and y train: \n', cross_val_score(logreg, x_train, y_train, cv=5, scoring='recall'))    # cross-validated recall scores

y_pred=logreg.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print_results('logreg classification',y_test,y_pred)

y_pred_prob=logreg.predict_proba(x_test)[:,1]
precision,recall,thresholds=precision_recall_curve(y_test,y_pred_prob)
plt.plot(recall, precision)    # recall on the x axis, precision on the y axis, matching the labels below
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()
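
The curve is easier to compare across models when condensed into a single number. average_precision_score (a standard sklearn metric, not used in the original post) approximates the area under the precision-recall curve:

from sklearn.metrics import average_precision_score
print('average precision: %f' % average_precision_score(y_test, y_pred_prob))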

3.2 Random forest
param_grid={'max_depth':[3,5,None],'n_estimators':[3,5,10],'max_features':[5,6,7,8]}    # max_depth: depth of each tree (None means unlimited); n_estimators: number of trees; max_features: features considered per split
model=RandomForestClassifier(max_features=3, max_depth=2, n_estimators=10, random_state=3, criterion='entropy', n_jobs=-1)    # n_jobs defaults to 1; -1 uses all CPU cores in parallel
grid_search=GridSearchCV(model,param_grid=param_grid,cv=5,scoring='recall')
grid_search.fit(x_train,y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

rf=RandomForestClassifier(max_features=7,max_depth=5,n_estimators=10)
rf.fit(x_train,y_train)
print('Training score data:\n',rf.score(x_train,y_train))
y_pred=rf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print_results('RF classification',y_test,y_pred)

y_pred_prob=rf.predict_proba(x_test)[:,1]
precision,recall,thresholds=precision_recall_curve(y_test,y_pred_prob)
plt.plot(recall, precision)    # recall on the x axis, precision on the y axis, matching the labels below
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

4. Handling imbalanced data (SMOTE)

print('normal data distribution: {}' .format(Counter(y_train)))
x_smote, y_smote = SMOTE(random_state=4).fit_resample(x_train, y_train)    # fit_resample was called fit_sample in older imblearn versions
print('SMOTE data distribution:  {}' .format(Counter(y_smote)))
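
For intuition about what SMOTE actually does, here is a minimal sketch of the idea (illustrative only, not imblearn's implementation): each synthetic sample lies on the segment between a minority point and one of its nearest minority-class neighbors.

# Minimal illustration of the SMOTE idea (not imblearn's actual code).
from sklearn.neighbors import NearestNeighbors
minority = x_train[y_train == 1]                      # fraud rows only
nn = NearestNeighbors(n_neighbors=2).fit(minority)    # neighbor 0 is the point itself
_, idx = nn.kneighbors(minority[:1])
neighbor = minority[idx[0, 1]]                        # nearest other minority point
gap = np.random.RandomState(4).rand()                 # random position on the segment
synthetic = minority[0] + gap * (neighbor - minority[0])
print(synthetic[:5])                                  # one illustrative synthetic sample
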

5. Rebuilding the models

5.1 SMOTE + logistic regression
param_grid_smote={'C':[0.01,0.1,1,10],'penalty':['l1','l2']}   
logreg_smote=LogisticRegression(random_state=2, solver='liblinear')    # liblinear supports both l1 and l2 penalties
grid_search_lr_smote=GridSearchCV(logreg_smote,param_grid=param_grid_smote,scoring='recall',cv=5)
grid_search_lr_smote.fit(x_smote,y_smote)
print(grid_search_lr_smote.best_score_)
print(grid_search_lr_smote.best_params_)

logreg_smote=LogisticRegression(C=10,penalty='l2',random_state=2)
logreg_smote.fit(x_smote,y_smote)
print('cross validation of x and y train: \n',cross_val_score(logreg_smote,x_smote,y_smote,cv=5,scoring='recall'))

y_pred=logreg_smote.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print_results('logreg classification',y_test,y_pred)

y_pred_prob=logreg_smote.predict_proba(x_test)[:,1]
precision,recall,thresholds=precision_recall_curve(y_test,y_pred_prob)
plt.plot(recall, precision)    # recall on the x axis, precision on the y axis, matching the labels below
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()


After SMOTE, recall improves, but precision and accuracy both fall, precision most sharply, so the model's predictions are not great overall. From the bank's perspective, however, a model that flags too many transactions as fraud is preferable to one that lets fraud pass as normal.
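
One way to trade some of that recall back for precision without retraining (a hedged suggestion, not in the original post) is to move the decision threshold away from the default 0.5, reusing the precision, recall, and thresholds arrays just computed:

# Sketch: pick the most precise threshold whose recall stays above a target
# (0.85 here is an arbitrary illustrative choice, assumed to be reachable).
target_recall = 0.85
ok = recall[:-1] >= target_recall        # thresholds has one fewer entry than recall
best = np.argmax(precision[:-1] * ok)    # most precise point meeting the target
print('threshold %.3f -> precision %.3f, recall %.3f' % (thresholds[best], precision[best], recall[best]))
print_results('logreg_smote (tuned threshold)', y_test, (y_pred_prob >= thresholds[best]).astype(int))
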

5.2 SMOTE + random forest
model_smote=RandomForestClassifier(max_features=3,max_depth=2,n_estimators=10,random_state=3,criterion='entropy',n_jobs=-1)
grid_search_smote=GridSearchCV(model_smote,param_grid=param_grid,cv=5,scoring='recall')
grid_search_smote.fit(x_smote, y_smote)
print(grid_search_smote.best_score_)
print(grid_search_smote.best_params_)

rf_smote=RandomForestClassifier(max_features=6,max_depth=None,n_estimators=10)
rf_smote.fit(x_smote,y_smote)
print('Training score data:\n', rf_smote.score(x_smote, y_smote))
y_pred=rf_smote.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print_results('RF classification',y_test,y_pred)

y_pred_prob=rf_smote.predict_proba(x_test)[:,1]
precision,recall,thresholds=precision_recall_curve(y_test,y_pred_prob)
plt.plot(recall, precision)    # recall on the x axis, precision on the y axis, matching the labels below
plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

results=cross_val_score(rf_smote,x_smote,y_smote,cv=10,scoring='recall')
results

# Feature importance ranking
features=['Time_hour','Time_min','V2','V3','V4','V9','V10','V11','V12','V14','V16','V17','V18','V19','V27','Amount']
plt.figure(figsize=(12,6))
feat_import=pd.DataFrame({'Feature':features,'Feature importance':rf_smote.feature_importances_})
feat_import=feat_import.sort_values(by='Feature importance',ascending=False)
sns.barplot(x='Feature',y='Feature importance',data=feat_import)
plt.xticks(rotation=45)
plt.title('Feature importance')
plt.show()


After SMOTE, the random forest model performs well overall: accuracy dips slightly from 0.999561 to 0.999491 and precision drops from 0.915493 to 0.823529, but recall rises from 0.773810 to 0.833333, and the F score, as a combined measure, rises from 0.798526 to 0.831354. The model's predictive power is good.
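
To make the four-way comparison explicit, the same held-out test-set evaluation can be looped over all four fitted models (a convenience sketch added here; the figures quoted above come from the individual runs in this post):

# Evaluate all four fitted models on the same test set and collect the scores.
models = {'logreg': logreg, 'rf': rf, 'logreg+SMOTE': logreg_smote, 'rf+SMOTE': rf_smote}
rows = []
for name, m in models.items():
    pred = m.predict(x_test)
    rows.append({'model': name,
                 'accuracy': accuracy_score(y_test, pred),
                 'precision': precision_score(y_test, pred),
                 'recall': recall_score(y_test, pred),
                 'f2': fbeta_score(y_test, pred, beta=2)})
print(pd.DataFrame(rows).set_index('model'))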

6. Conclusion

The credit card dataset is severely imbalanced, and neither logistic regression nor random forest applied directly to the raw data predicts fraud well. After applying SMOTE to the imbalanced data and rebuilding both models on the resampled data, four models were trained in total. Overall, the SMOTE + random forest model gives the most accurate predictions.
