数据导入与查看
# -*- coding: utf-8 -*-
# %%time
# from pyhive import presto
import pandas as pd
import numpy as np
import warnings
import os
# Load the source CSV into `data` and preview its first two rows.
RAW_CSV_PATH = '*/全域风险.csv'
data = pd.read_csv(RAW_CSV_PATH)
data.head(2)
# label= pd.DataFrame(list(result),columns=columns_names)
# label.to_csv('/data/ljk/baixin.csv',index=False)
数据筛选
# Columns carried forward into `data2`.
feature = [
    'num_id',
    'zhiye',
    'weiyue',
    'gongzhai',
    'qingchang',
    'zhuxing',
    'lvyue',
    'shouxin',
    'xiaofei',
    'xingqu',
    'chengzhang',
]

# Keep only rows whose `fina_date` compares below '2020-01-01', then restrict
# the result to the columns listed in `feature`.
is_before_cutoff = data['fina_date'] < '2020-01-01'
data2 = data.loc[is_before_cutoff, feature]
data2.head()
scorecardpy Python包的使用
import scorecardpy as sc
import matplotlib.pyplot as plt
%matplotlib inline
plt.show()
# Bin `zhiye` against the `overdue` target with scorecardpy, leaving out rows
# whose `overdue` equals -1, then draw the resulting bins.
woe_input = data_new.loc[data_new.overdue != -1, ['zhiye', 'overdue']]
bins_new = sc.woebin(woe_input, y="overdue")
woebin_plot = sc.woebin_plot(bins_new)
woebin_plot
结果编辑
# Per-variable bin report.
#
# Every column of `data_new` after the first is treated as a candidate
# variable. A candidate with at least 10 distinct non-negative values is cut
# into up to five quantile bins, and per-bin counts and rates are computed
# against the `overdue` label. All per-variable tables are stacked, their
# columns renamed, and the result written to CSV.
#
# NOTE: as before, the scratch columns `flag` and `var_name` are written onto
# `data_new`. They are skipped as candidates below, so this cell can be re-run
# without first dropping them from `data_new`.
scratch_cols = ('flag', 'var_name')
candidate_cols = [c for c in data_new.columns[1:] if c not in scratch_cols]

per_variable_results = []
for cl in candidate_cols:
    # Only non-negative values take part in the binning.
    x = data_new.loc[data_new[cl] >= 0, cl]
    if x.nunique() < 10:
        continue

    value_bins = pd.qcut(x, 5, duplicates='drop')
    data_new['flag'] = value_bins  # bin of each row (NaN for rows not in `x`)
    data_new['var_name'] = cl      # variable being binned
    # Non-inplace rename: renaming a column slice in place raises
    # SettingWithCopyWarning.
    tmp = data_new[['var_name', 'flag', 'overdue']].rename(
        columns={'overdue': 'label'}
    )

    by_bin = tmp.groupby(['var_name', 'flag'])
    result_stp = by_bin.count()                    # samples per bin
    result_pos = by_bin['label'].sum().to_frame()  # sum of label per bin ("black" samples)
    result_neg = (
        tmp[tmp['label'] == 0]
        .groupby(['var_name', 'flag'])['label']
        .count()
        .to_frame()
    )                                              # label == 0 per bin ("white" samples)

    merge_result = (
        result_stp
        .merge(result_pos, how='left', on=['var_name', 'flag'])
        .merge(result_neg, how='left', on=['var_name', 'flag'])
        .reset_index()
        .rename(columns={'label_x': 'stp', 'label_y': 'pos', 'label': 'neg'})
    )

    # Rows are already in bin order, so rank is simply 1..number_of_bins.
    merge_result['rank'] = range(1, len(merge_result) + 1)

    # Running totals over the bins (a single variable per table, so a plain
    # cumsum matches the grouped one), then the grand totals taken from the
    # last bin's running total.
    for base in ('pos', 'neg', 'stp'):
        merge_result['cunsum_' + base] = merge_result[base].cumsum()
    for base in ('pos', 'neg', 'stp'):
        merge_result['total_' + base] = merge_result['cunsum_' + base].iloc[-1]

    res = merge_result
    res['intercept'] = res['stp'] / res['total_stp']             # per-bin interception rate
    res['precision'] = res['pos'] / res['stp']                   # per-bin precision
    res['recall'] = res['pos'] / res['total_pos']                # per-bin recall
    res['Disturb'] = res['neg'] / res['total_neg']               # per-bin disturb rate
    res['cum_precision'] = res['cunsum_pos'] / res['cunsum_stp'] # cumulative precision
    res['avg_precision'] = res['total_pos'] / res['total_stp']
    res['cum_recall'] = res['cunsum_pos'] / res['total_pos']     # cumulative recall
    res['cum_Disturb'] = res['cunsum_neg'] / res['total_neg']    # cumulative disturb rate
    res['ks'] = res['cum_recall'] - res['cum_Disturb']
    res['ks_max'] = res['ks'].max()

    rs = res.drop_duplicates(subset=None, keep='first', inplace=False)
    per_variable_results.append(rs)

# DataFrame.append was removed in pandas 2.0; collect the pieces and
# concatenate once. An empty frame is kept for the "no variable qualified" case.
if per_variable_results:
    merge_result_total = pd.concat(per_variable_results)
else:
    merge_result_total = pd.DataFrame()

merge_result_total = merge_result_total.rename(columns={
    'var_name': '变量',
    'flag': '拦截区间',
    'stp': '拦截样本数',
    'pos': '黑样本数',
    'neg': '白样本数',
    'cunsum_pos': '累计黑样本数',
    'cunsum_neg': '累计白样本数',
    'cunsum_stp': '累计拦截数',
    'intercept': '拦截率',
    'precision': '准确率',
    'recall': '召回率',
    'Disturb': '打扰率',
    'cum_precision': '累计准确率',
    'avg_precision': '平均准确率',
    'cum_recall': '累计召回率',
    'cum_Disturb': '累计打扰率',
    'ks': 'ks区间值',
    'ks_max': 'ks值',
    'total_pos': '总黑样本',
    'total_neg': '总白样本',
    'total_stp': '总样本',
})
merge_result_total.to_csv('*/quanyumob3_result0421.csv', header=True, index=False)
merge_result_total
ks曲线函数
调用方法
# Draw the KS chart for the `zhiye` column of `data_new3` against `overdue`,
# sampled at 20 points with scores sorted ascending; `ks` holds the table
# returned by PlotKS.
ks = PlotKS(data_new3['zhiye'], data_new3['overdue'], n=20, asc=True)
ks
# Call show(): a bare `plt.show` only references the function and renders nothing.
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
####################### PlotKS ##########################
def _ks_cumulative_rates(frame, pred_ascending, bad_ascending, suffix):
    """Sort by ``pred`` then ``bad`` and return cumulative good/bad rates.

    The result has a fresh 0..len-1 index and two columns named
    ``cumsum_good<suffix>`` and ``cumsum_bad<suffix>``.
    """
    ordered = frame.sort_values(
        by=['pred', 'bad'], ascending=[pred_ascending, bad_ascending]
    )
    ordered.index = range(len(ordered))
    return pd.DataFrame({
        'cumsum_good' + suffix: 1.0 * ordered.good.cumsum() / ordered.good.sum(),
        'cumsum_bad' + suffix: 1.0 * ordered.bad.cumsum() / ordered.bad.sum(),
    })


def PlotKS(preds, labels, n=20, asc=True):
    """Plot cumulative good/bad curves with the KS curve and return the table.

    Args:
        preds: Predicted values, one per sample.
        labels: Labels matching ``preds``; 1 marks a bad sample, 0 a good one.
        n: Number of evenly spaced population quantiles at which the curves
            are sampled.
        asc: Truthy sorts ``preds`` ascending (use when ``preds`` is a score);
            falsy sorts descending (use when ``preds`` is a probability).

    Returns:
        DataFrame with columns ``tile``, ``cumsum_good``, ``cumsum_bad`` and
        ``ks``: an all-zero origin row followed by one row per sampled
        quantile.

    Raises:
        ValueError: If ``n`` is smaller than 1.

    Side effects:
        Prints the maximum KS and its population position, and draws onto the
        current pyplot figure (``plt.show()`` is left to the caller).
    """
    n = int(n)
    if n < 1:
        raise ValueError('n must be a positive integer')

    frame = pd.DataFrame({'bad': labels, 'pred': preds})
    frame['good'] = 1 - frame.bad

    # Any truthy/falsy value is accepted, so an unexpected `asc` no longer
    # leaves the sorted frames undefined.
    pred_ascending = bool(asc)

    # Ties in `pred` are ordered once with good samples first and once with
    # bad samples first; the two cumulative curves are then averaged.
    good_first = _ks_cumulative_rates(frame, pred_ascending, True, '1')
    bad_first = _ks_cumulative_rates(frame, pred_ascending, False, '2')
    ksds = pd.concat([good_first, bad_first], axis=1)
    ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2']) / 2
    ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2']) / 2

    ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
    # Share of the population covered up to and including each row.
    ksds['tile'] = np.arange(1, len(ksds) + 1) / len(ksds)

    # Row positions at the quantiles 1/n, 2/n, ..., 1. linspace yields exactly
    # n points; arange with a fractional step can emit an extra one.
    quantiles = np.linspace(0, 1, n + 1)[1:]
    positions = pd.Series(ksds.index).quantile(q=quantiles)
    positions = list(np.ceil(positions).astype(int))
    sampled = ksds.loc[positions, ['tile', 'cumsum_good', 'cumsum_bad', 'ks']]

    # Prepend the (0, 0, 0, 0) origin so every curve starts at zero.
    columns = ['tile', 'cumsum_good', 'cumsum_bad', 'ks']
    origin = np.array([[0, 0, 0, 0]])
    ksds = pd.DataFrame(
        np.concatenate([origin, sampled.to_numpy()], axis=0), columns=columns
    )

    peak = ksds.ks.idxmax()
    ks_value = ksds.ks.max()
    ks_pop = ksds.loc[peak, 'tile']
    print('ks_value is %s at pop = %s'
          % (np.round(ks_value, 4), np.round(ks_pop, 4)))

    # chart
    plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
             color='blue', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
             color='red', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.ks, label='ks',
             color='green', linestyle='-', linewidth=2)

    # Dashed guides through the point of maximum KS.
    plt.axvline(ks_pop, color='gray', linestyle='--')
    plt.axhline(ks_value, color='green', linestyle='--')
    plt.axhline(ksds.loc[peak, 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(ksds.loc[peak, 'cumsum_bad'], color='red', linestyle='--')

    plt.legend()  # render the curve labels passed to plot() above
    plt.title('KS=%s at Pop=%s'
              % (np.round(ks_value, 4), np.round(ks_pop, 4)), fontsize=15)
    return ksds