背景:
目前有一批汽车信用贷款用户违约数据(客户属性 + 账号属性 + 消费行为 +还款行为),市场部门想根据这些数据建立模型从而预测下一批相似用户将来是否会违约。
数据源:
data.csv(一份汽车贷款违约数据)
样本量:7193
建模方法: 逻辑回归
指标评估:准确度 和 ROC 曲线 --用来描述模型分辨能力,对角线以上的图形越高越好
模型结论
- 验证集准确率为67.76%,测试集的准确率为67.37%,精度效果一般
- 验证集的ROC和测试集的ROC为0.791和 0.782,模型效果一般
- 逻辑回归不是很适合该类数据,建议使用决策树,神经网络,贝叶斯分类器,KNN分类算法等相关分类模型预测优化
代码
1.变量的粗筛》2.缺失值处理》3.变量的细筛》4.建模》5.检查共线性和模型的评估
> # 1.读取数据
> data<-read.csv("data.csv")
> data <- data[,c(-1)]
> # 2.变量类型
> # 2.1 分类变量转化为因子
> factor_var = c('bad_ind','bankruptcy_ind','vehicle_make','used_ind')
> data$bad_ind<-as.factor(data$bad_ind)
> data$bankruptcy_ind<-as.factor(data$bankruptcy_ind)
> data$vehicle_make<-as.factor(data$vehicle_make)
> data$used_ind<-as.factor(data$used_ind)
> # 2.2 连续变量转化为数值,没有特殊格式的数值
> # 3. 变量类型即缺失情况,发现有很多缺失值
> dfexplore::dfplot(data)
# 4. 变量粗筛
>
> library(party)
> set.seed(123)
> crf <- cforest(bad_ind~.,controls = cforest_unbiased(mtry = 2,ntree =100),data = data)
> varIMP = varimp(crf);varIMP = varIMP[order(varIMP,decreasing = T)]
> barplot(varIMP)
> abline(h = 0.001,col = 'red')
#选取前20个较为重要的变量
> var = names(varIMP[1:20])
> data1 = data[,var]
> data1$bad_ind = data$bad_ind
>
>
> #5.缺失值,异常值和错误值的处理
>
> outlier = function(data,var)
+ {
+ vmean<-mean(data[,var],na.rm=TRUE)
+ data[is.na(data[,var]),i]<-vmean
+ data[data[,var] < quantile(data[,var],0.01),var] = quantile(data[,var],0.01)
+ # data[筛选条件,对应组] =
+ # quantile(x,probs) #求分位数。
+ # 其中x为待求分位数的数值型向量,pobs为一个由[0,1]之间的概率值组成的数值向量
+ data[data[,var] > quantile(data[,var],0.99),var] = quantile(data[,var],0.99)
+ return(data[,var])
+ }
>
> var = sapply(data1,class)
> interval_var = names(var[var != 'factor'])
> for (i in interval_var){data1[,i] = outlier(data1,i)}
>
> # 检验还有没有缺失值
> dfexplore::dfplot(data1)
#变量的细筛,信息价值,剔除weak以下的信息价值低的变量
> library(woe)
> IV <- iv.mult(data1,"bad_ind",TRUE)
> iv.plot.summary(IV)
> data2 = data1[,setdiff(names(data1),IV[IV$Strength == 'Wery weak',1])]
>
> #训练集测试集
>
> index = sample(nrow(data2),nrow(data2)*.6)
> train = data2[index,]
> test = data2[-index,]
>
> # logit建模
>
> lg<-glm(bad_ind~.,family=binomial(link='logit'),data = train)
> summary(lg)
> lg_ms<-step(lg,direction = "both")
Step: AIC=3655.05
bad_ind ~ fico_score + age_oldest_tr + ltv + tot_rev_line + rev_util +
bankruptcy_ind
Df Deviance AIC
<none> 3639.0 3655.0
+ tot_rev_tr 1 3637.3 3655.3
+ msrp 1 3637.9 3655.9
+ loan_term 1 3638.0 3656.0
+ tot_rev_debt 1 3638.5 3656.5
+ down_pyt 1 3638.6 3656.6
+ tot_derog 1 3638.6 3656.6
+ veh_mileage 1 3638.9 3656.9
+ tot_tr 1 3639.0 3657.0
- rev_util 1 3644.4 3658.4
- tot_rev_line 1 3649.5 3663.5
- age_oldest_tr 1 3670.1 3684.1
- bankruptcy_ind 2 3693.7 3705.7
- ltv 1 3742.3 3756.3
+ vehicle_make 133 3498.0 3780.0
- fico_score 1 4043.0 4057.0
> #共线性,lg_ms模型有没有共线性,发现该模型没有数据共线性的问题
> library(car)
> vif(lg_ms)
GVIF Df GVIF^(1/(2*Df))
fico_score 1.191609 1 1.091609
age_oldest_tr 1.201768 1 1.096252
ltv 1.060458 1 1.029785
tot_rev_line 1.270180 1 1.127023
rev_util 1.178712 1 1.085685
bankruptcy_ind 1.143360 2 1.034060
> #系数情况
> summary(lg_ms)
Call:
glm(formula = bad_ind ~ fico_score + age_oldest_tr + ltv + tot_rev_line +
rev_util + bankruptcy_ind, family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9473 -0.7030 -0.4036 -0.1497 3.0196
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.511e+00 6.754e-01 14.082 < 2e-16 ***
fico_score -1.781e-02 9.609e-04 -18.538 < 2e-16 ***
age_oldest_tr -3.034e-03 5.595e-04 -5.423 5.85e-08 ***
ltv 2.787e-02 2.853e-03 9.769 < 2e-16 ***
tot_rev_line -1.066e-05 3.395e-06 -3.140 0.00169 **
rev_util 2.398e-03 1.029e-03 2.330 0.01983 *
bankruptcy_indN -1.183e+00 1.736e-01 -6.817 9.27e-12 ***
bankruptcy_indY -1.661e+00 2.265e-01 -7.336 2.20e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4466.7 on 4314 degrees of freedom
Residual deviance: 3639.0 on 4307 degrees of freedom
AIC: 3655
Number of Fisher Scoring iterations: 5
> train$lg_p<-predict(lg_ms, train)
> train$p<-(1/(1+exp(-1*train$lg_p)))
> summary(train$p)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.001521 0.062900 0.172000 0.212700 0.323600 0.866600
> train$out<-1
> train[train$p<0.2,]$out<-0
> table(train$bad_ind,train$out)
0 1
0 2202 1195
1 196 722
>
> test$lg_p<-predict(lg_ms, test)
> test$p<-(1/(1+exp(-1*test$lg_p)))
> summary(test$p)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.002186 0.067500 0.177400 0.214300 0.325300 0.875600
> test$out<-1
> test[test$p<0.2,]$out<-0
> table(test$bad_ind,test$out)
0 1
0 1442 809
1 130 497
# 计算验证集准确率为67.76%,测试集的准确率为67.37%
> rate<-sum(train$out==train$bad_ind)/length(train$bad_ind)
> print(rate)
[1] 0.6776362
>
> rate2<-sum(test$out==test$bad_ind)/length(test$bad_ind)
> print(rate2)
[1] 0.6737318
>
> #检验,ROC曲线
>
> library(ROCR)
> pred_Te <- prediction(test$p, test$bad_ind)
> perf_Te <- performance(pred_Te,"tpr","fpr")
> pred_Tr <- prediction(train$p, train$bad_ind)
> perf_Tr <- performance(pred_Tr,"tpr","fpr")
> plot(perf_Te, col='blue',lty=1);
> plot(perf_Tr, col='black',lty=2,add=TRUE);
> abline(0,1,lty=2,col='red')
>
> lr_m_auc<-round(as.numeric(performance(pred_Tr,'auc')@y.values),3)
> lr_m_str<-paste("Mode_Train-AUC:",lr_m_auc,sep="")
> legend(0.3,0.4,c(lr_m_str),2:8)
>
> lr_m_auc<-round(as.numeric(performance(pred_Te,'auc')@y.values),3)
> lr_m_ste<-paste("Mode_Test-AUC:",lr_m_auc,sep="")
> legend(0.3,0.2,c(lr_m_ste),2:8)
参考资料:CDA《信用风险建模》微专业