data(iris)
set.seed(1234)
library(missForest)
iris.miss <- prodNA(iris)
summary(iris.miss)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :41
# 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:45
# Median :5.700 Median :3.000 Median :4.400 Median :1.300 virginica :45
# Mean :5.787 Mean :3.059 Mean :3.822 Mean :1.182 NA's :19
# 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
# Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# NA's :12 NA's :16 NA's :12 NA's :16
#查阅缺少值在40%以上(不含)数据所在行的行号
library(DMwR)
manyNAs(iris.miss,0.4)
#缺失值统计1
library(mice)
par(mar=c(0,0,0,0))
md.pattern(iris.miss,rotate.names=T)
#缺失值统计2
library(VIM)
aggr(iris.miss,prop=F,numbers=T,cex.axis=0.8)
处理方法:常规方法
#删除缺少值所在行
iris.sub <- na.omit(iris.miss)
iris.sub <- iris.miss[complete.cases(iris.miss),]
nrow(iris.sub)
#平均值填补
iris1 <- iris.miss
library(Hmisc)
iris1$Sepal.Length <- impute(iris.miss$Sepal.Length,mean)
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Imputed Values:
#
# [1] 0.25803403 0.15739130 0.01915991 0.03550725 0.06661992 0.08143547 0.18493570 0.20726623 0.24844720 0.21797885
# [11] 0.24844720 0.13627515
#
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01241 0.00000 0.25803
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.08269565 0.11525709 0.33949534 0.01240999
#中位数填补
iris1$Sepal.Length <- impute(iris.miss$Sepal.Length,median)
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Imputed Values:
#
# [1] 0.23913043 0.14000000 0.03389831 0.05000000 0.08064516 0.09523810 0.19718310 0.21917808 0.25974026 0.22972973
# [11] 0.25974026 0.14925373
#
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01302 0.00000 0.25974
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.08733333 0.12566667 0.35449495 0.01302491
#使用缺失值前(后)的数据进行填补
library(zoo)
iris1$Sepal.Length <- na.locf(iris.miss$Sepal.Length,fromLast = T)#fromLast = F
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.007691 0.000000 0.181818
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.051333333 0.047800000 0.218632111 0.007691407
#众数填补(有多个,也只取第一个)
zs <- function(x){ return(as.numeric(names(sort(table(x),decreasing = T)[1])))}
iris1$Sepal.Length <- impute(iris.miss$Sepal.Length,zs)
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Imputed Values:
#
# [1] 0.08695652 0.00000000 0.15254237 0.16666667 0.19354839 0.20634921 0.29577465 0.31506849 0.35064935 0.32432432
# [11] 0.35064935 0.25373134
#
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01798 0.00000 0.35065
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.12466667 0.25353333 0.50352094 0.01797507
#随机填补
set.seed(1234)
iris1$Sepal.Length <- impute(iris.miss$Sepal.Length,"random")
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Imputed Values:
#
# [1] 0.13043478 0.20000000 0.13559322 0.28333333 0.00000000 0.07936508 0.18309859 0.10958904 0.15584416 0.32432432
# [11] 0.18181818 0.13432836
#
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01278 0.00000 0.32432
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.08400000 0.11626667 0.34097898 0.01278486
由mape(Mean absolute percentage error,平均绝对百分比误差)可知,以上的效果都不咋的,随机填补的效果竟然处于第一等(和平均值一样),其他的都要更差劲
其他方法
#基于数据的中心趋势(差劲)
iris1 <- centralImputation(iris.miss)
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01302 0.00000 0.25974
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.08733333 0.12566667 0.35449495 0.01302491
#KNN填补
library(DMwR)
iris1 <- knnImputation(iris.miss,k=5,scale = T,meth = "weighAvg")
summary(abs(iris$Sepal.Length-iris1$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.004713 0.000000 0.135677
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.032537344 0.022380002 0.149599473 0.004713242
#基于热卡(hot-deck)插补法
library(hot.deck)
iris1 <- hot.deck(iris.miss)
summary(abs(iris$Sepal.Length-iris1$data[[1]][,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.006674 0.000000 0.168831
regr.eval(iris$Sepal.Length,iris1$data[[1]][,1])
# mae mse rmse mape
# 0.043333333 0.032466667 0.180185090 0.006674233
#基于K-means聚类
library(ClustImpute)
res <- ClustImpute(iris.miss[,1:4],nr_cluster=3,seed_nr = 1234)
summary(abs(iris$Sepal.Length-res$complete_data[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01298 0.00000 0.41558
regr.eval(iris$Sepal.Length,res$complete_data[,1])
# mae mse rmse mape
# 0.09000000 0.16473333 0.40587354 0.01298464
#随机森林填补
library(missForest)
set.seed(1234)
iris1<- missForest(iris.miss,ntree = 100)
summary(abs(iris$Sepal.Length-iris1$ximp$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.003736 0.000000 0.098378
regr.eval(iris$Sepal.Length,iris1$ximp$Sepal.Length)
# mae mse rmse mape
# 0.024307178 0.010210473 0.101046885 0.003736354
#多重插补
library(mice)
imputed.data <- mice(iris.miss,seed = 1234)
summary(imputed.data)
# imputed.data$imp$Sepal.Length#每个缺失值有五组插补值
iris1<- complete(imputed.data)
summary(abs(iris$Sepal.Length-iris3$ximp$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.003736 0.000000 0.098378
regr.eval(iris$Sepal.Length,iris1$Sepal.Length)
# mae mse rmse mape
# 0.034000000 0.022600000 0.150332964 0.005511659
#基于逐步线性回归
library(imputeR)
impdata <- impute(iris.miss[1:4], lmFun = "stepBothR")
summary(abs(iris$Sepal.Length-impdata$imp[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.004157 0.000000 0.113774
regr.eval(iris$Sepal.Length,impdata$imp[,1])
# mae mse rmse mape
# 0.028266645 0.014647495 0.121026838 0.004156997
#基于偏最小二乘法
impdata <- impute(iris.miss[1:4], lmFun = "plsR")
summary(abs(iris$Sepal.Length-impdata$imp[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.004602 0.000000 0.117457
regr.eval(iris$Sepal.Length,impdata$imp[,1])
# mae mse rmse mape
# 0.031493171 0.018252665 0.135102424 0.004601763
#基于lasso(ridge)回归
impdata <- impute(iris.miss[1:4], lmFun = "lassoR")#也可以选择ridgeR
summary(abs(iris$Sepal.Length-impdata$imp[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.004185 0.000000 0.113492
regr.eval(iris$Sepal.Length,impdata$imp[,1])
# mae mse rmse mape
# 0.028470418 0.014835973 0.121803010 0.004184943
#基于主成分回归
impdata <- impute(iris.miss[1:4], lmFun = "pcrR")
summary(abs(iris$Sepal.Length-impdata$imp[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.004751 0.000000 0.122191
regr.eval(iris$Sepal.Length,impdata$imp[,1])
# mae mse rmse mape
# 0.032481150 0.019359134 0.139137105 0.004751286
#另一种基于SVD的主成分分析
library(missMDA)
nb <- estim_ncpPCA( iris.miss[1:4],ncp.max = 5)
imputed <- imputePCA(iris.miss[1:4],ncp=2)
summary(abs(iris$Sepal.Length-imputed$completeObs[,1])/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.003319 0.000000 0.145998
regr.eval(iris$Sepal.Length,imputed$completeObs[,1])
# mae mse rmse mape
# 0.021932719 0.014370704 0.119877870 0.003319263
#基于混合数据的因子分析
res.impute <- imputeFAMD(iris.miss,ncp=3)
summary(abs(iris$Sepal.Length-res.impute$completeObs$Sepal.Length)/iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000000 0.000000 0.000000 0.003186 0.000000 0.133000
regr.eval(iris$Sepal.Length,res.impute$completeObs$Sepal.Length)
# mae mse rmse mape
# 0.021656096 0.015004907 0.122494517 0.003185862
其他方法中,除基于数据的中心趋势和K-means聚类外,其他填补的效果都还不错,尤其是随机森林和SVD主成分以及基于混合数据的因子分析的算法,错误率低,效果相当可以