注,有任何建议或疑问,请加 QQ: 1595218767 ,共同探讨学习
如R/python代码编程作图等方面需要帮忙,欢迎来店咨询 之恒科技, 挥动热情的小爪爪期待你哦
前几天网友遇到一简单的freq统计,由于元数据有 18000000 行,运行半天没出结果,apply/分布式 ...方法尝试了多种, 遂推荐data.table给他, 效果豁然
m1 <- data.table::data.table(m2=sample(1:10000,18000000,replace = TRUE))
system.time(m1[,.N,m2])
用户 系统 流逝
1.60 0.03 1.64
system.time(table(m1$m2))
用户 系统 流逝
14.63 0.25 14.94
A. data.table简介
1. 基本架构
Note: what to do means select/summarise/update/mutate...
2. 特殊符号定义函数
Functions | What for |
---|---|
.N | n(),nrow(dt),length(dt$col) |
.SD | all cols except ones for by/keyby |
.SDcols | cols for .SD |
.I | rowIndex; seq_len(nrow(dt)) |
.GRP | groupid of each group; seq(1,length(distinct(group))) |
.BY | merge_by, 用法见 数据集合并 |
B. 使用笔记
1. 数据读取和存取
1.1 读取 (data)
1.1.1 fread
dt4 <- fread('F:/360Downloads/aa.csv',header = TRUE)
str(dt4)
Classes ‘data.table’ and 'data.frame': 26 obs. of 3 variables:
$ x: int 1 1 2 2 3 3 4 4 5 5 ...
$ y: chr "IR_consensus_ALT" "self_consensus_ALT" "IR_consensus_ALT" "self_consensus_ALT" ...
$ z: int 20 29 26 14 10 7 399 346 439 28 ...
- attr(*, ".internal.selfref")=<externalptr>
1.1.2 其他类型转化
# --- from data.frame to data.table
dt2 <- data.table(iris)
str(dt2)
Classes ‘data.table’ and 'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, ".internal.selfref")=<externalptr>
# --- from matrix to data.table
(mx1 <- matrix(1:12,ncol=4))
[,1] [,2] [,3] [,4]
[1,] 1 4 7 10
[2,] 2 5 8 11
[3,] 3 6 9 12
> (data.table(mx1 <- matrix(1:12,ncol=4)))
V1 V2 V3 V4
1: 1 4 7 10
2: 2 5 8 11
3: 3 6 9 12
# --- from list to data.table (最底层list的规格必须一致)
ls1 <- list(a1=list(a11=c(1:3),a12=letters[4:6]),b1=list(b11=rep('nokia',3),b12=rep('good',3)))
str(ls1)
List of 2
$ a1:List of 2
..$ a11: int [1:3] 1 2 3
..$ a12: chr [1:3] "d" "e" "f"
$ b1:List of 2
..$ b11: chr [1:3] "nokia" "nokia" "nokia"
..$ b12: chr [1:3] "good" "good" "good"
(ls2 <- lapply(ls1,function(x)do.call(cbind,x)))
$a1
a11 a12
[1,] "1" "d"
[2,] "2" "e"
[3,] "3" "f"
$b1
b11 b12
[1,] "nokia" "good"
[2,] "nokia" "good"
[3,] "nokia" "good"
do.call(cbind,ls2)
a11 a12 b11 b12
[1,] "1" "d" "nokia" "good"
[2,] "2" "e" "nokia" "good"
[3,] "3" "f" "nokia" "good"
1.1.3 直接生成 data.table
dt3 <- data.table(a1=c(1:6),a2=rep(1:3,2))
str(dt3)
Classes ‘data.table’ and 'data.frame': 6 obs. of 2 variables:
$ a1: int 1 2 3 4 5 6
$ a2: int 1 2 3 1 2 3
- attr(*, ".internal.selfref")=<externalptr>
1.2 存取 (save)
fwrite(x, file = "",na = "", dec = ".", row.names = FALSE, col.names = TRUE)
fwrite(dt1,'./dt1.csv',row.names=FALSE)
与 write.csv 等函数相较, fwrite 缺少 fileEncoding 参数, 参数(fileEncoding='UTF-8')会对windows 平台对有很大帮助,但该包的maintainer截止现在还没有将其加进维护list的打算...如果你也需要这个参数,请 需求+1 in fwrite.utf-8
2. 数据排序 (arrange)
dt3 <- data.table(a1=c(1:6),a2=rep(1:3,2))
# --- order
dt3[order(a2,-a1)]
a1 a2
1: 4 1
2: 1 1
3: 5 2
4: 2 2
5: 6 3
6: 3 3
dt3[,.SD,keyby=a2]
a2 a1
1: 1 1
2: 1 4
3: 2 2
4: 2 5
5: 3 3
6: 3 6
# --- setkey
setkey(dt3,a2)
dt3
a1 a2
1: 1 1
2: 4 1
3: 2 2
4: 5 2
5: 3 3
6: 6 3
3. 数据分组和统计 (group_by,summarise)
set.seed(123)
m1 <- data.table::data.table(m2=sample(1:10000,18000000,replace = TRUE))
m3 <- m1[,.N,m2]
head(m3)
m2 N
1: 220 846
2: 6271 816
3: 8731 797
4: 5147 813
5: 6910 801
6: 6690 775
m3 <- m1[,.N,keyby=m2]
head(m3)
m2 N
1: 1 782
2: 2 785
3: 3 834
4: 4 769
5: 5 808
6: 6 785
m4 <- m1[order(m2),.(cnt1=.N,mean1=mean(m2)),m2]
head(m4)
m2 cnt1 mean1
1: 1 782 1
2: 2 785 2
3: 3 834 3
4: 4 769 4
5: 5 808 5
6: 6 785 6
dim(m4)
[1] 10000 3
dim(m3)
[1] 10000 2
dim(m1)
[1] 18000000 2
# --- special use of .N
m1[.N] # last row
m2 cnt1
1: 10000 851
# --- .SD
dt4 <- data.table(a1=1:10,a2=rep(c(1,2),each=5),a3=rep(c(3,4),5),a4=runif(10,100,1000))
a1 a2 a3 a4
1: 1 1 3 804.9962
2: 2 1 4 403.3980
3: 3 1 3 212.7495
4: 4 1 4 872.7093
5: 5 1 3 744.6552
6: 6 2 4 760.6584
7: 7 2 3 227.0887
8: 8 2 4 643.6969
9: 9 2 3 489.9786
10: 10 2 4 458.0811
dt4[,lapply(.SD,mean),by=.(a2,a3)] # apply mean to all cols
a2 a3 a1 a4
1: 1 3 3 587.4670
2: 1 4 3 638.0536
3: 2 4 8 620.8121
4: 2 3 8 358.5337
dt4[,lapply(.SD,mean),by=.(a2,a3),.SDcols=c('a4')] # apply mean to all cols of .SDcols
a2 a3 a4
1: 1 3 587.4670
2: 1 4 638.0536
3: 2 4 620.8121
4: 2 3 358.5337
dt4[,lapply(.(a1,a4),mean),by=.(a2,a3)]
a2 a3 V1 V2
1: 1 3 3 587.4670
2: 1 4 3 638.0536
3: 2 4 8 620.8121
4: 2 3 8 358.5337
# --- c() and .() in what-to-do
dt4[,c(.N,lapply(.(a1,a4),mean)),by=.(a2,a3)]
a2 a3 V1 V2 V3
1: 1 3 3 3 587.4670
2: 1 4 2 3 638.0536
3: 2 4 3 8 620.8121
4: 2 3 2 8 358.5337
dt4[,.(.N,lapply(.(a1,a4),mean)),by=.(a2,a3)]
a2 a3 N V2
1: 1 3 3 3
2: 1 3 3 587.467
3: 1 4 2 3
4: 1 4 2 638.0536
5: 2 4 3 8
6: 2 4 3 620.8121
7: 2 3 2 8
8: 2 3 2 358.5337
# --- topN in each group
dt4[,head(.SD,2),by=.(a2)] # top2 in each group
a2 a1 a3 a4
1: 1 1 3 804.9962
2: 1 2 4 403.3980
3: 2 6 4 760.6584
4: 2 7 3 227.0887
dt4[,.SD[1:2],by=.(a2)] # top2 in each group
a2 a1 a3 a4
1: 1 1 3 804.9962
2: 1 2 4 403.3980
3: 2 6 4 760.6584
4: 2 7 3 227.0887
dt4[order(a3),head(.SD,2),by=.(a2)] # top2 in each group ordered by cols
a2 a1 a3 a4
1: 1 1 3 804.9962
2: 1 3 3 212.7495
3: 2 7 3 227.0887
4: 2 9 3 489.9786
# --- .I
dt4[, .(indx1=.I[which.max(a4)],val1=max(a4)), by=a2] # max(a4) index and value
a2 indx1 val1
1: 1 4 872.7093
2: 2 6 760.6584
dt4[,.I[1:2],keyby=.(a2)] # row num of top2 in each group
a2 V1
1: 1 1
2: 1 2
3: 2 6
4: 2 7
# --- .GRP
dt4[,.(ids1=.GRP),by=a3] # Groupid of group(seq(1,n_distinct(group)))
a3 ids1
1: 3 1
2: 4 2
4. 数据筛选
4.1 行筛选 (filter)
iris3 <- data.table(iris)
iris3[1:3,]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 5.1 3.5 1.4 0.2 setosa
2: 4.9 3.0 1.4 0.2 setosa
3: 4.7 3.2 1.3 0.2 setosa
iris3[1:3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 5.1 3.5 1.4 0.2 setosa
2: 4.9 3.0 1.4 0.2 setosa
3: 4.7 3.2 1.3 0.2 setosa
iris3[c(1,3)]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 5.1 3.5 1.4 0.2 setosa
2: 4.7 3.2 1.3 0.2 setosa
iris3[Sepal.Length>=7&Petal.Width>=2.3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1: 7.2 3.6 6.1 2.5 virginica
2: 7.7 2.6 6.9 2.3 virginica
3: 7.7 3.0 6.1 2.3 virginica
iris3[Sepal.Length>=6&Petal.Width>=2][,c(2,4,5),with=FALSE][1:3]
Sepal.Width Petal.Width Species
1: 3.3 2.5 virginica
2: 3.0 2.1 virginica
3: 3.0 2.2 virginica
iris3[Sepal.Length>=6][Petal.Width>=2][,c(2,4,5),with=FALSE][1:3]
Sepal.Width Petal.Width Species
1: 3.3 2.5 virginica
2: 3.0 2.1 virginica
3: 3.0 2.2 virginica
4.2 列筛选 (select)
iris3[1:3,.(Sepal.Length,Petal.Width,Species)]
Sepal.Length Petal.Width Species
1: 5.1 0.2 setosa
2: 4.9 0.2 setosa
3: 4.7 0.2 setosa
iris3[1:3,1:4]
Sepal.Length Sepal.Width Petal.Length Petal.Width
1: 5.1 3.5 1.4 0.2
2: 4.9 3.0 1.4 0.2
3: 4.7 3.2 1.3 0.2
iris3[1:3,c(1,3,4),with=FALSE]
Sepal.Length Petal.Length Petal.Width
1: 5.1 1.4 0.2
2: 4.9 1.4 0.2
3: 4.7 1.3 0.2
iris3[1:3,c(-5),with=FALSE]
Sepal.Length Sepal.Width Petal.Length Petal.Width
1: 5.1 3.5 1.4 0.2
2: 4.9 3.0 1.4 0.2
3: 4.7 3.2 1.3 0.2
iris3[c(1,3,5),-('Species'),with=FALSE]
Sepal.Length Sepal.Width Petal.Length Petal.Width
1: 5.1 3.5 1.4 0.2
2: 4.7 3.2 1.3 0.2
3: 5.0 3.6 1.4 0.2
5. 生成新变量 或者 变量值修改
> (dt3 <- data.table(a1=c(1:6),a2=rep(1:3,2),a3=c(letters[c(1,2,2,3,5)],NA)))
a1 a2 a3
1: 1 1 a
2: 2 2 b
3: 3 3 b
4: 4 1 c
5: 5 2 e
6: 6 3 NA
> dt3[,a4:=a1+a2]
> dt3
a1 a2 a3 a4
1: 1 1 a 2
2: 2 2 b 4
3: 3 3 b 6
4: 4 1 c 5
5: 5 2 e 7
6: 6 3 NA 9
> dt3[a2>2,a4:=NA]
> dt3
a1 a2 a3 a4
1: 1 1 a 2
2: 2 2 b 4
3: 3 3 b NA
4: 4 1 c 5
5: 5 2 e 7
6: 6 3 NA NA
6. 数据拆分 (split)
dt4 <- data.table(a1=1:10,a2=rep(c(1,2),each=5),a3=rep(c(3,4),5),a4=runif(10,100,1000))
> spl1 <- split(dt4,by=c('a2','a3'))
> spl2 <- split(dt4,by=c('a2','a3'),flatten = FALSE)
> str(spl1)
List of 4
$ 1.3:Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
..$ a1: int [1:3] 1 3 5
..$ a2: num [1:3] 1 1 1
..$ a3: num [1:3] 3 3 3
..$ a4: num [1:3] 339 141 819
..- attr(*, ".internal.selfref")=<externalptr>
$ 1.4:Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
..$ a1: int [1:2] 2 4
..$ a2: num [1:2] 1 1
..$ a3: num [1:2] 4 4
..$ a4: num [1:2] 872 498
..- attr(*, ".internal.selfref")=<externalptr>
$ 2.4:Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
..$ a1: int [1:3] 6 8 10
..$ a2: num [1:3] 2 2 2
..$ a3: num [1:3] 4 4 4
..$ a4: num [1:3] 210 286 778
..- attr(*, ".internal.selfref")=<externalptr>
$ 2.3:Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
..$ a1: int [1:2] 7 9
..$ a2: num [1:2] 2 2
..$ a3: num [1:2] 3 3
..$ a4: num [1:2] 605 215
..- attr(*, ".internal.selfref")=<externalptr>
> str(spl2)
List of 2
$ 1:List of 2
..$ 3:Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
.. ..$ a1: int [1:3] 1 3 5
.. ..$ a2: num [1:3] 1 1 1
.. ..$ a3: num [1:3] 3 3 3
.. ..$ a4: num [1:3] 339 141 819
.. ..- attr(*, ".internal.selfref")=<externalptr>
..$ 4:Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
.. ..$ a1: int [1:2] 2 4
.. ..$ a2: num [1:2] 1 1
.. ..$ a3: num [1:2] 4 4
.. ..$ a4: num [1:2] 872 498
.. ..- attr(*, ".internal.selfref")=<externalptr>
$ 2:List of 2
..$ 4:Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
.. ..$ a1: int [1:3] 6 8 10
.. ..$ a2: num [1:3] 2 2 2
.. ..$ a3: num [1:3] 4 4 4
.. ..$ a4: num [1:3] 210 286 778
.. ..- attr(*, ".internal.selfref")=<externalptr>
..$ 3:Classes ‘data.table’ and 'data.frame': 2 obs. of 4 variables:
.. ..$ a1: int [1:2] 7 9
.. ..$ a2: num [1:2] 2 2
.. ..$ a3: num [1:2] 3 3
.. ..$ a4: num [1:2] 605 215
.. ..- attr(*, ".internal.selfref")=<externalptr>
> spl1$`1.3`
a1 a2 a3 a4
1: 1 1 3 339.3754
2: 3 1 3 141.2481
3: 5 1 3 819.0324
> spl2$`1`
$`3`
a1 a2 a3 a4
1: 1 1 3 339.3754
2: 3 1 3 141.2481
3: 5 1 3 819.0324
$`4`
a1 a2 a3 a4
1: 2 1 4 872.0449
2: 4 1 4 497.9801
> spl2$`1`$`3`
a1 a2 a3 a4
1: 1 1 3 339.3754
2: 3 1 3 141.2481
3: 5 1 3 819.0324
# --- 利用split分组实现组内标准化
library(openxlsx)
library(data.table)
call1<-read.xlsx("./360Downloads/data.xlsx",sheet=1)
data1 <- data.table(call1)[,lapply(.SD,sum),keyby=.(wangdian,weekon),.SDcols=names(call)[4:17]]
head(data1[,1:9],10)
wangdian weekon value1 value2 value3 value4 value5 value6 value7
1: 010ADA 1 0 0 2 2 0 0 0
2: 010ADA 2 0 0 6 3 2 0 1
3: 010ADA 3 0 0 1 0 1 0 0
4: 010ADA 4 0 0 6 3 3 0 0
5: 010ADA 5 0 0 8 3 5 0 0
6: 010AFA 1 0 0 9 2 7 0 0
7: 010AFA 2 0 0 21 2 19 0 0
8: 010AFA 3 0 0 21 4 17 0 0
9: 010AFA 4 0 0 31 5 26 0 0
10: 010AFA 5 0 0 37 9 28 0 0
spl1 <- split(data1,by=c('wangdian'))
res1 <- cbind(data1[,1:2],do.call(rbind,lapply(spl1,function(x) do.call(cbind,lapply(x[,3:16],scale)))))
> head(res1[,1:9],10)
wangdian weekon V1 V2 V3 V4 V5 V6 V7
1: 010ADA 1 NaN NaN -0.8764598 -0.1533930 -1.1437255 NaN -0.4472136
2: 010ADA 2 NaN NaN 0.4719399 0.6135720 -0.1039750 NaN 1.7888544
3: 010ADA 3 NaN NaN -1.2135598 -1.6873230 -0.6238503 NaN -0.4472136
4: 010ADA 4 NaN NaN 0.4719399 0.6135720 0.4159002 NaN -0.4472136
5: 010ADA 5 NaN NaN 1.1461398 0.6135720 1.4556507 NaN -0.4472136
6: 010AFA 1 NaN NaN -1.3789086 -0.8330522 -1.4895499 NaN NaN
7: 010AFA 2 NaN NaN -0.2608746 -0.8330522 -0.0480500 NaN NaN
8: 010AFA 3 NaN NaN -0.2608746 -0.1388420 -0.2883000 NaN NaN
9: 010AFA 4 NaN NaN 0.6708204 0.2082630 0.7928250 NaN NaN
10: 010AFA 5 NaN NaN 1.2298374 1.5966833 1.0330750 NaN NaN
7. 数据集合并 (join)
> dt5 <- data.table(a1=1:5,a2=letters[c(1,2,3,5,6)])
> dt6 <- data.table(a3=6:10,a4=letters[c(2,3,3,6,7)])
> dt5
a1 a2
1: 1 a
2: 2 b
3: 3 c
4: 4 e
5: 5 f
> dt6
a3 a4
1: 6 b
2: 7 c
3: 8 c
4: 9 f
5: 10 g
> merge(dt5,dt6,by.x='a2',by.y='a4') # inner jion
a2 a1 a3
1: b 2 6
2: c 3 7
3: c 3 8
4: f 5 9
> merge(dt5,dt6,by.x='a2',by.y='a4',all = TRUE) # full join
a2 a1 a3
1: a 1 NA
2: b 2 6
3: c 3 7
4: c 3 8
5: e 4 NA
6: f 5 9
7: g NA 10
> merge(dt5,dt6,by.x='a2',by.y='a4',all.x = TRUE) # left join
a2 a1 a3
1: a 1 NA
2: b 2 6
3: c 3 7
4: c 3 8
5: e 4 NA
6: f 5 9
> merge(dt5,dt6,by.x='a2',by.y='a4',all.y = TRUE) # right join
a2 a1 a3
1: b 2 6
2: c 3 7
3: c 3 8
4: f 5 9
5: g NA 10
> dt5[dt5$a2 %in% dt6$a4] # semi join
a1 a2
1: 2 b
2: 3 c
3: 5 f
> dt5[!(dt5$a2 %in% dt6$a4)] # anti join
a1 a2
1: 1 a
2: 4 e
# --- setkey
> setkey(dt5,a2)
> setkey(dt6,a4)
> tables()
> dt5[dt6,on=c('a2'='a4')] # right join
a1 a2 a3
1: 2 b 6
2: 3 c 7
3: 3 c 8
4: 5 f 9
5: NA g 10
> dt6[dt5,on=c('a4'='a2')] # left join
a3 a4 a1
1: NA a 1
2: 6 b 2
3: 7 c 3
4: 8 c 3
5: NA e 4
6: 9 f 5
8. 数据变形(melt/dcast)
> (dt3 <- data.table(a1=c(1:6),a2=rep(1:3,2),a3=c(letters[c(1,2,2,3,5)],NA)))
a1 a2 a3
1: 1 1 a
2: 2 2 b
3: 3 3 b
4: 4 1 c
5: 5 2 e
6: 6 3 NA
> melt(dt3,id.vars = c('a1'),measure.vars = c('a2'))
a1 variable value
1: 1 a2 1
2: 2 a2 2
3: 3 a2 3
4: 4 a2 1
5: 5 a2 2
6: 6 a2 3
> (dt4 <- melt(dt3,id.vars = c('a1'),measure.vars = c('a2','a3'),variable.name = 'meltVar',value.name = 'value1'))
a1 meltVar value1
1: 1 a2 1
2: 2 a2 2
3: 3 a2 3
4: 4 a2 1
5: 5 a2 2
6: 6 a2 3
7: 1 a3 a
8: 2 a3 b
9: 3 a3 b
10: 4 a3 c
11: 5 a3 e
12: 6 a3 NA
Warning message:
In melt.data.table(dt3, id.vars = c("a1"), measure.vars = c("a2", :
'measure.vars' [a2, a3] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'character'. All measure variables not of type 'character' will be coerced to. Check DETAILS in ?melt.data.table for more on coercion.
> dt4[,lapply(.SD,class)]
a1 variable value
1: integer factor character
# --- dcast
> dcast(dt4,a1~meltVar,value.var = 'value1')
a1 a2 a3
1: 1 1 a
2: 2 2 b
3: 3 3 b
4: 4 1 c
5: 5 2 e
6: 6 3 NA
> d1 <- data.table(a1=rep(1:2,each=3),a2=rep(1:2,3),a3=1:6)
> dcast(d1,a1~a2,value.var = 'a3',fun.aggregate = sum)
a1 1 2
1: 1 4 2
2: 2 5 10
> dcast(d1,a1~a2,value.var = 'a3')
Aggregate function missing, defaulting to 'length'
a1 1 2
1: 1 2 1
2: 2 1 2
9. 列名称修改
dt3 <- data.table(a1=c(1:6),a2=rep(1:3,2))
> names(dt3)
[1] "a1" "a2"
> setnames(dt3,c('a1','a2'),c('a3','a4'))
> names(dt3)
[1] "a3" "a4"
> setnames(dt3,c(1,2),c('a5','a6'))
> names(dt3)
[1] "a5" "a6"
.SD, .SDcols 特辑
library(openxlsx)
library(data.table)
call<-read.xlsx("G:/rwork/wuliu/distribution1.xlsx",sheet=1)
# --- 按照wangdian, weekon分组对剩余变量求和
data1 <- data.table(call)[,lapply(.SD,sum),
keyby=.(wangdian,weekon),
.SDcols=names(call)[4:17]]
head(data1)
wangdian weekon linshengzhida kuashengjian G-G
1: 010ADA 1 0 2 2
2: 010ADA 2 0 6 3
3: 010ADA 3 0 1 0
4: 010ADA 4 0 6 3
5: 010ADA 5 0 8 3
6: 010AFA 1 0 9 2
# --- 按照 wangdian 分组将各week的变量值完成标准化处理(scale)
spl1 <- split(data1,by=c('wangdian'))
res1 <- do.call(rbind,lapply(spl1,function(x) do.call(cbind,lapply(x[,3:16],scale))))
res2 <- cbind(data1[,1:2],res1)
colnames(res2) <- colnames(data1)
head(res2)
wangdian weekon linshengzhida kuashengjian G-G
1: 010ADA 1 NaN -0.8764598 -0.1533930
2: 010ADA 2 NaN 0.4719399 0.6135720
3: 010ADA 3 NaN -1.2135598 -1.6873230
4: 010ADA 4 NaN 0.4719399 0.6135720
5: 010ADA 5 NaN 1.1461398 0.6135720
6: 010AFA 1 NaN -1.3789086 -0.8330522
# --- NA 替换成 0
res3 <- res2[,c(.(wangdian=wangdian,weekon=weekon),
lapply(.SD,function(x) ifelse(is.na(x),0,x))),
.SDcols=colnames(res2)[-c(1:2)]]
head(res3)
wangdian weekon linshengzhida kuashengjian G-G
1: 010ADA 1 0 -0.8764598 -0.1533930
2: 010ADA 2 0 0.4719399 0.6135720
3: 010ADA 3 0 -1.2135598 -1.6873230
4: 010ADA 4 0 0.4719399 0.6135720
5: 010ADA 5 0 1.1461398 0.6135720
6: 010AFA 1 0 -1.3789086 -0.8330522