mlr3 setup
pacman::p_load(mlr3,mlr3cluster,mlr3viz,GGally,mlr3tuning,mlr3learners,ggfortify,factoextra)
theme_set(theme_bw())
01 K-means clustering
References: "A brief look at the K-means clustering algorithm"; "[Machine Learning] K-means"; "Object-oriented R: the R6 class"
R6 classes are mutable: create an object with $new() (which calls initialize), copy it with $clone(), and deep-copy nested R6 fields with $clone(deep = TRUE).
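A tiny sketch of those R6 semantics, using a hypothetical Counter class purely for illustration:
library(R6)
Counter = R6Class("Counter", public = list(
  n = 0,
  add = function() self$n <- self$n + 1
))
a = Counter$new()  # initialize
b = a              # plain assignment: b references the same object
a2 = a$clone()     # clone: an independent copy (deep = TRUE also copies nested R6 fields)
a$add()
a$n; b$n; a2$n     # 1, 1, 0 -- b changed along with a, the clone did not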
lrnKmeans = lrn("clust.kmeans") # define a learner for K-means clustering
class(lrnKmeans)
names(lrnKmeans)
lrnKmeans$param_set # view the learner's parameters and their values (defaults until assigned)
lrnKmeans$param_set$values # view the currently assigned values
lrnKmeans$param_set$values = list(centers = 3, algorithm = 'MacQueen') # assign values
lrnKmeans$param_set # view the result after assignment
Exploring the GvHD data
data(GvHD, package = "mclust")
gvhdCtrlScale <- as.data.table(scale(GvHD.control)) # scale() standardizes each column (mean 0, unit variance) to remove the effect of differing scales
ggscatmat(gvhdCtrlScale)
ggscatmat() is a leaner, faster alternative to ggpairs(): it only accepts continuous variables and draws scatter plots in the lower triangle, correlation coefficients in the upper triangle, and density plots on the diagonal.
Usage: ggscatmat(data, columns = 1:ncol(data), color = NULL, alpha = 1, corMethod = "pearson")
color names the variable used for colouring; alpha sets the point transparency (default 1, opaque); corMethod chooses the correlation method, "pearson" by default, with "kendall" and "spearman" also available.
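For instance, a variant using the optional arguments (the values here are arbitrary illustrations):
ggscatmat(gvhdCtrlScale, alpha = 0.3, corMethod = "spearman")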
Alternatively, use ggpairs():
用法:ggpairs(gvhdCtrlScale,
upper = list(continuous = "density"),
lower = list(continuous = wrap("points", size = 0.4)),
diag = list(continuous = "densityDiag"))
See also: correlation-matrix plots with GGally and pairs()
step1: Create the task
Reference: R machine learning in practice, part 2
mlr3 defines Task objects to encapsulate machine-learning problems.
Depending on the goal, there are several kinds of Task: classification (TaskClassif), regression (TaskRegr), survival analysis (mlr3proba::TaskSurv), density estimation (mlr3proba::TaskDens), clustering (mlr3cluster::TaskClust), spatiotemporal analysis (subdivided into classification and regression: mlr3spatiotempcv::TaskClassifST and mlr3spatiotempcv::TaskRegrST), and ordinal regression (TaskOrdinal). The names in parentheses are the classes used to construct each kind of task.
taskC = TaskClust$new("gvhdCtrlScale", gvhdCtrlScale)
autoplot(taskC)
taskPos = TaskClust$new("gvhdCtrlPos", as.data.table(scale(GvHD.pos)))
A clustering task is created with the class$new(id, data) constructor.
> taskC
<TaskClust:gvhdCtrlScale> (6809 x 4)
* Target: -
* Properties: -
* Features (4):
- dbl (4): CD3, CD4, CD8, CD8b
> class(taskC) # check the object's class
[1] "TaskClust" "Task" "R6"
> names(taskC) # list the callable fields and methods
[1] ".__enclos_env__" "uris" "weights"
[4] "order" "groups" "strata"
[7] "data_formats" "feature_types" "ncol"
[10] "nrow" "col_roles" "row_roles"
[13] "properties" "target_names" "feature_names"
[16] "row_names" "row_ids" "hash"
[19] "extra_args" "man" "col_info"
[22] "backend" "task_type" "id"
[25] "clone" "initialize" "droplevels"
[28] "set_col_roles" "set_row_roles" "rename"
[31] "cbind" "rbind" "select"
[34] "filter" "missings" "levels"
[37] "head" "formula" "data"
[40] "print" "format" "help"
> taskC$backend # access the data
<DataBackendDataTable> (6809x5)
CD4 CD8b CD3 CD8 ..row_id
-0.4373389 0.8752503 -0.2166459 0.1832758 1
0.2595066 0.1289858 0.5752226 -0.3543201 2
-1.2735535 -1.4593936 -1.0738981 0.1139086 3
-1.7576778 -1.9934178 -0.1512623 -0.6491307 4
-1.6403143 -1.8017168 -1.1320169 -0.6057762 5
0.8609942 0.3686120 -0.1730568 -0.2502693 6
[...] (6803 rows omitted)
> taskC$backend$data(rows = 1:9, cols = "CD4") # select rows and columns by id/name
CD4
1: -0.4373389
2: 0.2595066
3: -1.2735535
4: -1.7576778
5: -1.6403143
6: 0.8609942
7: -1.1855309
8: -0.4300037
9: 1.1984142
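A few of the methods listed above in action, run on a clone so the original task stays untouched (a quick sketch):
tmp = taskC$clone()
tmp$select(c("CD4", "CD8"))  # keep only these feature columns (modifies tmp in place)
tmp$filter(1:100)            # keep only these rows (in place)
tmp$head(3)                  # first rows as a data.table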
step2: Choose a learner
The Learner objects in mlr3 wrap many of R's machine-learning algorithms. With the mlr3learners package loaded, mlr_learners holds 46 algorithms in total; the keys below are the clustering algorithms.
mlr_learners$keys()
> mlr_learners$keys("clust")
[1] "clust.agnes" "clust.ap"
[3] "clust.cmeans" "clust.cobweb"
[5] "clust.dbscan" "clust.diana"
[7] "clust.em" "clust.fanny"
[9] "clust.featureless" "clust.ff"
[11] "clust.kkmeans" "clust.kmeans"
[13] "clust.MBatchKMeans" "clust.meanshift"
[15] "clust.pam" "clust.SimpleKMeans"
[17] "clust.xmeans"
kmeans_lrn = lrn("clust.kmeans")
This picks the "clust.kmeans" learner from the built-in learner dictionary; equivalently, kmeans_lrn = LearnerClustKMeans$new()
> class(kmeans_lrn)
[1] "LearnerClustKMeans" "LearnerClust" "Learner"
[4] "R6"
> kmeans_lrn
<LearnerClustKMeans:clust.kmeans>
* Model: kmeans
* Parameters: centers=2
* Packages: stats, clue
* Predict Type: partition
* Feature types: logical, integer, numeric
* Properties: complete, exclusive, partitional
step3: Set the parameters
# view the algorithm's parameters
kmeans_lrn$param_set
kmeans_lrn$param_set$params
# set the algorithm's parameters
kmeans_lrn$param_set$values = list(centers = 3, iter.max = 100, nstart = 10)
centers sets the number of cluster centres; iter.max caps the number of iterations (default 10); nstart is the number of random starting sets (default 1); algorithm selects the variant (default Hartigan-Wong).
> kmeans_lrn$param_set
<ParamSet>
id class lower upper nlevels default value
1: centers ParamUty NA NA Inf 2 2
2: iter.max ParamInt 1 Inf Inf 10
3: algorithm ParamFct NA NA 4 Hartigan-Wong
4: nstart ParamInt 1 Inf Inf 1
5: trace ParamInt 0 Inf Inf 0
> kmeans_lrn$param_set$params
$centers
id class lower upper levels default
1: centers ParamUty NA NA 2
$iter.max
id class lower upper levels default
1: iter.max ParamInt 1 Inf 10
$algorithm
id class lower upper
1: algorithm ParamFct NA NA
levels default
1: Hartigan-Wong,Lloyd,Forgy,MacQueen Hartigan-Wong
$nstart
id class lower upper levels default
1: nstart ParamInt 1 Inf 1
$trace
id class lower upper levels default
1: trace ParamInt 0 Inf 0
> kmeans_lrn$param_set # after assignment, the value column changes
<ParamSet>
id class lower upper nlevels default value
1: centers ParamUty NA NA Inf 2 3
2: iter.max ParamInt 1 Inf Inf 10 100
3: algorithm ParamFct NA NA 4 Hartigan-Wong
4: nstart ParamInt 1 Inf Inf 1 10
5: trace ParamInt 0 Inf Inf 0
step4: Train the model
Reference: R machine learning in practice, part 3
Training happens on taskC; row ids may be passed as learner$train(task, row_ids = train). Training mutates the learner itself (R6 semantics).
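A quick check of that in-place behaviour: before $train() is called, the learner holds no model.
kmeans_lrn$model  # NULL until the learner is trained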
set.seed(123)
train_rows = sample(taskC$nrow, 0.8 * taskC$nrow) # split into training and test sets 80:20
test_rows = setdiff(seq_len(taskC$nrow), train_rows)
kmeans_lrn$train(taskC, row_ids = train_rows)
kmeans_lrn$state # $state holds the fitted model and training metadata
> kmeans_lrn$state
$model
K-means clustering with 3 clusters of sizes 1473, 527, 3447
Cluster means: # coordinates of the centroids
CD3 CD4 CD8 CD8b
1 -0.01198236 -1.2796089 0.5534520 -0.9119462
2 2.08600789 1.6351170 -0.3622835 -1.3521367
3 -0.31546641 0.2958634 -0.1735216 0.6112573
Clustering vector: # cluster index assigned to each row
[1] 2 1 3 1 2 3 2 1 3 1 2 3 3 3 1 3 1 1 3 3 3 1 3 3 3 3 3 3 3 3
[31] 3 3 1 3 3 1 3 3 1 1 1 3 3 1 3 3 3 1 3 3 3 2 1 1 3 1 1 3 3 2
[ ... entries 61-1000 omitted for brevity ... ]
[ reached getOption("max.print") -- omitted 4447 entries ]
Within cluster sum of squares by cluster:
[1] 5844.0477 812.3538 4307.0784
(between_SS / total_SS = 49.8 %) # between-SS over total-SS, a measure of how tightly the points cluster
Available components: # attributes of the fitted kmeans object
[1] "cluster" "centers" "totss" "withinss"
[5] "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
# cluster: each point's cluster assignment; centers: the centroid coordinates; totss: total sum of squares;
# withinss: within-cluster SS per cluster; tot.withinss: total within-cluster SS, sum(withinss);
# betweenss: between-cluster SS, totss - tot.withinss; size: the number of points in each cluster;
# iter: the number of iterations; ifault: an indicator of possible problems
$log
Empty data.table (0 rows and 3 cols): stage,class,msg
$train_time
[1] 0.06
$train_task
<TaskClust:gvhdCtrlScale> (6809 x 4)
* Target: -
* Properties: -
* Features (4):
- dbl (4): CD3, CD4, CD8, CD8b
> kmeans_lrn$assignments ## directly access the fitted assignments
[1] 2 1 3 1 2 3 2 1 3 1 2 3 3 3 1 3 1 1 3 3 3 1 3 3 3 3 3 3 3 3
[31] 3 3 1 3 3 1 3 3 1 1 1 3 3 1 3 3 3 1 3 3 3 2 1 1 3 1 1 3 3 2
[ ... entries 61-1000 omitted for brevity ... ]
[ reached getOption("max.print") -- omitted 4447 entries ]
Compare the clusterings produced by the three algorithms Lloyd, MacQueen, and Hartigan-Wong on the total within-cluster sum of squares and the between-cluster sum of squares: the within-cluster SS should be as small as possible and the between-cluster SS as large as possible.
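A minimal sketch of that comparison (looping over the three algorithms; exact numbers vary with the random seed):
wss = sapply(c("Hartigan-Wong", "Lloyd", "MacQueen"), function(alg) {
  l = lrn("clust.kmeans", centers = 3, iter.max = 100, algorithm = alg)
  l$train(taskC, row_ids = train_rows)
  c(tot.withinss = l$model$tot.withinss, betweenss = l$model$betweenss)
})
wss  # prefer small tot.withinss and large betweenss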
step5: Predict
kmeans_lrn1 = kmeans_lrn$clone() # $clone() copies kmeans_lrn, so mistakes cannot mutate the original
kmeans_lrn1$predict(taskC,row_ids = test_rows)
autoplot(kmeans_lrn1$predict(taskC,row_ids = test_rows),taskC, type = "scatter" )
kmeansPos = kmeans_lrn$predict(taskPos)
autoplot(kmeansPos, taskPos, type = "pca", frame = F) #需要加载ggfortify,对数据集先进行pca,选取前两个
autoplot(kmeansPos, taskPos, type = "sil", frame = T) #四类在sil指标下的值,越靠近1越好,越靠近-1越不好
> kmeans_lrn1$predict(taskC,row_ids = test_rows)
<PredictionClust> for 1362 observations:
row_id partition
3 1
4 1
7 1
---
6802 3
6804 3
6807 3
step6: Tuning setup
# the parameters to tune come from kmeans_lrn$param_set
search = ps( centers = p_int(3,8),
algorithm = p_fct(levels = c("Hartigan-Wong","Lloyd","MacQueen")))
# p_int() sets the range of the number of centres: 3 to 8
# build a tuning instance that searches this parameter space
instance = TuningInstanceSingleCrit$new(
task = taskC,
learner = lrn("clust.kmeans",iter.max=100),
resampling = rsmp("holdout"),
measure = msr("clust.db"),
search_space = search,
terminator = trm("none"))
tuner = tnr("grid_search", resolution = 6)
tuner$optimize(instance)
# resolution sets the number of grid points per parameter; 6 splits centers 3..8 into exactly 6 values
# for integer and factor parameters the grid is built from their levels anyway, so resolution mainly matters for double-valued parameters
# $optimize() mutates the instance in place; afterwards it holds the archive and the result
If the algorithm fails to converge, remember to raise the iter.max value in lrn().
[Key arguments of the tuning instance]
resampling: the resampling strategy; holdout is the common choice; list the available methods with mlr_resamplings$keys()
measure: scores cluster quality; clust.db is roughly within-cluster distance over between-cluster distance, so smaller is better, while for the other measures larger is better. The measures include:
clust.ch: the Calinski-Harabasz pseudo-F statistic, the ratio of between-cluster to within-cluster variance; a larger CH means tighter, better-separated clusters, i.e. a better clustering.
clust.db: the Davies-Bouldin index (DB, DBI); a smaller DB means smaller within-cluster and larger between-cluster distances. Weakness: it relies on Euclidean distance, so it scores ring-shaped clusters poorly.
clust.dunn: the Dunn validity index (DVI); a larger DVI means larger between-cluster and smaller within-cluster distances. Weaknesses: strongly affected by outliers, and poor on ring-shaped distributions.
clust.silhouette: the silhouette coefficient, a way of judging clustering quality first proposed by Peter J. Rousseeuw in 1986. It combines cohesion and separation, and can compare different algorithms, or different runs of one algorithm, on the same data. If most samples in the clusters have high silhouette values (close to 1), the data set's average silhouette is high and the clustering is appropriate; if many samples have low or even negative values (close to -1), the clustering is inappropriate and the hyperparameter K is probably set too large or too small.
search_space: the parameter space in which to tune.
terminator: the condition under which tuning stops.
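A quick sketch of scoring a single prediction with several of these measures (cluster measures need the task passed along; this assumes kmeans_lrn is still trained from step4):
pred = kmeans_lrn$predict(taskC, row_ids = test_rows)
pred$score(msrs(c("clust.db", "clust.dunn", "clust.ch", "clust.silhouette")), task = taskC)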
> search
<ParamSet>
id class lower upper nlevels default value
1: centers ParamInt 3 8 6 <NoDefault[3]>
2: algorithm ParamFct NA NA 3 <NoDefault[3]>
step7: Tuning results
autoplot(instance)
#autoplot(instance,type="parameter")
models = as.data.table(instance$archive$benchmark_result$score(msrs(c("clust.dunn","clust.ch","clust.silhouette"))))
modelmsrs = dplyr::left_join(as.data.table(instance$archive),models)
modelmsrs = tidyr::pivot_longer(modelmsrs[,c(1:3,18:20)], 3:6, names_to = "measures", values_to = "value")
ggplot(modelmsrs, aes(centers, value, color = algorithm)) +
geom_point(size = 1) + geom_line(size =1) +
facet_wrap(~measures, scales = "free_y")
# score the archived models with three further measures: clust.dunn, clust.ch, clust.silhouette
# then left-join those scores onto the tuning archive
# scales = "free_y" gives each facet its own y-axis scale
instance$result_learner_param_vals
# show the optimal values
> instance$archive # all 18 evaluated configurations; the one with the smallest clust.db is the best
<ArchiveTuning>
centers algorithm clust.db timestamp batch_nr
1: 6 MacQueen 1.04 2021-06-13 11:41:17 1
2: 7 MacQueen 1.19 2021-06-13 11:41:18 2
3: 5 Lloyd 1.05 2021-06-13 11:41:19 3
4: 4 MacQueen 0.72 2021-06-13 11:41:19 4
5: 4 Lloyd 0.72 2021-06-13 11:41:20 5
6: 5 Hartigan-Wong 1.08 2021-06-13 11:41:21 6
7: 3 MacQueen 1.22 2021-06-13 11:41:21 7
8: 3 Hartigan-Wong 0.92 2021-06-13 11:41:22 8
9: 8 Hartigan-Wong 1.26 2021-06-13 11:41:22 9
10: 8 MacQueen 1.15 2021-06-13 11:41:23 10
11: 6 Lloyd 1.15 2021-06-13 11:41:23 11
12: 4 Hartigan-Wong 0.72 2021-06-13 11:41:24 12
13: 7 Lloyd 1.09 2021-06-13 11:41:25 13
14: 8 Lloyd 1.16 2021-06-13 11:41:25 14
15: 6 Hartigan-Wong 1.15 2021-06-13 11:41:26 15
16: 7 Hartigan-Wong 1.31 2021-06-13 11:41:27 16
17: 5 MacQueen 1.09 2021-06-13 11:41:27 17
18: 3 Lloyd 1.00 2021-06-13 11:41:28 18
> instance$result_learner_param_vals
$centers
[1] 4
$iter.max
[1] 100
$algorithm
[1] "MacQueen"
step8: Train the final model and predict
kmeans_lrn$param_set$values = instance$result_learner_param_vals
kmeans_lrn$train(taskC,row_ids = train_rows)
kmeans_lrn$state
kmeansC = kmeans_lrn$predict(taskC, row_ids = test_rows)
kmeansPos = kmeans_lrn$predict(taskPos)
autoplot(kmeansPos, taskPos, type = "pca")
> kmeans_lrn$state
$model
K-means clustering with 4 clusters of sizes 339, 525, 1191, 3392
Cluster means:
CD3 CD4 CD8 CD8b
1 1.7150967 -0.9058623 2.84769794 0.6510135
2 2.0719198 1.6207968 -0.39184446 -1.3855953
3 -0.4307912 -1.2808764 0.00401891 -1.2344491
4 -0.3425236 0.2884365 -0.21764604 0.5979080
Clustering vector:
[1] 2 3 4 3 2 4 2 4 4 3 2 4 4 4 1 4 4 3 4 4 4 1 4 4 4 4 4 4 4 4
[31] 4 4 3 4 4 3 4 4 3 3 3 4 4 3 4 4 4 3 4 4 4 2 3 3 4 3 3 4 4 2
[ ... entries 61-1000 omitted for brevity ... ]
[ reached getOption("max.print") -- omitted 4447 entries ]
> kmeansC
<PredictionClust> for 1362 observations:
row_id partition
3 3
4 3
7 1
---
6802 4
6804 4
6807 4
02 Hierarchical clustering (agglomerative and divisive)
Building a hierarchical clustering model
pacman::p_load(factoextra,patchwork)
data(GvHD, package = "mclust")
gvhdCtrlScale <- as.data.table(scale(GvHD.control))
taskC = TaskClust$new("taskCtrl",backend = gvhdCtrlScale)
train = sample(taskC$nrow, 0.1 * taskC$nrow)
test = setdiff(seq_len(taskC$nrow), train)
hc_lrn = lrn("clust.agnes")
hc_lrn$param_set$params
hc_lrn$param_set$values = list(k = 4, metric = "euclidean", method = "average")
> hc_lrn$param_set$params
$metric
id class lower upper levels default
1: metric ParamFct NA NA euclidean,manhattan euclidean
$stand
id class lower upper levels default
1: stand ParamLgl NA NA TRUE,FALSE FALSE
$method
id class lower upper
1: method ParamFct NA NA
levels default
1: average,single,complete,ward,weighted,flexible,... average
$trace.lev
id class lower upper levels default
1: trace.lev ParamInt 0 Inf 0
$k
id class lower upper levels default
1: k ParamInt 1 Inf 2
$par.method
id class lower upper levels default
1: par.method ParamUty NA NA <NoDefault[3]>
method sets the linkage, i.e. the distance between clusters (compared in the sketch below):
"single": single linkage, the minimum distance between a point in one cluster and a point in the other
"complete": complete linkage, the maximum distance between a point in one cluster and a point in the other
"median": median linkage (Gower's method), based on the distance between the clusters' medians
"average": average linkage, the mean of the pairwise distances between observations in the two clusters
"centroid": centroid linkage, the distance between the two clusters' centroids
"mcquitty": McQuitty's similarity analysis (WPGMA)
"ward": Ward's minimum-variance method, based on ANOVA ideas: if the partition is reasonable, the within-cluster sum of squared deviations should be small and the between-cluster sum large. Variants: "ward.D", "ward.D2".
> hc_lrn # view the model after setting the parameters
<LearnerClustAgnes:clust.agnes>
* Model: agnes
* Parameters: k=4, metric=euclidean, method=average
* Packages: cluster
* Predict Type: partition
* Feature types: logical, integer, numeric
* Properties: complete, exclusive, hierarchical
Training and visualizing the results
hc_lrn$train(taskC, row_ids = train)
autoplot(hc_lrn)
autoplot(hc_lrn, k = 4, rect_fill = FALSE, rect = TRUE, rect_border = "red") + theme_bw()
trainHC = cbind(GvHD.control[train,],class = factor(hc_lrn$assignments))
# take all columns of the training rows and attach the fitted cluster labels
ggscatmat(trainHC, color = "class")
> head(trainHC)
CD4 CD8b CD3 CD8 class
926 474 79 501 217 1
2764 370 49 410 288 1
1470 380 398 161 2 2
2425 406 386 159 143 2
3116 352 384 168 138 2
433 344 325 41 71 2
diana
diana_lrn = lrn("clust.diana")
taskK = TaskClust$new("taskk",gvhdCtrlScale[sample(6809,900),])
diana_lrn$train(taskK)
autoplot(diana_lrn, k = 4, rect_fill = FALSE, rect = TRUE, rect_border = "red") + theme_bw()
> taskK
<TaskClust:taskk> (900 x 4)
* Target: -
* Properties: -
* Features (4):
- dbl (4): CD3, CD4, CD8, CD8b
Choosing K
search = ps(k = p_int(3,7))
taskK = TaskClust$new("taskK",gvhdCtrlScale[sample(6809,900),])
instance = TuningInstanceSingleCrit$new(
task = taskK,
learner = lrn("clust.agnes"),
resampling = rsmp("cv", folds = 2L),
measure = msr("clust.db"),
search_space = search,
terminator = trm("none")
)
tuner = tnr("grid_search", resolution = 5)
tuner$optimize(instance)
instance$result_learner_param_vals
autoplot(instance, type = "marginal")
> instance$result_learner_param_vals
$k
[1] 6
Simplified version (without R6, but harder to tune)
With the parameters already chosen, this gives results quickly.
Build the hierarchical clustering
gvhdDist <- dist(gvhdCtrlScale, method = "euclidean")
gvhdHclust <- hclust(gvhdDist, method = "ward.D2")
# draw the dendrogram
plot(as.dendrogram(gvhdHclust), leaflab = "none") # coerce the clustering result to a dendrogram
# pruning
# cutree() prunes the hclust() output down to a clustering with the specified number of classes
gvhdCut <- cutree(gvhdHclust, k = 4)
table(gvhdCut) # count the size of each cluster
# draw the dendrogram again
plot(as.dendrogram(gvhdHclust), leaflab = "none")
rect.hclust(gvhdHclust, k = 9)
# rect.hclust() draws rectangles around the dendrogram's branches, highlighting the corresponding clusters.
# It cuts the dendrogram so that exactly k clusters are produced, or cuts at height h (either k or h can be supplied).
# Combine the original GvHD.control data with each row's cluster label gvhdCut into one data frame,
# then draw a ggscatmat plot of gvhdTib, using the cluster as the colour
gvhdTib <- dplyr::mutate(GvHD.control, hCluster = as.factor(gvhdCut))
ggscatmat(gvhdTib, color = "hCluster")
1. dist() computes distances in R: dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2). x: a data matrix or data frame. method: the distance measure, one of "euclidean" (Euclidean), "maximum" (Chebyshev), "manhattan" (Manhattan, i.e. absolute-value/city-block distance), "canberra" (Canberra), "binary" (for binary variables), "minkowski" (Minkowski). diag, upper: control the layout of the printed distance matrix; by default only the lower triangle is shown. p: the power of the Minkowski distance.
Before computing distances, the data should generally be centred or standardized to remove the effect of differing units and magnitudes.
2. method: the between-cluster distance (linkage), with the same options described in the agnes section above ("single", "complete", "median", "average", "centroid", "mcquitty", and "ward" with the variants "ward.D" and "ward.D2"). members: NULL or a vector of length d, specifying how many sample points each initial group consists of.
3. as.dendrogram(object, hang = -1): object is anything coercible to a dendrogram; it coerces the hierarchical-clustering result into a dendrogram.
4. cutree() prunes the output of hclust() into a clustering with a specified number of classes: cutree(tree, k = NULL, h = NULL). tree: the result of hclust(); k: an integer or vector giving the desired number(s) of clusters; h: a numeric scalar or vector giving the height(s) at which to cut the tree.
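For example, cutting by height instead of by cluster count (h = 50 here is an arbitrary illustrative value):
gvhdCutH <- cutree(gvhdHclust, h = 50)  # one group per branch remaining below height 50
table(gvhdCutH)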
03 Density-based clustering
K-means and hierarchical clustering measure distances between rows, and between rows and centroids.
DBSCAN and OPTICS instead use the number of samples per unit of space, i.e. density.
DBSCAN
pacman::p_load(mlr3,mlr3cluster,mlr3tuning,mlr3viz,GGally,mclust,dbscan,dplyr) # load dbscan, the package that provides hullplot()
theme_set(theme_bw())
step1: Create the task
data(banknote, package = "mclust")
bn = as.data.table(banknote)[,-"Status"] # drop the class column, since this is clustering
bnScaled = as.data.table(scale(bn)) # standardize
task = TaskClust$new(id = "banknoteScaled", backend = bnScaled)
# train_set = sample(task$nrow, 0.8 * task$nrow)
# test_set = setdiff(seq_len(task$nrow), train_set)
class(task)
autoplot(task)
> class(task)
[1] "TaskClust" "Task" "R6"
step2: Create the learner and set hyperparameters
mlr_learners$keys("clust")
dbscan_lrn = lrn("clust.dbscan")
class(dbscan_lrn)
dbscan_lrn$param_set$params # inspect the parameter types
dbscan_lrn$param_set$values = list(eps = 1.5, minPts = 9) # set the hyperparameters
> mlr_learners$keys("clust")
[1] "clust.agnes" "clust.ap"
[3] "clust.cmeans" "clust.cobweb"
[5] "clust.dbscan" "clust.diana"
[7] "clust.em" "clust.fanny"
[9] "clust.featureless" "clust.ff"
[11] "clust.kkmeans" "clust.kmeans"
[13] "clust.MBatchKMeans" "clust.meanshift"
[15] "clust.pam" "clust.SimpleKMeans"
[17] "clust.xmeans"
> class(dbscan_lrn)
[1] "LearnerClustDBSCAN" "LearnerClust" "Learner"
[4] "R6"
> dbscan_lrn$param_set$params
$eps
id class lower upper levels default
1: eps ParamDbl 0 Inf <NoDefault[3]>
$minPts
id class lower upper levels default
1: minPts ParamInt 0 Inf 5
$borderPoints
id class lower upper levels default
1: borderPoints ParamLgl NA NA TRUE,FALSE TRUE
$weights
id class lower upper levels default
1: weights ParamUty NA NA <NoDefault[3]>
$search
id class lower upper levels default
1: search ParamFct NA NA kdtree,linear,dist kdtree
$bucketSize
id class lower upper levels default
1: bucketSize ParamInt 1 Inf 10
$splitRule
id class lower upper
1: splitRule ParamFct NA NA
levels default
1: STD,MIDPT,FAIR,SL_MIDPT,SL_FAIR,SUGGEST SUGGEST
$approx
id class lower upper levels default
1: approx ParamDbl -Inf Inf 0
> dbscan_lrn$param_set
<ParamSet>
id class lower upper nlevels default
1: approx ParamDbl -Inf Inf Inf 0
2: borderPoints ParamLgl NA NA 2 TRUE
3: bucketSize ParamInt 1 Inf Inf 10
4: eps ParamDbl 0 Inf Inf <NoDefault[3]>
5: minPts ParamInt 0 Inf Inf 5
6: search ParamFct NA NA 3 kdtree
7: splitRule ParamFct NA NA 6 SUGGEST
8: weights ParamUty NA NA Inf <NoDefault[3]>
parents value
1:
2:
3: search
4: 1.5
5: 9
6:
7: search
8:
step3: Train, predict, and visualize the results
dbscan_lrn$train(task)
dbscan_lrn$state
dbscan_lrn$assignments
dbPred = dbscan_lrn$predict(task)
bnD = cbind(banknote, assign = dbscan_lrn$assignments, pred = dbPred$partition)
# attach the training assignments and the predicted partitions as columns on the original data
dplyr::count(bnD, Status, pred) # count each Status within each predicted cluster
ggpairs(bnD,aes(col = factor(pred)),upper = list(continuous = "points"))
> dbscan_lrn$state
$model
DBSCAN clustering for 200 objects.
Parameters: eps = 1.5, minPts = 9
The clustering contains 2 cluster(s) and 19 noise points.
0 1 2
19 94 87
Available fields: cluster, eps, minPts, data
$log
Empty data.table (0 rows and 3 cols): stage,class,msg
$train_time
[1] 0.03
$train_task
<TaskClust:banknoteScaled> (200 x 6)
* Target: -
* Properties: -
* Features (6):
- dbl (6): Bottom, Diagonal, Left, Length, Right, Top
> dbscan_lrn$assignments
[1] 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[31] 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[61] 1 1 1 1 1 1 1 1 1 2 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[91] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 2 2
[121] 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[151] 2 2 0 2 2 2 0 2 2 0 0 2 2 2 2 2 0 0 2 2 0 2 2 2 2 2 2 2 2 0
[181] 2 0 2 2 2 2 0 2 2 0 2 2 2 2 2 2 2 2 2 2
> head(bnD)
Status Length Left Right Bottom Top Diagonal assign pred
1 genuine 214.8 131.0 131.1 9.0 9.7 141.0 0 0
2 genuine 214.6 129.7 129.7 8.1 9.5 141.7 1 1
3 genuine 214.8 129.7 129.7 8.7 9.6 142.2 1 1
4 genuine 214.8 129.7 129.6 7.5 10.4 142.0 1 1
5 genuine 215.0 129.6 129.7 10.4 7.7 141.8 0 0
6 genuine 215.7 130.8 130.5 9.0 10.1 141.4 0 0
> dplyr::count(bnD, Status, pred)
Status pred n
1 counterfeit 0 14
2 counterfeit 2 86
3 genuine 0 5
4 genuine 1 94
5 genuine 2 1
step4: Hyperparameter optimization
instance = TuningInstanceSingleCrit$new(
task = task,
learner = dbscan_lrn,
resampling = rsmp("holdout"),
measure = msr("clust.db"),
search_space = ps(eps = p_dbl(1.2, 1.9),
minPts = p_int(2,9)),
terminator = trm("none"))
tuner = tnr("grid_search", resolution = 8)
tuner$optimize(instance)
instance$archive
instance$result_learner_param_vals
autoplot(instance)
#autoplot(instance, type = "surface")
> instance$archive
<ArchiveTuning>
eps minPts clust.db timestamp batch_nr
1: 1.6 5 1.49 2021-06-13 23:38:58 1
2: 1.3 6 2.10 2021-06-13 23:39:00 2
3: 1.4 2 1.01 2021-06-13 23:39:02 3
4: 1.8 9 1.75 2021-06-13 23:39:04 4
5: 1.8 6 1.75 2021-06-13 23:39:06 5
6: 1.7 3 0.69 2021-06-13 23:39:07 6
7: 1.5 7 1.49 2021-06-13 23:39:08 7
8: 1.6 2 0.76 2021-06-13 23:39:08 8
9: 1.9 9 1.75 2021-06-13 23:39:09 9
10: 1.9 5 0.69 2021-06-13 23:39:10 10
......
eps minPts clust.db timestamp batch_nr
> instance$result_learner_param_vals
$eps
[1] 1.7
$minPts
[1] 3
step5: Comparing tuning results across measures
models = as.data.table(instance$archive$benchmark_result$score(msrs(c("clust.dunn","clust.ch","clust.silhouette"))))
modelmsrs = dplyr::left_join(as.data.table(instance$archive),models)
modelmsrs = tidyr::pivot_longer(modelmsrs[,c(1:3,18:20)], 3:6,names_to = "measures", values_to = "value")
msrsMean <- modelmsrs %>%
group_by(eps, minPts, measures) %>%
summarise(mvalue = mean(value, na.rm = TRUE)) %>%
ungroup()
ggplot(msrsMean, aes(minPts, y = mvalue, color = factor(eps))) +
geom_jitter()+
facet_wrap(~measures, scales = "free_y")
step6: Simple version
bnDbscan = dbscan::dbscan(bnScaled, minPts = 9, eps = 1.5)
banknote %>% mutate(clusters = factor(bnDbscan$cluster)) %>%
ggpairs(aes(col = clusters))
OPTICS density clustering
Choosing DBSCAN's input parameters is difficult; that is, DBSCAN is sensitive to them. With a single global eps and minPts, there is a problem: different global parameters produce different clusterings.
OPTICS (Ordering Points To Identify the Clustering Structure) is an effective extension of DBSCAN aimed mainly at this sensitivity to input parameters. It arranges the samples in a special order whose structure contains the clustering information at every density level, which makes it convenient to analyse.
OPTICS does not directly output clusters; instead it produces an ordering of the samples that represents their density-based clustering structure. From this ordering, the DBSCAN clustering for any eps (up to the eps used to build the ordering) and the given minPts can be recovered.
Basic concepts:
Core distance: given a core object P and a threshold A (minPts), the core distance is the smallest radius R such that P has at least A points within R.
Reachability distance: the reachability distance from object q to object p is the larger of p's core distance and the Euclidean distance between p and q, i.e. reachdist(q, p) = max(coredist(p), d(p, q)); if p is not a core object, the reachability distance between p and q is undefined.
bnOptics = dbscan::optics(bnScaled, minPts = 9)
plot(bnOptics)
# use the xi method to extract hierarchical clusters of varying density
bnOpticsXi <- dbscan::extractXi(bnOptics, xi = 0.05)
bnOpticsXi
hullplot(bn, bnOpticsXi)
# with xi = 0.05, the data set is grouped into 3 clusters
banknote %>% mutate(clusters = factor(bnOpticsXi$cluster)) %>% # add the cluster column, converted to a factor
ggpairs(aes(col = clusters),upper = list(continuous = "points"))
> bnOpticsXi
OPTICS ordering/clustering for 200 objects.
Parameters: minPts = 9, eps = 3.48814235035374, eps_cl = NA, xi = 0.05
The clustering contains 3 cluster(s) and 1 noise points.
Available fields: order, reachdist, coredist, predecessor,
minPts, eps, eps_cl, xi, clusters_xi,
cluster
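Since the ordering encodes the structure for every eps below the one used to build it, a DBSCAN-style clustering can also be extracted from the same bnOptics object (a sketch; eps_cl = 1.5 mirrors the earlier DBSCAN run):
bnOpticsDb <- dbscan::extractDBSCAN(bnOptics, eps_cl = 1.5)
plot(bnOpticsDb)  # reachability plot with the extracted clusters coloured
hullplot(bn, bnOpticsDb)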
Applying Gaussian mixture clustering
Mixture-model clustering fits the data with several probability distributions and identifies clusters by adjusting the parameters of those distributions. The most common form is the Gaussian mixture model, which fits the data with normal distributions.
bnMix <- Mclust(bn)
plot(bnMix)
bnMix$modelName
bnMix$G
summary(bnMix)
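By default Mclust() tries G = 1:9 mixture components across a family of covariance models and keeps the one with the best BIC; the search can be restricted (the range below is an arbitrary illustration):
bnMix2 <- Mclust(bn, G = 2:4)  # consider only 2 to 4 components
summary(bnMix2)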