数据分析:数据降维方法的实现

前言

随着大数据时代的到来,我们常常需要对高纬度数据进行降维处理后再对数据整体结构进行可视化,进而发现数据内在的区别,为后续数据分析提供初步的参考。

这里主要介绍三类数据分析方法的R实现过程:

1.PCA主成分分析;

2.tSNE,基于概率密度分布计算能保留更多组间和个体差异的方法;

3.UMAP,非线性降维方法。

导入数据

options(warn = 0)
library(dplyr)
library(tibble)
library(ggplot2)
options(warn = 0)

ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

Principal Component Analysis

PCAFun <- function(dataset = ExprSet_LOG2Impute ){
  
  # dataset = ExprSet_LOG2Impute 
  
  require(convert)
  metadata <- pData(dataset)
  profile <- exprs(dataset)
  
  # pca 
  pca <- prcomp(scale(t(profile), center = T, scale = T))
  require(factoextra)
  eig <- get_eig(pca)
  # explains variable 
  explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
  # principal component score of each sample
  score <- inner_join(pca$x %>% data.frame() %>% 
                        dplyr::select(c(1:2)) %>% 
                        rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
  
  
  pl <- ggplot(score, aes(x=PC1, y=PC2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              labs(x=explains[1], y=explains[2])+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$PC1) - 8,
                       y = min(score$PC1),
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

PCA_LOG2Impute <- PCAFun(dataset = ExprSet_LOG2Impute)
PCA_LOG2Impute 
ggsave("Mass_ProteinsLOG2Impute_PCA.pdf", PCA_LOG2Impute, width = 5, height = 5, dpi = 300)

Rtsne

RtsneFun <- function(dataset = ExprSet_LOG2Impute,
                     perpl = 10){
  
  # dataset = ExprSet_LOG2Impute
  # perpl = 10
  
  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))
  
  # Rtsne
  require(Rtsne)
  #set.seed(123)
  Rtsne <- Rtsne(profile, 
                 dims=2, 
                 perplexity=perpl,
                 verbose=TRUE, 
                 max_iter=500, 
                 eta=200)
  point <- Rtsne$Y %>% data.frame() %>% 
    dplyr::select(c(1:2)) %>%
    setNames(c("tSNE1", "tSNE2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
  
  pl <- ggplot(score, aes(x=tSNE1, y=tSNE2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$tSNE1) - 8,
                       y = max(score$tSNE2)-5,
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

Rtsne_LOG2Impute <- RtsneFun(dataset = ExprSet_LOG2Impute, perpl = 10)
Rtsne_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_Rtsne.pdf", Rtsne_LOG2Impute, width = 5, height = 5, dpi = 300)

UMAP: a non-linear dimensionality reduction algorithm

UMAPFun <- function(dataset = ExprSet_LOG2Impute){
  
  # dataset = ExprSet_LOG2Impute
  
  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))
  
  # umap 
  require(umap)
  umap <- umap::umap(profile)
  
  point <- umap$layout %>% data.frame() %>%
    setNames(c("UMAP1", "UMAP2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))   
  
  pl <- ggplot(score, aes(x=UMAP1, y=UMAP2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$UMAP1),
                       y = min(score$UMAP2),
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

UMAP_LOG2Impute <- UMAPFun(dataset = ExprSet_LOG2Impute)
UMAP_LOG2Impute 
ggsave("Mass_ProteinsLOG2Impute_UMAP.pdf", UMAP_LOG2Impute, width = 5, height = 5, dpi = 300)

Notes: 三组数据的整体性差异如果较为明显,一般可以在不同的降维方法上都有所体现。从分组看,NC组和PC、PT组均存在明显的差异,后续数据分析应该着重关注 NC vs PC, NC vs PT,但也可以对PC vs PT做比较分析。

Systemic information

sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] umap_0.2.7.0                                        Rtsne_0.15                                         
 [3] vegan_2.5-6                                         lattice_0.20-41                                    
 [5] permute_0.9-5                                       factoextra_1.0.7                                   
 [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 IlluminaHumanMethylationEPICmanifest_0.3.0         
 [9] minfi_1.36.0                                        bumphunter_1.32.0                                  
[11] locfit_1.5-9.4                                      iterators_1.0.13                                   
[13] foreach_1.5.1                                       Biostrings_2.58.0                                  
[15] XVector_0.30.0                                      SummarizedExperiment_1.20.0                        
[17] MatrixGenerics_1.2.1                                matrixStats_0.58.0                                 
[19] GenomicRanges_1.42.0                                GenomeInfoDb_1.26.4                                
[21] IRanges_2.24.1                                      S4Vectors_0.28.1                                   
[23] eulerr_6.1.0                                        UpSetR_1.4.0                                       
[25] ggplot2_3.3.3                                       DEP_1.12.0                                         
[27] convert_1.64.0                                      marray_1.68.0                                      
[29] limma_3.46.0                                        Biobase_2.50.0                                     
[31] BiocGenerics_0.36.0                                 data.table_1.14.0                                  
[33] tibble_3.1.0                                        dplyr_1.0.5                                        

loaded via a namespace (and not attached):
  [1] utf8_1.2.1                shinydashboard_0.7.1      reticulate_1.18           gmm_1.6-5                
  [5] tidyselect_1.1.0          RSQLite_2.2.5             AnnotationDbi_1.52.0      htmlwidgets_1.5.3        
  [9] grid_4.0.2                BiocParallel_1.24.1       norm_1.0-9.5              munsell_0.5.0            
 [13] codetools_0.2-18          preprocessCore_1.52.1     DT_0.18                   withr_2.4.1              
 [17] colorspace_2.0-0          knitr_1.31                rstudioapi_0.13           mzID_1.28.0              
 [21] labeling_0.4.2            GenomeInfoDbData_1.2.4    farver_2.1.0              bit64_4.0.5              
 [25] rhdf5_2.34.0              vctrs_0.3.7               generics_0.1.0            xfun_0.20                
 [29] BiocFileCache_1.14.0      R6_2.5.0                  doParallel_1.0.16         illuminaio_0.32.0        
 [33] clue_0.3-58               bitops_1.0-6              rhdf5filters_1.2.0        cachem_1.0.4             
 [37] reshape_0.8.8             DelayedArray_0.16.3       assertthat_0.2.1          promises_1.2.0.1         
 [41] scales_1.1.1              gtable_0.3.0              Cairo_1.5-12.2            affy_1.68.0              
 [45] sandwich_3.0-0            rlang_0.4.10              genefilter_1.72.0         mzR_2.24.1               
 [49] GlobalOptions_0.1.2       splines_4.0.2             rtracklayer_1.50.0        GEOquery_2.58.0          
 [53] impute_1.64.0             yaml_2.2.1                reshape2_1.4.4            BiocManager_1.30.12      
 [57] GenomicFeatures_1.42.2    httpuv_1.5.5              tools_4.0.2               nor1mix_1.3-0            
 [61] affyio_1.60.0             ellipsis_0.3.1            jquerylib_0.1.3           RColorBrewer_1.1-2       
 [65] siggenes_1.64.0           MSnbase_2.16.1            Rcpp_1.0.6                plyr_1.8.6               
 [69] sparseMatrixStats_1.2.1   progress_1.2.2            zlibbioc_1.36.0           purrr_0.3.4              
 [73] RCurl_1.98-1.3            prettyunits_1.1.1         openssl_1.4.3             GetoptLong_1.0.5         
 [77] cowplot_1.1.0             zoo_1.8-8                 ggrepel_0.9.1.9999        cluster_2.1.0            
 [81] tinytex_0.31              magrittr_2.0.1            RSpectra_0.16-0           circlize_0.4.10          
 [85] pcaMethods_1.80.0         mvtnorm_1.1-1             ProtGenerics_1.22.0       evaluate_0.14            
 [89] hms_1.0.0                 mime_0.10                 xtable_1.8-4              XML_3.99-0.6             
 [93] mclust_5.4.7              gridExtra_2.3             shape_1.4.5               compiler_4.0.2           
 [97] biomaRt_2.46.3            ncdf4_1.17                crayon_1.4.1              htmltools_0.5.1.1        
[101] mgcv_1.8-34               later_1.1.0.1             tidyr_1.1.3               DBI_1.1.1                
[105] dbplyr_2.1.1              ComplexHeatmap_2.6.2      MASS_7.3-53.1             tmvtnorm_1.4-10          
[109] rappdirs_0.3.3            Matrix_1.3-2              readr_1.4.0               cli_2.4.0                
[113] vsn_3.58.0                imputeLCMD_2.0            quadprog_1.5-8            pkgconfig_2.0.3          
[117] GenomicAlignments_1.26.0  MALDIquant_1.19.3         xml2_1.3.2                annotate_1.68.0          
[121] bslib_0.2.4               rngtools_1.5              multtest_2.46.0           beanplot_1.2             
[125] doRNG_1.8.2               scrime_1.3.5              stringr_1.4.0             digest_0.6.27            
[129] rmarkdown_2.7             base64_2.0                DelayedMatrixStats_1.12.3 curl_4.3                 
[133] shiny_1.6.0               Rsamtools_2.6.0           rjson_0.2.20              jsonlite_1.7.2           
[137] nlme_3.1-150              lifecycle_1.0.0           Rhdf5lib_1.12.1           askpass_1.1              
[141] fansi_0.4.2               pillar_1.6.0              fastmap_1.1.0             httr_1.4.2               
[145] survival_3.2-10           glue_1.4.2                png_0.1-7                 bit_4.0.4                
[149] sass_0.3.1                stringi_1.4.6             HDF5Array_1.18.1          blob_1.2.1               
[153] memoise_2.0.0   

Reference

  1. How to change Legend of ggplot2

  2. How to change ggplot facet labels

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
禁止转载,如需转载请通过简信或评论联系作者。
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 216,496评论 6 501
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 92,407评论 3 392
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 162,632评论 0 353
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,180评论 1 292
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,198评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,165评论 1 299
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,052评论 3 418
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,910评论 0 274
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,324评论 1 310
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,542评论 2 332
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,711评论 1 348
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,424评论 5 343
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,017评论 3 326
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,668评论 0 22
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,823评论 1 269
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,722评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,611评论 2 353

推荐阅读更多精彩内容