GSEA分析


GSEA基因集富集分析

## 背景:
基因集富集分析

1.fgsea

### https://bioconductor.org/packages/release/bioc/vignettes/fgsea/inst/doc/fgsea-tutorial.html
rm(list = ls())

library(fgsea)
library(data.table)
library(ggplot2)
data(examplePathways)
data(exampleRanks)
set.seed(42)
fgseaRes <- fgsea(pathways = examplePathways,
                  stats = exampleRanks,
                  minSize = 15,
                  maxSize = 500,
                  nperm = 10000)
psw <- examplePathways[["5991130_Programmed_Cell_Death"]]

plotEnrichment(psw, exampleRanks) +
  labs(title = "Programmed Cell Death")

index <- grep("5991130", fgseaRes$pathway)
anno <- fgseaRes[index, c("NES", "pval", "padj")]
lab <- paste0(names(anno), "=", round(anno, 3), collapse = "\n")

plotEnrichment(psw, exampleRanks) +
  labs(title = "Programmed Cell Death") +
  annotate("text", 0, as.numeric(fgseaRes[index, "ES"]) * .9, label = lab, hjust = 0, vjust = 0)

#### 多个
topPathwaysUp <- fgseaRes[ES > 0,][head(order(pval), n = 10), pathway]
topPathwaysDown <- fgseaRes[ES < 0,][head(order(pval), n = 10), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
plotGseaTable(examplePathways[topPathways], exampleRanks, fgseaRes,
              gseaParam = 0.5)

### 精简
collapsedPathways <- collapsePathways(fgseaRes[order(pval)][padj < 0.01],
                                      examplePathways, exampleRanks)
mainPathways <- fgseaRes[pathway %in% collapsedPathways$mainPathways][
  order(-NES), pathway]
plotGseaTable(examplePathways[mainPathways], exampleRanks, fgseaRes,
              gseaParam = 0.5)

## 自带数据
rnk.file <- system.file("extdata", "naive.vs.th1.rnk", package = "fgsea")
ranks <- read.table(rnk.file, header = TRUE, colClasses = c("character", "numeric"))
ranks <- setNames(ranks$t, ranks$ID)

gmt.file <- system.file("extdata", "mouse.reactome.gmt", package = "fgsea")
pathways <- gmtPathways(gmt.file)
fgseaRes <- fgsea(pathways, ranks, minSize = 15, maxSize = 500, nperm = 10000)

2.clusterprofiler

rm(list = ls())
library(clusterProfiler)

# 差异分析得到的表达矩阵
gene <- rownames(Diff_data)
gene = bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
gene <- dplyr::distinct(gene, SYMBOL, .keep_all = TRUE)
gene_df <- data.frame(logFC = allDiff$logFC, SYMBOL = rownames(allDiff))
gene_df <- merge(gene_df, gene, by = "SYMBOL")

### genelist获取
geneList <- gene_df$logFC
names(geneList) = gene_df$ENTREZID
geneList = sort(geneList, decreasing = TRUE)

## 读入hallmarks gene set,broad网站
hallmarks <- read.gmt("h.all.v7.1.entrez.gmt")
y <- GSEA(geneList, TERM2GENE = hallmarks)

### 作图
cnetplot(y, foldChange = geneList)
y2 <- setReadable(y, "org.Hs.eg.db", keyType = "ENTREZID")
cnetplot(y2, showCategory = 4,
         foldChange = geneList,
         colorEdge = T)

### 看整体分布
dotplot(y, showCategory = 12, split = ".sign") + facet_grid(~.sign)

### 选择需要呈现的来作图
yd <- data.frame(y)
library(enrichplot)
gseaplot2(y, "HALLMARK_MYC_TARGETS_V2", color = "red", pvalue_table = T)
gseaplot2(y, 11, color = "red", pvalue_table = T)
ridgeplot(y)
gseaplot2(y, geneSetID = 1:3)

### 自己添加文字加文字
index <- "HALLMARK_MYC_TARGETS_V2"
gseaplot2(y, index, color = "green")

anno <- yd[index, c("enrichmentScore", "NES", "pvalue", "p.adjust")]
colnames(anno)[1] <- "ES"
lab <- paste0(names(anno), "=", round(anno, 3), collapse = "\n")

p <- gseaplot2(y, index, color = "green")
p[[1]] <- p[[1]] + annotate("text", 15000, -0.6, label = lab, hjust = 0, vjust = 0, size = 5)
p

3.misgdb

library(msigdbr)
msigdbr_species()
m_df = msigdbr(species = "Homo sapiens")
library(dplyr)
m_df %>%
  distinct(gs_cat, gs_subcat) %>%
  arrange(gs_cat, gs_subcat)

### 人
h_df = msigdbr(species = "Homo sapiens", category = "H")

### 鼠
m_df = msigdbr(species = "Mus musculus", category = "H")

### 选择的方式
m_df1 = msigdbr(species = "Mus musculus", category = "C2", subcategory = "CGP")
dd <- msigdbr(species = "Mus musculus")
colnames(dd)

###选取鼠的C2,CGP,entrizID
library(dplyr)
m_df2 <- dd %>%
  filter(species_name == "Mus musculus") %>%
  filter(gs_cat == "C2", gs_subcat == "CGP") %>%
  select(gs_name, entrez_gene)

### 选人,hallmarks,两列
dd <- msigdbr(species = "Homo sapiens")
h_df2 <- dd %>%
  filter(gs_cat == "H") %>%
  select(gs_name, entrez_gene)

### 比较
h_df3 <- read.gmt("h.all.v7.1.entrez.gmt")
m_list = split(h_df2$entrez_gene, h_df2$gs_name)

### fgsea的输入
pathways <- gmtPathways("h.all.v7.1.entrez.gmt")
### https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html

4.experdata

y <- as.numeric(exprSet[1,])
rownames <- rownames(exprSet)
cor_data_df <- data.frame(rownames)
### 批量相关性分析
for (i in 1:length(rownames)) {
  test <- cor.test(as.numeric(exprSet[i,]), y, method = "spearman")
  cor_data_df[i, 2] <- test$estimate
  cor_data_df[i, 3] <- test$p.value
}
names(cor_data_df) <- c("symbol", "correlation", "pvalue")

## geneList 
geneList <- cor_data_df$correlation
names(geneList) = cor_data_df$symbol
geneList = sort(geneList, decreasing = TRUE)


### 基因集
library(msigdbr)
dd <- msigdbr(species = "Homo sapiens")
hallmarks <- dd %>%
  filter(gs_cat == "H") %>%
  select(gs_name, gene_symbol)

### GSEA分析
library(clusterProfiler)
y <- GSEA(geneList, TERM2GENE = hallmarks)

### 看整体分布
dotplot(y, showCategory = 12, split = ".sign") + facet_grid(~.sign)

5.gsva+limma

library(GSVA)
kegggmt <- read.gmt("c2.cp.kegg.v7.1.symbols.gmt")
colnames(kegggmt)
kegg_list = split(kegggmt$gene, kegggmt$term)
## 下面有个报错错误: useNames = NA is defunct. Instead, specify either useNames = TRUE or useNames = FALSE.
## 把包降级 remotes::install_version("matrixStats", version="1.1.0")
kegg1 <- gsva(expr = as.matrix(exprSet), kegg_list, kcdf = "Gaussian", method = "gsva", parallel.sz = 1)

keggSet <- getGmt("c2.cp.kegg.v7.1.symbols.gmt")
kegg2 <- gsva(expr = as.matrix(exprSet), keggSet, kcdf = "Gaussian", method = "gsva", parallel.sz = 1)

library(pheatmap)
##用名称提取部分数据用作热图绘制
group <- c(rep("con", 3), rep("treat", 3))
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(kegg2)

pheatmap(kegg2,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         annotation_col = annotation_col,
         annotation_legend = TRUE,
         show_rownames = F,
         scale = "row",
         color = colorRampPalette(c("blue", "white", "red"))(100),
         cellwidth = 80, cellheight = 2,
         fontsize = 10)


library(limma)
exprSet <- kegg2
### 差异分析

group <- c(rep("con", 3), rep("treat", 3))
## 分组变成向量,并且限定leves的顺序
## levels里面,把对照组放在前面
group <- factor(group, levels = c("con", "treat"), ordered = F)
## 1.构建比较矩阵
design <- model.matrix(~group)
## 比较矩阵命名
colnames(design) <- levels(group)

##2.线性模型拟合
fit <- lmFit(exprSet, design)
##3.贝叶斯检验
fit2 <- eBayes(fit)
#4.输出差异分析结果,其中coef的数目不能操过design的列数
# 此处的2代表的是第二列和第一列的比较
allDiff = topTable(fit2, adjust = 'fdr', coef = 2, number = Inf) 

6.ssgsva

### 每个细胞对应表达的mark基因(类似通路对应基因)
expr <- data.table::fread("exprMat.txt", data.table = F)
rownames(expr) <- expr[, 1]
expr <- expr[, -1]
expr <- as.matrix(expr)

### 3.使用ssGSEA量化免疫浸润
library(GSVA)
gsva_data <- gsva(expr, cellMarker, method = "ssgsea")