差异分析总结


差异分析

1.limma 得到差异表达矩阵

rm(list = ls())
#################################################
exprSet <- read.table("data/GSE42872_series_matrix.txt.gz",
                      comment.char = "!",
                      stringsAsFactors = F,
                      header = T)
### 第一列变成行名
rownames(exprSet) <- exprSet[, 1]
exprSet <- exprSet[, -1]
################################################
### 2.数据预处理,探针ID转换,探针去重
## 自动log化
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = T))
LogC <- (qx[5] > 100) ||
  (qx[6] - qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) {
  ex[which(ex <= 0)] <- NaN
  ## 取log2
  exprSet <- log2(ex)
  print("log2 transform finished")
}else {
  print("log2 transform not needed")
}

library(limma)
boxplot(exprSet, outline = FALSE, notch = T, las = 2)
### 该函数默认使用quntile 矫正差异
exprSet = normalizeBetweenArrays(exprSet)
boxplot(exprSet, outline = FALSE, notch = T, las = 2)
exprSet <- as.data.frame(exprSet)
#################################################################
### 3.探针基因名转换
platformMap <- data.table::fread("resource/platformMap.txt", data.table = F)
index <- "GPL6244"
paste0(platformMap$bioc_package[grep(index, platformMap$gpl)], ".db")
library(hugene10sttranscriptcluster.db)
probe2symbol_df <- toTable(get("hugene10sttranscriptclusterSYMBOL"))
#################################################################
### 4.探针转换以及去重,获得最终的表达矩阵
library(dplyr)
library(tibble)
exprSet1 <- exprSet %>%
  rownames_to_column("probe_id") %>%
  inner_join(probe2symbol_df, by = "probe_id") %>%
  select(-probe_id) %>%
  select(symbol, everything()) %>%
  mutate(rowMean = rowMeans(.[, -1])) %>%
  arrange(desc(rowMean)) %>%
  distinct(symbol, .keep_all = T) %>%
  select(-rowMean) %>%
  column_to_rownames("symbol")
#################################################################
### 5.使用limma来做芯片的差异分析
group <- c(rep("con", 3), rep("treat", 3))
### levels里面,把对照组放在前面
group <- factor(group, levels = c("con", "treat"))
### 主成分分析PCA:提前预测结果
### 行是样本列是基因
res.pca <- prcomp(t(exprSet1), scale = TRUE)
library(factoextra)
fviz_pca_ind(res.pca, col.ind = group)
### 构建比较矩阵
design <- model.matrix(~group)
### 比较矩阵命名
colnames(design) <- levels(group)
### 2.线性模型拟合
fit <- lmFit(exprSet1, design)
### 3.贝叶斯检验
fit2 <- eBayes(fit)
### 4.输出差异分析结果,其中coef的数目不能操过design的列数
### 此处的2代表的是design中第二列和第一列的比较
allDiff = topTable(fit2, adjust = 'fdr', coef = 2, number = Inf)
###################################################################################
### 定义差异基因:差异倍数2倍,矫正后的p值小于0.05
library(dplyr)
diffgene <- allDiff %>%
  filter(adj.P.Val < 0.05) %>%
  filter(abs(logFC) > 1)

diffgene2 <- subset(allDiff, abs(logFC) > 1 & adj.P.Val < 0.05)

作图

### 1.探针ID转换,2.行列转置,3,添加分组信息。
exprSet <- as.data.frame(t(exprSet1))
dd <- cbind(group = group, exprSet)

## 2.作图展示一个差异基因
my_comparisons <- list(
  c("treat", "con")
)

diffplot <- function(gene) {
  my_comparisons <- list(
    c("treat", "con")
  )
  library(ggpubr)
  ggboxplot(
    dd, x = "group", y = gene,
    color = "group", palette = c("#00AFBB", "#E7B800"),
    add = "jitter"
  ) +
    stat_compare_means(comparisons = my_comparisons, method = "t.test")
}

diffplot("CD36")
diffplot("MOXD1")

## 3.作图展示多个差异基因
genelist <- rownames(exprSet1)[1:6]
data <- dd[, c("group", genelist)]
library(tidyr)
data9 <- data %>%
  pivot_longer(cols = -1,
               names_to = "gene",
               values_to = "expression")
## 多基因作图
ggplot(data = data9, aes(x = group, y = expression, fill = group)) +
  geom_boxplot() +
  geom_jitter() +
  theme_bw() +
  facet_grid(. ~ gene) +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")
##################################################
### 4.作图展示一群差异基因
### 画热图
library(dplyr)
diffgene <- allDiff %>%
  filter(adj.P.Val < 0.05) %>%
  filter(abs(logFC) > 3)
## 3.用名称提取部分数据用作热图绘制
heatdata <- exprSet1[rownames(diffgene),]
## 4.制作一个分组信息用于注释
group <- c(rep("con", 3), rep("treat", 3))
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(heatdata)
library(pheatmap)
library(viridisLite)
pheatmap(heatdata, #热图的数据
         cluster_rows = TRUE, #行聚类
         cluster_cols = TRUE, #列聚类,可以看出样本之间的区分度
         annotation_col = annotation_col, #标注样本分类
         annotation_legend = TRUE, # 显示注释
         show_rownames = T, # 显示行名
         scale = "row", #以行来标准化,这个功能很不错
         color = viridis(10, alpha = 1, begin = 0.5, end = 1, direction = 1), #调色
         #filename = "heatmap_F.pdf",#是否保存
         cellwidth = 30, # 格子宽度
         cellheight = 12, # 格子高度
         fontsize = 10 # 字体大小
)
##################################################
### 5.作图展示所有基因差异
### 画火山图,订制火山图
library(ggplot2)
library(ggrepel)
library(dplgeneyr)

data <- allDiff
data$gene <- rownames(data)
logFCfilter = 1
logFCcolor = 3
index = data$adj.P.Val < 0.05 & abs(data$logFC) > logFCfilter
data$group <- 0
data$group[index & data$logFC > 0] = 1
data$group[index & data$logFC < 0] = -1
data$group <- factor(data$group, levels = c(1, 0, -1), labels = c("Up", "NS", "Down"))

ggplot(data = data, aes(x = logFC, y = -log10(adj.P.Val), color = group)) +
  geom_point(alpha = 0.8, size = 1.2) +
  scale_color_manual(values = c("darkred", "black", "royalblue4")) +
  labs(x = "log2 (fold change)", y = "-log10 (adj.P.Val)") +
  theme(plot.title = element_text(hjust = 0.4)) +
  geom_hline(yintercept = -log10(0.05), lty = 4, lwd = 0.6, alpha = 0.8) +
  geom_vline(xintercept = c(-logFCfilter, logFCfilter), lty = 4, lwd = 0.6, alpha = 0.8) +
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black")) +
  theme(legend.position = "top") +
  geom_point(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05), alpha = 0.8, size = 3, col = "green2") +
  geom_text_repel(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05),
                  aes(label = gene), col = "black", alpha = 0.8)
##################################################
### 6.探索基因变化带来的功能改变
### 基因集富集分析GSEA的用法
################################################
library(clusterProfiler)
gene <- rownames(allDiff)
gene = bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
## 去重
gene <- dplyr::distinct(gene, SYMBOL, .keep_all = TRUE)
gene_df <- data.frame(logFC = allDiff$logFC,
                      SYMBOL = rownames(allDiff))
gene_df <- merge(gene_df, gene, by = "SYMBOL")

geneList <- gene_df$logFC
names(geneList) = gene_df$SYMBOL
geneList = sort(geneList, decreasing = TRUE)
### GSEA
### hallmarks gene set
hallmarks <- read.gmt("resource/h.all.v7.1.symbols.gmt")
### 主程序GSEA
y <- GSEA(geneList, TERM2GENE = hallmarks)
yd <- as.data.frame(y)
### 看整体分布
library(ggplot2)
dotplot(y, showCategory = 30, split = ".sign") + facet_grid(~.sign)
library(enrichplot)
gseaplot2(y, "HALLMARK_E2F_TARGETS", color = "red", pvalue_table = T)
gseaplot2(y, 10, color = "red", pvalue_table = T)
gseaplot2(y, geneSetID = 1:3)

2. DEseq2

rm(list = ls())
### https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE153250
### 1.读入表达矩阵
### 要求, 行数基因,列是样本,内容是counts
exprSet <- data.table::fread("data/exprSet_counts.csv", data.table = F)
#########################################################
### 2.准备 metadata文件
### (用于注释样本信息,至少两列,一列是样本名称,一列是分组信息)
metadata <- data.table::fread("data/metadata.txt", data.table = F, header = F)
rownames(metadata) <- metadata[, 1]
metadata <- metadata[colnames(exprSet)[-1],]
### 添加分组信息
metadata$group <- ifelse(grepl("ESR1", metadata$V2), "treat", "control")
metadata = metadata[, c(1, 3)]
colnames(metadata) <- c("sample", "group")
#########################################################
### 3.核心环节,构建dds对象
### 要记住四个参数
### 要有数据countData,这里就是exprSet
### 要有分组信息,在colData中,这里是metadata
### design部分是分组信息,格式是~group
### 第一列如果是基因名称,需要自动处理,设置参数tidy=TRUE
### 对象是个复合体
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = metadata,
                              design = ~group,
                              tidy = TRUE)
### 筛选样本,counts函数提取表达量数据
dds <- dds[rowSums(counts(dds)) > 1,]
#########################################################
### 4.数据质量判断
### vst标准化处理
vsd <- vst(dds, blind = FALSE)
### 内置函数plotPCA进行主成分分析画图
plotPCA(vsd, "group")
### 用内置函数plotCounts来进行快速,简易作图
### 找到阳性基因,此处ESR1
### dds来自于上一步
### gene 输入的是ensemble ID
### intgroup 输入的是metadata中分组信息
plotCounts(dds, gene = "ENSG00000091831", intgroup = c("group"))

### 导出标准化后的表达数据
### assay函数提取vst标准化后的数据,保存数据用于热图
exprSet_vst <- as.data.frame(assay(vsd))
### 保存数据,用于表达量作图,比如差异分析,热图
#########################################################
### 5.正式运行DESeq主程序
dds <- DESeq(dds)
#########################################################
### 6.logFC矫正,RNAseq很重要的一步
### contrast参数设置
### 依次是,1.分组信息(metadata中的列) 2.处理组,3.对照组
contrast = c("group", "treat", "control")
### results函数获取差异分析的结果
dd1 <- results(dds, contrast = contrast, alpha = 0.05)
### 内置函数plotMA作图
plotMA(dd1, ylim = c(-5, 5))
### logFC矫正
dd2 <- lfcShrink(dds, contrast = contrast, res = dd1, type = "ashr")
plotMA(dd2, ylim = c(-5, 5))
#########################################################
### 7.导出差异分析的结果
library(dplyr)
library(tibble)
library(tidyr)
res <- dd2 %>%
  as.data.frame() %>%
  rownames_to_column("gene_id")
#########################################################
### 8.基因注释
library(AnnotationDbi)
library(org.Hs.eg.db)
res$symbol <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
res$entrez <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "ENTREZID",
                     keytype = "ENSEMBL",
                     multiVals = "first")
colnames(res) <- c("gene_id", "baseMean", "logFC", "lfcSE", "P.Value", "adj.P.Val", "gene", "entrez")

作图

library(dplyr)
ensemble_symbol = res %>%
  dplyr::select(gene_id, gene) %>%
  filter(gene != "")

### 2.准备表达量数据
exprSet <- cbind(gene_id = rownames(exprSet_vst), exprSet_vst)
exprSet <- merge(ensemble_symbol, exprSet, by = "gene_id")

### 4.基因名称去重(保留最大值法)
### 列转行名一定要去重,因为行名不支持重复
exprSet <- exprSet %>%
  dplyr::select(-gene_id) %>%
  mutate(newcolumn = rowMeans(.[, -1])) %>%
  arrange(desc(newcolumn)) %>%
  distinct(gene, .keep_all = T) %>%
  dplyr::select(-newcolumn)

rownames(exprSet) <- exprSet[, 1]
exprSet <- exprSet[, -1]
### 6.行列转置
exprSet <- t(exprSet)
exprSet <- as.data.frame(exprSet)
### 7.添加分组
exprSet <- cbind(group = metadata$group, exprSet)
###########################################################
## 2.一个基因作图
diffplot <- function(gene) {
  my_comparisons <- list(
    c("treat", "control")
  )
  library(ggpubr)
  ggboxplot(
    exprSet, x = "group", y = gene,
    color = "group", palette = c("#00AFBB", "#E7B800"),
    add = "jitter"
  ) +
    stat_compare_means(comparisons = my_comparisons, method = "t.test")
}

### AGR3,ESR1,SLC4A10, ALPP,VSIR,PLA2G2F
diffplot("AGR3")
diffplot("ALPP")
diffplot("ESR1")

###########################################################
## 3.多个基因作图查看
## 先把基因提取出来
genelist <- c("AGR3", "ESR1", "SLC4A10", "ALPP", "VSIR", "PLA2G2F")
data <- exprSet[, c("group", genelist)]
library(tidyr)
data <- data %>%
  pivot_longer(cols = -1,
               names_to = "gene",
               values_to = "expression")
## 多基因作图
ggplot(data = data, aes(x = group, y = expression, fill = group)) +
  geom_boxplot() +
  geom_jitter() +
  theme_bw() +
  facet_grid(. ~ gene) +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif", method = "t.test")
##################################
### 4.如何展示一群基因的表达结果
### 筛选差异基因
library(dplyr)
diffgene <- res %>%
  filter(gene != "") %>%
  filter(adj.P.Val < 0.05) %>%
  filter(abs(logFC) > 2)

library(pheatmap)
heatdata <- exprSet[diffgene$gene,]
### 制作一个分组信息用于注释
annotation_col <- data.frame(group = metadata$group)
rownames(annotation_col) <- metadata$sample
### 如果注释出界, 可以通过调整格子比例和字体修正
pheatmap(heatdata, #热图的数据
         cluster_rows = TRUE, #行聚类
         cluster_cols = TRUE, #列聚类,可以看出样本之间的区分度
         annotation_col = annotation_col, #标注样本分类
         annotation_legend = TRUE, # 显示注释
         show_rownames = F, # 显示行名
         show_colnames = F, # 显示行名
         scale = "row", #以行来标准化,这个功能很不错
         color = colorRampPalette(c("blue", "white", "red"))(100), #调色
         #filename = "heatmap_F.pdf",#是否保存
         cellwidth = 25, cellheight = 0.5, # 格子比例
         fontsize = 10)
################################################################
### 5.如何展示所有的基因变化
### 火山图
library(ggplot2)
library(ggrepel)
data <- res
logFCfilter = 1
logFCcolor = 4
index = data$adj.P.Val < 0.05 & abs(data$logFC) > logFCfilter
data$group <- 0
data$group[index & data$logFC > 0] = 1
data$group[index & data$logFC < 0] = -1
data$group <- factor(data$group, levels = c(1, 0, -1), labels = c("Up", "NS", "Down"))
### 正式画图
ggplot(data = data, aes(x = logFC, y = -log10(adj.P.Val), color = group)) +
  geom_point(alpha = 0.8, size = 1.2) +
  scale_color_manual(values = c("darkred", "grey50", "royalblue4")) +
  labs(x = "log2 (fold change)", y = "-log10 (adj.P.Val)") +
  theme(plot.title = element_text(hjust = 0.4)) +
  geom_hline(yintercept = -log10(0.05), lty = 4, lwd = 0.6, alpha = 0.8) +
  geom_vline(xintercept = c(-logFCfilter, logFCfilter), lty = 4, lwd = 0.6, alpha = 0.8) +
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black")) +
  theme(legend.position = "top") +
  geom_point(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05), alpha = 0.8, size = 3, col = "green4") +
  geom_text_repel(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05),
                  aes(label = gene), col = "black", alpha = 0.8)
###############################
library(ggplot2)
library(ggrepel)
data <- res
logFCfilter = 1
logFCcolor = 4
index = data$adj.P.Val < 0.05 & abs(data$logFC) > logFCfilter
data$group <- 0
data$group[index & data$logFC > 0] = 1
data$group[index & data$logFC < 0] = -1
data$group <- factor(data$group, levels = c(1, 0, -1), labels = c("Up", "NS", "Down"))
ggplot(data = data, aes(x = log2(baseMean), y = logFC, color = group)) +
  geom_point(alpha = 0.8, size = 1.2) +
  scale_color_manual(values = c("darkred", "grey50", "royalblue4")) +
  labs(y = "log2 (Fold Change)", x = "log2 (Base Mean)") +
  theme(plot.title = element_text(hjust = 0.4)) +
  geom_hline(yintercept = c(logFCfilter, -logFCfilter), lty = 2, lwd = 1) +
  geom_hline(yintercept = 0, lwd = 1.2) +
  theme_bw() +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black")) +
  theme(legend.position = "top") +
  geom_point(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05), alpha = 0.8, size = 3, col = "green4") +
  geom_text_repel(data = subset(data, abs(logFC) >= logFCcolor & adj.P.Val < 0.05),
                  aes(label = gene), col = "black", alpha = 0.8)
##############################################################
### 6.GSEA 分析
library(dplyr)
gene_df <- res %>%
  dplyr::select(gene_id, logFC, gene) %>%
  ## 去掉NA
  filter(gene != "") %>%
  distinct(gene, .keep_all = T)

geneList <- gene_df$logFC
names(geneList) = gene_df$gene
geneList = sort(geneList, decreasing = TRUE)

library(clusterProfiler)
hallmarks <- read.gmt("resource/h.all.v7.1.symbols.gmt")
### GSEA 分析
gseahallmarks <- GSEA(geneList, TERM2GENE = hallmarks)
yd <- as.data.frame(gseahallmarks)

library(ggplot2)
dotplot(gseahallmarks, showCategory = 30, split = ".sign") + facet_grid(~.sign)
library(enrichplot)
pathway.id = "HALLMARK_ESTROGEN_RESPONSE_EARLY"
gseaplot2(gseahallmarks, color = "red", geneSetID = pathway.id, pvalue_table = T)
pathway.id = "HALLMARK_TNFA_SIGNALING_VIA_NFKB"
gseaplot2(gseahallmarks, color = "red", geneSetID = pathway.id, pvalue_table = T)

limma构建多组分析比较 和 批次矫正

library(GEOquery)

gset = getGEO('GSE32575', destdir = ".", getGPL = F)
## 获取ExpressionSet对象,包括的表达矩阵和分组信息
gset = gset[[1]]
## 获取分组信息
pdata = pData(gset)
## 只要后36个,本次选择的这36个是配对的。
group = c(rep('before', 18), rep('after', 18))
group = factor(group, levels = c("before", "after"))
exprSet = exprs(gset)
exprSet = exprSet[, -seq(1, 12)]
boxplot(exprSet, outline = FALSE, notch = T, col = group, las = 2)
library(limma)
exprSet = normalizeBetweenArrays(exprSet)
boxplot(exprSet, outline = FALSE, notch = T, col = group, las = 2)
exprSet = as.data.frame(exprSet)


ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = T))
LogC <- (qx[5] > 100) ||
  (qx[6] - qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { ex[which(ex <= 0)] <- NaN
  exprSet <- log2(ex)
  print("log2 transform finished") }else { print("log2 transform not needed") }
platformMap <- data.table::fread("resource/platformMap.txt")

### 探针ID注释
index <- "GPL6102"
paste0(platformMap$bioc_package[grep(index, platformMap$gpl)], ".db")

library("illuminaHumanv2.db")

probe2symbol_df <- toTable(get("illuminaHumanv2SYMBOL"))

library(dplyr)
library(tibble)
exprSet <- exprSet %>%
  rownames_to_column("probe_id") %>%
  inner_join(probe2symbol_df, by = "probe_id") %>%
  select(-probe_id) %>%
  select(symbol, everything()) %>%
  mutate(rowMean = rowMeans(.[, -1])) %>%
  arrange(desc(rowMean)) %>%
  distinct(symbol, .keep_all = T) %>%
  select(-rowMean) %>%
  column_to_rownames("symbol")

res.pca <- prcomp(t(exprSet), scale = TRUE)
library(factoextra)
fviz_pca_ind(res.pca, col.ind = group)
#######################################################
### 第1种构建model.matrix的方法
group = c(rep('before', 18), rep('after', 18))
group = factor(group, levels = c("before", "after"))
design <- model.matrix(~group)
fit <- lmFit(exprSet, design)
fit2 <- eBayes(fit)
### 此处的2代表的是design中第二列和第一列的比较
allDiff1 = topTable(fit2, adjust = 'fdr', coef = ncol(design), number = Inf, p.value = 0.05)
#######################################################
### 在此基础上进行配对分析
group = c(rep('before', 18), rep('after', 18))
group = factor(group, levels = c("before", "after"))
pairinfo = factor(rep(1:18, 2))
design = model.matrix(~pairinfo + group)
fit <- lmFit(exprSet, design)
fit2 <- eBayes(fit)
allDiff2 = topTable(fit2, adjust = 'fdr', coef = ncol(design), number = Inf, p.value = 0.05)
######################################################
### 第2种构建model.matrix的方法
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
## 比较矩阵,其中treat - con根据自己的组别更改
contrast.matrix <- makeContrasts(after - before, levels = design)
fit <- lmFit(exprSet, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
## 提取差异结果,注意这里的coef是1
allDiff3 = topTable(fit2, adjust = 'fdr', coef = 1, number = Inf, p.value = 0.05)
######################################################
### 利用第2种构建model.matrix的方法进行多组分析
group <- c(rep("con", 12), rep("drugA", 12), rep("drugB", 12))
group <- factor(group, levels = c("con", "drugA", "drugB"))
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
contrast.matrix <- makeContrasts(drugA - con,
                                 drugB - con,
                                 drugA - drugB,
                                 levels = design)
fit <- lmFit(exprSet, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
### 分别提取差异基因
allDiff1 = topTable(fit2, adjust = 'fdr', coef = 1, number = Inf)
allDiff2 = topTable(fit2, adjust = 'fdr', coef = 2, number = Inf)
allDiff3 = topTable(fit2, adjust = 'fdr', coef = 3, number = Inf)
###########################################################
### 芯片数据的批次矫正
library(sva)
library(bladderbatch)
data(bladderdata)
#bladder 的属性是EsetExpressionSet,所以可以用pData和exprs方法
### 提取注释数据
pheno <- pData(bladderEset)
### 提取表达数据
edata <- exprs(bladderEset)
########################################################
## 聚类画图展示批次信息
dist_mat <- dist(t(edata))
res.hc <- hclust(dist_mat, method = "complete")
plot(res.hc, labels = pheno$batch)
plot(res.hc, labels = pheno$cancer)
### 校正批次效应,model可以有也可以没有,
### 如果有,也就是告诉combat,有些分组本来就有差别,不要给我矫枉过正
model <- model.matrix(~as.factor(pheno$cancer))
###############################################################################
combat_edata <- ComBat(dat = edata, batch = pheno$batch, mod = model)
###############################################################################
library(factoextra)
dist_mat <- dist(t(combat_edata))
res.hc <- hclust(dist_mat, method = "complete")
plot(res.hc, labels = pheno$batch)
plot(res.hc, labels = pheno$cancer)

deseq2 配对和多分组

### Deseq2的配对和多分组
##################################################################
### 接下来用Deseq2来处理转录组的counts数据
rm(list = ls())
exprSet <- as.data.frame(expr_df)
TCGA_id <- colnames(exprSet)[-1]
### 创建分组信息
sample <- ifelse(substring(TCGA_id, 14, 15) == "01", "cancer", "recurr")
sample <- factor(sample, levels = c("recurr", "cancer"))
### 获取配对信息,如果不是配对样本,就不需要这个信息
paire_info <- as.factor(as.numeric(as.factor(substring(TCGA_id, 1, 12))))
### 创建metadata
metadata <- data.frame(TCGA_id, sample, paire_info)
##################################################
### 核心环节,构建dds对象
### 要有数据countData,
### 要有分组信息,在colData中
### design部分,本次是配对
### 如果不是配对的是这样:design=~sample
### 第一列如果有基因名称,需要处理,所以tidy=TRUE
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = metadata,
                              design = ~paire_info + sample,
                              tidy = TRUE)

### 过滤
dds <- dds[rowSums(counts(dds)) > 1,]
### 数据标准化
vsd <- vst(dds, blind = FALSE)
### PCA,主成分分析,看分类
plotPCA(vsd, "sample")
plotPCA(vsd, "paire_info")
#################################################################
###,DESeq2主程序
dds <- DESeq(dds)
plotCounts(dds, gene = "ENSG00000137745.10", intgroup = c("sample"))
### 增加returnData参数,可以把刚才的数据导出来,自己做图
plotdata <- plotCounts(dds, gene = "ENSG00000137745.10", intgroup = c("sample", "paire_info"), returnData = T)
library(ggplot2)
ggplot(plotdata, aes(x = sample, y = count, col = sample)) +
  geom_jitter() +
  theme_bw()

library(ggplot2)
ggplot(plotdata, aes(x = sample, y = count, fill = sample)) +
  geom_boxplot() +
  geom_point(size = 2, alpha = 0.5) +
  geom_line(aes(group = paire_info), colour = "black", linetype = "11", size = 1) +
  theme_bw() +
  theme(legend.position = "none")
### 三个大步骤,基因注释,行列转置,添加分组分析。
#######################################################################
### logFC矫正
### results获取差异分析的结果
contrast <- c("sample", "recurr", "cancer")
dd1 <- results(dds, contrast = contrast, alpha = 0.05)
### MA作图
plotMA(dd1, ylim = c(-5, 5))
### logFC矫正
dd2 <- lfcShrink(dds, contrast = contrast, res = dd1, type = "normal")
plotMA(dd2, ylim = c(-5, 5))

library(dplyr)
library(tibble)
library(tidyr)
### 导出差异分析的结果
res <- dd2 %>%
  as.data.frame() %>%
  rownames_to_column("gene_id") %>%
  separate(gene_id, into = c("gene_id"), sep = "\\.")

library(AnnotationDbi)
library(org.Hs.eg.db)
res$symbol <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
res$entrez <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "ENTREZID",
                     keytype = "ENSEMBL",
                     multiVals = "first")
### 修改logFC,p值的名称,为的是跟火山图的代码匹配
colnames(res) <- c("gene_id", "baseMean", "logFC", "lfcSE", "stat", "P.Value", "adj.P.Val", "gene", "entrez")

3.edgeR

rm(list = ls())
### 使用edgeR分析
########################################################################
### 1.准备数据
rownames(exprSet) = exprSet[, 1]
exprSet = exprSet[, -1]
group = metadata$group
########################################################################
### 2.构建对象
library(edgeR)
y <- DGEList(counts = exprSet, group = group)
counts <- y$counts
samples <- y$samples
########################################################################
### 3.数据过滤
keep <- filterByExpr(y)
y <- y[keep, , keep.lib.sizes = FALSE]
barplot(y$samples$lib.size * 1e-6, ylab = "Library size (millions)")
########################################################################
### 4.Normalization
y <- calcNormFactors(y, method = "TMM")
### MD plot
plotMD(cpm(y, log = TRUE), column = 1)
abline(h = 0, col = "red", lty = 2, lwd = 2)
### MDS plot
group = factor(y$samples$group)
colors <- rep(c("blue", "red"), 6)
plotMDS(y, col = colors[group])
########################################################################
### 5. Estimating the dispersion
### y <- estimateDisp(y, design)
### 1.y <- estimateGLMCommonDisp(y, design)
### 2.y <- estimateGLMTrendedDisp(y, design)
### 3.y <- estimateGLMTagwiseDisp(y, design)
design <- model.matrix(~group)
rownames(design) <- colnames(y)
y <- estimateDisp(y, design, robust = TRUE)
fit <- glmQLFit(y, design, robust = TRUE)
#########################################################################
### 6.差异分析
qlf <- glmQLFTest(fit)
plotMD(qlf)
res = as.data.frame(topTags(qlf, n = nrow(y)))
res = cbind(gene_id = rownames(res), res)
########################################################################
### 7.基因注释
### 当前基因名称是ENSEMBL
library(AnnotationDbi)
library(org.Hs.eg.db)
### 增加基因名称SYMBOL
res$symbol <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "SYMBOL",
                     keytype = "ENSEMBL",
                     multiVals = "first")
### 增加ENTREZID
res$entrez <- mapIds(org.Hs.eg.db,
                     keys = res$gene_id,
                     column = "ENTREZID",
                     keytype = "ENSEMBL",
                     multiVals = "first")
colnames(res) <- c("gene_id", "logFC", "logCPM", "F", "P.Value", "adj.P.Val", "gene", "entrez")
#############################################################
### 8.跟Deseq2的比较
### 从影响的通路来比较
library(dplyr)
gene_df <- res %>%
  dplyr::select(gene_id, logFC, gene) %>%
  filter(gene != "") %>%
  distinct(gene, .keep_all = T)

geneList <- gene_df$logFC
names(geneList) = gene_df$gene
geneList = sort(geneList, decreasing = TRUE)

library(clusterProfiler)

hallmarks <- read.gmt("resource/h.all.v7.1.symbols.gmt")
### GSEA 分析
gseahallmarks <- GSEA(geneList, TERM2GENE = hallmarks)
yd <- as.data.frame(gseahallmarks)
library(ggplot2)
dotplot(gseahallmarks, showCategory = 30, split = ".sign") + facet_grid(~.sign)

edgeR_GSEA <- as.data.frame(gseahallmarks)

y <- gseahallmarks
Deseq2_GSEA <- as.data.frame(y)

edgeR_GSEA <- edgeR_GSEA[, c("ID", "NES")]
colnames(edgeR_GSEA) <- c("ID", "edgeR_NES")

Deseq2_GSEA <- Deseq2_GSEA[, c("ID", "NES")]
colnames(Deseq2_GSEA) <- c("ID", "Deseq2_NES")

data = dplyr::full_join(edgeR_GSEA, Deseq2_GSEA, by = "ID")

library(ggplot2)
ggplot(data, aes(edgeR_NES, Deseq2_NES)) +
  geom_point()

cor.test(data$edgeR_NES, data$Deseq2_NES, method = c("spearman"))

library(enrichplot)
pathway.id = "HALLMARK_E2F_TARGETS"
gseaplot2(y, color = "red", geneSetID = pathway.id, pvalue_table = T)

4.自己写差异分析

rm(list = ls())
### 写循环自己算差异
group <- c(rep("con",3),rep("treat",3))

### CD36
gene = "CD36"
data = as.data.frame(t(exprSet))
dd = t.test(data[,gene] ~ group)
treatdata = mean(data[,gene][4:6])
controltdata = mean(data[,gene][1:3])

logFC = log2(treatdata/controltdata)
logFC = round(logFC,3)
meanexp = mean(data[,gene])

alldiff = data.frame()
data = as.data.frame(t(exprSet))
for (i in 1:ncol(data)) {
  gene = colnames(data)[i]
  genevalue = data[,gene]

  dd = t.test(genevalue ~ group)
  p.value = dd$p.value

  treatdata = mean(genevalue[4:6])
  controltdata = mean(genevalue[1:3])

  logFC = treatdata-controltdata
  logFC = round(logFC,3)

  meanexp = mean(genevalue)

  alldiff[i,1] = gene
  alldiff[i,2] = logFC
  alldiff[i,3] = meanexp
  alldiff[i,4] = p.value
}
colnames(alldiff) <- c("gene","logFC","baseMean","P.Value")
### 矫正后的p值
alldiff$adj.P.Val <- p.adjust(alldiff$P.Value,method = "fdr")