GEO数据分析


GEO数据

背景:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE42872

基本流程:

## 1. 下载数据
library(GEOquery)
f = 'GSE42872_eSet.Rdata'
if (!file.exists(f)) {
  gset <- getGEO('GSE42872', destdir = ".",
                 AnnotGPL = F,     ## 注释文件
                 getGPL = F)       ## 平台文件
  save(gset, file = f)
}
a <- read.table("./GSE42872_series_matrix.txt.gz", header = T, sep = "\t", fill = T, quote = "", comment.char = "!")
## 2. 探针id转换
pd <- gset$GSE42872_series_matrix.txt.gz@phenoData@data
# dat1 <- exprs(gset[[1]])
dat <- exprs(gset[[1]])

library(hugene10sttranscriptcluster.db)
ids <- toTable(hugene10sttranscriptclusterSYMBOL)

dat_final <- dat[rownames(dat) %in% ids$probe_id,]
ids <- ids[match(rownames(dat_final), ids$probe_id),]

# 探针对应的基因一样时选平均数最大的
tmp <- by(dat_final, ids$symbol, function(x) {
  rownames(x)[which.max(rowMeans(x))]
})
dat_final_res <- dat_final[tmp,]
# x = dat_final[ids[,2]=="IGKC",]
# dat_final[rownames(x)[which.max(rowMeans(x))],]
## 3. 差异分析
group_list <- unlist(lapply(as.character(pd$title), function(x) {
  strsplit(x, " ")[[1]][4] }
))

library(limma)

design <- model.matrix(~0 + factor(group_list))
colnames(design) = levels(factor(group_list))
rownames(design) = colnames(dat_final_res)

contrast.matrix <- makeContrasts(paste0(unique(group_list), collapse = "-"),
                                 levels = design)

deg <- function(exprSet, design, contrast.matrix) {
  fit <- lmFit(exprSet, design)
  fit2 <- contrasts.fit(fit, contrast.matrix)
  fit2 <- eBayes(fit2)
  tempOutput = topTable(fit2, coef = 1, n = Inf)
  nrDEG = na.omit(tempOutput)
  return(nrDEG)
}

res <- deg(dat_final_res, design, contrast.matrix)

## volcano 
library(ggpubr)
plot(res$logFC, -log10(res$P.Value))
res$v = -log10(res$P.Value)
ggscatter(res, x = "logFC", y = "v", size = 0.5)
res$g = ifelse(res$P.Value > 0.01, 'stable',
               ifelse(res$logFC > 2, 'up',
                      ifelse(res$logFC < -2, 'down', 'stable')) #接上句else 否则:接下来开始判断那些logFC <1.5 的基因,再if 判断:如果logFC <1.5,则为down(下调)基因,否则为stable基因
)
res$name = rownames(res)
ggscatter(res, x = "logFC", y = "v", size = 0.5, color = 'g')
ggscatter(res, x = "logFC", y = "v", color = "g", size = 0.5,
          label = "name", repel = T,
          #label.select = rownames(df)[df$g != 'stable'] ,
          label.select = head(rownames(res)), #挑选一些基因在图中显示出来
          palette = c("#00AFBB", "#E7B800", "#FC4E07"))

ggscatter(res, x = "AveExpr", y = "logFC", size = 0.2)
res$p_c = ifelse(res$P.Value < 0.001, 'p<0.001',
                 ifelse(res$P.Value < 0.01, '0.001<p<0.01', 'p>0.01'))
ggscatter(res, x = "AveExpr", y = "logFC", color = "p_c", size = 0.2,
          palette = c("green", "red", "black"))


### pheatmap
library(pheatmap)

x = res$logFC
names(x) = rownames(res)
cg = c(names(head(sort(x), 100)),
       names(tail(sort(x), 100)))
pheatmap(dat[cg,], show_colnames = F, show_rownames = F)
n = t(scale(t(dat[cg,]))) #通过“scale”对log-ratio数值进行归一化,现在的dat是行名为探针,列名为样本名,由于scale这个函数应用在不同组数据间存在差异时,需要行名为样本,因此需要用t(dat[cg,])来转换,最后再转换回来
pheatmap(n, show_colnames = F, show_rownames = F)
n[n > 2] = 2
n[n < -2] = -2
pheatmap(n, show_colnames = F, show_rownames = F)
ac = data.frame(g = group_list)
rownames(ac) = colnames(n)
pheatmap(n, show_colnames = F,
         show_rownames = F,
         cluster_cols = F,
         annotation_col = ac,) #列名注释信息为ac即分组信息
## 4.结果注释(富集分析)
ibrary(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)

res$probe_id <- rownames(res)
res <- merge(res, ids, by = "probe_id")


df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
           toType = c("ENTREZID"),
           OrgDb = org.Hs.eg.db)

final = merge(res, df, by.y = 'SYMBOL', by.x = 'symbol')

gene_up = final[final$g == 'UP', 'ENTREZID']
gene_down = final[final$g == 'DOWN', 'ENTREZID']
gene_diff = c(gene_up, gene_down)
gene_all = as.character(final[, 'ENTREZID'])

## GO kegg database analysis
if (T) {
  ###   over-representation test
  install.packages("R.utils")
  R.utils::setOption("clusterProfiler.download.method", 'auto')
  ## 这里因为clusterProfiler需要 r 4.2 以上,后面的 kegg 和 go的暂时没跑出来 ,后面创建个新环境跑
  kk.up <- enrichKEGG(gene = gene_up,
                      organism = 'hsa',
                      universe = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff = 0.9)
  head(kk.up)[, 1:6]
  dotplot(kk.up); ggsave('kk.up.dotplot.png')
  kk.down <- enrichKEGG(gene = gene_down,
                        organism = 'hsa',
                        universe = gene_all,
                        pvalueCutoff = 0.9,
                        qvalueCutoff = 0.9)
  head(kk.down)[, 1:6]
  dotplot(kk.down); ggsave('kk.down.dotplot.png')
  kk.diff <- enrichKEGG(gene = gene_diff,
                        organism = 'hsa',
                        pvalueCutoff = 0.05)
  head(kk.diff)[, 1:6]
  dotplot(kk.diff); ggsave('kk.diff.dotplot.png')

  kegg_diff_dt <- as.data.frame(kk.diff)
  kegg_down_dt <- as.data.frame(kk.down)
  kegg_up_dt <- as.data.frame(kk.up)
  down_kegg <- kegg_down_dt[kegg_down_dt$pvalue < 0.05,]; down_kegg$group = -1
  up_kegg <- kegg_up_dt[kegg_up_dt$pvalue < 0.05,]; up_kegg$group = 1

  kegg_plot <- function(up_kegg, down_kegg) {
    dat = rbind(up_kegg, down_kegg)
    colnames(dat)
    dat$pvalue = -log10(dat$pvalue)
    dat$pvalue = dat$pvalue * dat$group

    dat = dat[order(dat$pvalue, decreasing = F),]

    g_kegg <- ggplot(dat, aes(x = reorder(Description, order(pvalue, decreasing = F)), y = pvalue, fill = group)) +
      geom_bar(stat = "identity") +
      scale_fill_gradient(low = "blue", high = "red", guide = FALSE) +
      scale_x_discrete(name = "Pathway names") +
      scale_y_continuous(name = "log10P-value") +
      coord_flip() +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5)) +
      ggtitle("Pathway Enrichment")
  }

  g_kegg = kegg_plot(up_kegg, down_kegg)


  ggsave(g_kegg, filename = 'kegg_up_down.png')

  ###  GSEA 
  data(geneList, package = "DOSE")
  geneList = final$logFC
  names(geneList) = final$ENTREZID
  geneList = sort(geneList, decreasing = T)

  kk_gse <- gseKEGG(geneList = geneList,
                    organism = 'hsa',
                    nPerm = 1000,
                    minGSSize = 120,
                    pvalueCutoff = 0.9,
                    verbose = FALSE)
  head(kk_gse)[, 1:6]
  gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))

  down_kegg <- kk_gse[kk_gse$pvalue < 0.05 & kk_gse$enrichmentScore < 0,]; down_kegg$group = -1
  up_kegg <- kk_gse[kk_gse$pvalue < 0.05 & kk_gse$enrichmentScore > 0,]; up_kegg$group = 1

  g_kegg = kegg_plot(up_kegg, down_kegg)
  print(g_kegg)
  ggsave(g_kegg, filename = 'kegg_up_down_gsea.png')


}

{

  g_list = list(gene_up = gene_up,
                gene_down = gene_down,
                gene_diff = gene_diff)

  if (F) {
    go_enrich_results <- lapply(g_list, function(gene) {
      lapply(c('BP', 'MF', 'CC'), function(ont) {
        cat(paste('Now process ', ont))
        ego <- enrichGO(gene = gene,
                        universe = gene_all,
                        OrgDb = org.Hs.eg.db,
                        ont = ont,
                        pAdjustMethod = "BH",
                        pvalueCutoff = 0.99,
                        qvalueCutoff = 0.99,
                        readable = TRUE)

        return(ego)
      })
    })
  }

  n1 = c('gene_up', 'gene_down', 'gene_diff')
  n2 = c('BP', 'MF', 'CC')
  for (i in 1:3) {
    for (j in 1:3) {
      fn = paste0('dotplot_', n1[i], '_', n2[j], '.png')
      cat(paste0(fn, '\n'))
      png(fn, res = 150, width = 1080)
      print(dotplot(go_enrich_results[[i]][[j]]))
      dev.off()
    }
  }

}
## 5.生存分析 
# 用my.surv <- surv(OS_MONTHS,OS_STATUS=='DECEASED')构建生存曲线。
# 用kmfit2 <- survfit(my.surv~TUMOR_STAGE_2009)来做某一个因子的KM生存曲线。
# 用 survdiff(my.surv~type, data=dat)来看看这个因子的不同水平是否有显著差异,其中默认用是的logrank test 方法。
# 用coxph(Surv(time, status) ~ ph.ecog + tt(age), data=lung) 来检测自己感兴趣的因子是否受其它因子(age,gender等等)的影响。
load(file = './GSE11121_survival/step1-output.Rdata')
exprSet = dat
dim(exprSet)
colnames(phe) = c('event', 'grade', 'node', 'size', 'time')
phe = as.data.frame(apply(phe, 2, as.numeric))
boxplot(phe$size)
phe$size = ifelse(phe$size > median(phe$size), 'big', 'small')

library(survival)
library(survminer)
# 利用ggsurvplot快速绘制漂亮的生存曲线图
sfit <- survfit(Surv(time, event) ~ size, data = phe)
summary(sfit)
ggsurvplot(sfit, conf.int = F, pval = TRUE)
ggsurvplot(sfit, palette = c("#E7B800", "#2E9FDF"),
           risk.table = TRUE, pval = TRUE,
           conf.int = TRUE, xlab = "Time in months",
           ggtheme = theme_light(),
           ncensor.plot = TRUE)
## 多个 ggsurvplots作图生存曲线代码合并代码公布
sfit1 = survfit(Surv(time, event) ~ size, data = phe)
sfit2 = survfit(Surv(time, event) ~ grade, data = phe)
splots <- list()
splots[[1]] <- ggsurvplot(sfit1, pval = TRUE, data = phe, risk.table = TRUE)
splots[[2]] <- ggsurvplot(sfit2, pval = TRUE, data = phe, risk.table = TRUE)
# Arrange multiple ggsurvplots and print the output
arrange_ggsurvplots(splots, print = TRUE, ncol = 2, nrow = 1, risk.table.height = 0.4)
# 可以看到grade跟生存显著相关,而size跟病人生存的关系并不显著。

## 挑选感兴趣的基因做差异分析
phe$CBX4 = ifelse(exprSet['CBX4',] > median(exprSet['CBX4',]), 'high', 'low')
table(phe$CBX4)
ggsurvplot(survfit(Surv(time, event) ~ CBX4, data = phe), conf.int = F, pval = TRUE)
phe$CBX6 = ifelse(exprSet['CBX6',] > median(exprSet['CBX6',]), 'high', 'low')
table(phe$CBX6)
ggsurvplot(survfit(Surv(time, event) ~ CBX6, data = phe), conf.int = F, pval = TRUE)
## 批量生存分析 使用  logrank test 方法
mySurv = with(phe, Surv(time, event))
log_rank_p <- apply(exprSet, 1, function(gene) {
  #gene=exprSet[1,]
  phe$group = ifelse(gene > median(gene), 'high', 'low')
  data.survdiff = survdiff(mySurv ~ group, data = phe)
  p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
  return(p.val)
})
log_rank_p = sort(log_rank_p)
head(log_rank_p)
boxplot(log_rank_p)
phe$H6PD = ifelse(exprSet['H6PD',] > median(exprSet['H6PD',]), 'high', 'low')
table(phe$H6PD)
ggsurvplot(survfit(Surv(time, event) ~ H6PD, data = phe), conf.int = F, pval = TRUE)

# 前面如果我们使用了WGCNA找到了跟grade相关的基因模块,然后在这里就可以跟生存分析的显著性基因做交集
# 这样就可以得到既能跟grade相关,又有临床预后意义的基因啦。

## 批量生存分析 使用 coxph 回归方法
mySurv = with(phe, Surv(time, event))
cox_results <- apply(exprSet, 1, function(gene) {
  group = ifelse(gene > median(gene), 'high', 'low')
  survival_dat <- data.frame(group = group, grade = phe$grade, size = phe$size, stringsAsFactors = F)
  m = coxph(mySurv ~ grade + size + group, data = survival_dat)

  beta <- coef(m)
  se <- sqrt(diag(vcov(m)))
  HR <- exp(beta)
  HRse <- HR * se

  #summary(m)
  tmp <- round(cbind(coef = beta, se = se, z = beta / se, p = 1 - pchisq((beta / se)^2, 1),
                     HR = HR, HRse = HRse,
                     HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1) / HRse)^2, 1),
                     HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                     HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
  return(tmp['grouplow',])

})
cox_results = t(cox_results)
table(cox_results[, 4] < 0.05)
cox_results[cox_results[, 4] < 0.05,]

length(setdiff(rownames(cox_results[cox_results[, 4] < 0.05,]),
               names(log_rank_p[log_rank_p < 0.05])
))
length(setdiff(names(log_rank_p[log_rank_p < 0.05]),
               rownames(cox_results[cox_results[, 4] < 0.05,])
))
length(unique(names(log_rank_p[log_rank_p < 0.05]),
              rownames(cox_results[cox_results[, 4] < 0.05,])
))
save(log_rank_p,cox_results,exprSet,phe,file = 'surviva.Rdata')