转录组分析的基本流程


转录组

背景资料 
# https://mp.weixin.qq.com/s/a3y46NNNO-wardO3XWwh0w

基本流程

流程:
1. 参考基因组、基因注释文件 下载 
2. 质控,需要fastqc及multiqc等
3. 比对:hisat2
4. 计数和归一化
5. 差异分析


流程参考:https://f1000research.com/articles/4-1070/v1
step1:
cat SRR_Acc_List.txt |while read id ;do (prefetch  ${id} &);done
ls /public/project/RNA/airway/sra/*  |while read id;do ( nohup fastq-dump --gzip --split-3 -O ./ ${id} & );done
step2:
ls *gz |xargs fastqc -t 10
multiqc ./ 
step3: 去除低质量的和有接头的
trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o $dir  $fq1 $fq2 &
step4:
lls *gz|cut -d"_" -f 1 |sort -u |while read id;do
ls -lh ${id}_1_val_1.fq.gz   ${id}_2_val_2.fq.gz 
hisat2 -p 10 -x /public/reference/index/hisat/hg38/genome -1 ${id}_1_val_1.fq.gz   -2 ${id}_2_val_2.fq.gz  -S ${id}.hisat.sam
done

ls *.sam|while read id ;do (samtools sort -O bam -@ 5  -o $(basename ${id} ".sam").bam   ${id});done
rm *.sam 
ls *.bam |xargs -i samtools index {}
# ls *.bam |xargs -i samtools flagstat -@ 10  {}  > 
ls *.bam |while read id ;do ( nohup samtools flagstat -@ 1 $id >  $(basename ${id} ".bam").flagstat  & );done
step5:
gtf="/public/reference/gtf/gencode/gencode.v25.annotation.gtf.gz"   
featureCounts -T 5 -p -t exon -g gene_id  -a $gtf -o  all.id.txt  1>counts.id.log 2>&1 &
|htseq-count -f bam -r pos ../clean/sample.bam  /public/reference/gtf/gencode/gencode.v25.annotation.gtf.gz > SRR1039508.count.txt 

差异分析

### ---------------
###
### Firstly run DEseq2 
###
### ---------------
suppressMessages(library(DESeq2))
colData <- data.frame(row.names = colnames(exprSet), group_list = group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = colData,
                              design = ~group_list)
dds <- DESeq(dds)

res <- results(dds, contrast = c("group_list", "treat_2", "control"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG_treat_2 = as.data.frame(resOrdered)
DEG_treat_2 = na.omit(DEG_treat_2)

write.csv(DEG_treat_2, "DEG_treat_2_deseq2.results.csv")


# 热图
library(pheatmap)
choose_gene = head(rownames(need_DEG), 50) ## 50 maybe better
choose_matrix = exprSet[choose_gene,]
choose_matrix = t(scale(t(choose_matrix)))
pheatmap(choose_matrix, filename = paste0(n, '_need_DEG_top50_heatmap.png'))


logFC_cutoff <- with(need_DEG, mean(abs(log2FoldChange)) + 2 * sd(abs(log2FoldChange)))
# logFC_cutoff=1

need_DEG$change = as.factor(ifelse(need_DEG$pvalue < 0.05 & abs(need_DEG$log2FoldChange) > logFC_cutoff,
                                   ifelse(need_DEG$log2FoldChange > logFC_cutoff, 'UP', 'DOWN'), 'NOT')
)
this_tile <- paste0('Cutoff for logFC is ', round(logFC_cutoff, 3),
                    '\nThe number of up gene is ', nrow(need_DEG[need_DEG$change == 'UP',]),
                    '\nThe number of down gene is ', nrow(need_DEG[need_DEG$change == 'DOWN',])
)

#火山图
library(ggplot2)
g = ggplot(data = need_DEG,
           aes(x = log2FoldChange, y = -log10(pvalue),
               color = change)) +
  geom_point(alpha = 0.4, size = 1.75) +
  theme_set(theme_set(theme_bw(base_size = 20))) +
  xlab("log2 fold change") +
  ylab("-log10 p-value") +
  ggtitle(this_tile) +
  theme(plot.title = element_text(size = 15, hjust = 0.5)) +
  scale_colour_manual(values = c('blue', 'black', 'red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g, filename = paste0(n, '_volcano.png'))


if (F) {
  png("DESeq2_qc_dispersions.png", 1000, 1000, pointsize = 20)
  plotDispEsts(dds, main = "Dispersion plot")
  dev.off()

  rld <- rlogTransformation(dds)
  exprMatrix_rlog = assay(rld)

  x = apply(exprMatrix_rlog, 1, mean)
  y = apply(exprMatrix_rlog, 1, mad)
  plot(x, y)

  png("DESeq2_RAWvsNORM.png", height = 800, width = 800)
  par(cex = 0.7)
  n.sample = ncol(exprSet)
  if (n.sample > 40) par(cex = 0.5)
  cols <- rainbow(n.sample * 1.2)
  par(mfrow = c(2, 2))
  boxplot(exprSet, col = cols, main = "expression value", las = 2)
  boxplot(exprMatrix_rlog, col = cols, main = "expression value", las = 2)
  hist(as.matrix(exprSet))
  hist(exprMatrix_rlog)
  dev.off()

}
### ---------------
###
### Then run edgeR 
###
### ---------------
library(edgeR)
g = factor(group_list)
g = relevel(g, g1)
d <- DGEList(counts = exprSet, group = g)
keep <- rowSums(cpm(d) > 1) >= 2
table(keep)
d <- d[keep, , keep.lib.sizes = FALSE]
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
d$samples

dge = d
design <- model.matrix(~0 + factor(group_list))
rownames(design) <- colnames(dge)
colnames(design) <- levels(factor(group_list))

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)

lrt <- glmLRT(fit, contrast = c(-1, 1))
nrDEG = topTags(lrt, n = nrow(dge))
nrDEG = as.data.frame(nrDEG)
head(nrDEG)
# DEG_edgeR = nrDEG
# nrDEG = DEG_edgeR[, c(1, 5)]
# colnames(nrDEG) = c('log2FoldChange', 'pvalue')
# draw_h_v(exprSet, nrDEG, paste0(pro, '_edgeR'))
### ---------------
###
### Then run limma 
###
### --------------- 
suppressMessages(library(limma))
design <- model.matrix(~0 + factor(group_list))
colnames(design) = levels(factor(group_list))
rownames(design) = colnames(exprSet)
design

dge <- DGEList(counts = exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log = TRUE, prior.count = 3)

v <- voom(dge, design, plot = TRUE, normalize = "quantile")
fit <- lmFit(v, design)

group_list
con = paste0(g2, '-', g1)
cat(con)
cont.matrix = makeContrasts(contrasts = c(con), levels = design)
fit2 = contrasts.fit(fit, cont.matrix)
fit2 = eBayes(fit2)

tempOutput = topTable(fit2, coef = con, n = Inf)
DEG_limma_voom = na.omit(tempOutput)
head(DEG_limma_voom)