单细胞多样本分析Seurat


Seurat多样本

## 背景
多样本分析流程
https://satijalab.org/seurat/archive/v3.0/immune_alignment.html
https://github.com/satijalab/seurat-wrappers

1.读取多个样本数据并合并

library(Seurat)
### 样本1
scdata1 <- data.table::fread('data/immune_control_expression_matrix.txt.gz', data.table = F)
rownames(scdata1) <- scdata1$V1
scdata1 <- scdata1[, -1]
scobj1 <- CreateSeuratObject(counts = scdata1,
                             project = "pbmc_stim",
                             min.cells = 3,
                             min.features = 200)
scobj1@meta.data$group = "STIM"

### 样本2
scdata2 <- data.table::fread("data/immune_stimulated_expression_matrix.txt.gz", data.table = F)
rownames(scdata2) <- scdata2$V1
scdata2 <- scdata2[, -1]
scobj2 <- CreateSeuratObject(counts = scdata2,
                             project = "pbmc_ctrl",
                             min.cells = 3,
                             min.features = 200)
scobj2@meta.data$group = "CTRL"

### 合并数据
data <- list(scobj1, scobj2)
scobj <- merge(x = data[[1]], y = data[-1])

### 也可以在GEO下载数据后自己调整
### https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE96583
library(Matrix)
scdata1 <- Matrix::readMM("data/GSE96583/GSM2560248_2.1.mtx.gz")
gene_id1 <- read.table("data/GSE96583/GSE96583_batch2.genes.tsv.gz")
barcode1 <- read.table("data/GSE96583/GSM2560248_barcodes.tsv.gz")
rownames(scdata1) <- gene_id1$V2
colnames(scdata1) <- barcode1$V1

### 读取另外一个数据
scdata2 <- Matrix::readMM("data/GSE96583/GSM2560249_2.2.mtx.gz")
gene_id2 <- read.table("data/GSE96583/GSE96583_batch2.genes.tsv.gz")
barcode2 <- read.table("data/GSE96583/GSM2560249_barcodes.tsv.gz")
rownames(scdata2) <- gene_id2$V2
colnames(scdata2) <- barcode2$V1

### 如果需要Read10X 就需要三个文件,
### 改造一下,barcodes.tsv.gz, features.tsv.gz,matrix.mtx.gz
scdata <- Read10X(data.dir = "data/GSE96583/stim/")
scobj <- CreateSeuratObject(counts = scdata,
                            project = "pbmc_stim",
                            min.cells = 3,
                            min.features = 200)
scobj@meta.data$group = "STIM"


scdata <- Read10X(data.dir = "data/GSE96583/ctrl/")
scobj <- CreateSeuratObject(counts = scdata,
                            project = "pbmc_ctrl",
                            min.cells = 3,
                            min.features = 200)
scobj@meta.data$group = "CTRL"

2.多样本之间进行批次矫正

### 不批次矫正的结果 两个样本 "umap_native"和“umap”有明显的区分,矫正后样本融合在一起了
library(harmony)
library(SeuratWrappers)

scobj@meta.data$percent.mt <- PercentageFeatureSet(scobj, pattern = "^MT-")
VlnPlot(scobj, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))


## 筛选
scobj <- subset(scobj, subset = nFeature_RNA > 200 &
  nFeature_RNA < 1500
  &
  percent.mt < 5)

## 标准化
scobj <- NormalizeData(scobj)
scobj <- FindVariableFeatures(scobj, selection.method = "vst", nfeatures = 2000)
scobj <- ScaleData(scobj, features = rownames(scobj))
scobj <- RunPCA(scobj, features = VariableFeatures(object = scobj), reduction.name = "pca")
ElbowPlot(scobj)
scobj <- RunUMAP(scobj, reduction = "pca", dims = 1:15, reduction.name = "umap_native")
## 批次矫正
scobj <- RunHarmony(scobj, reduction = "pca", group.by.vars = "group", reduction.save = "harmony")
scobj <- RunUMAP(scobj, reduction = "harmony", dims = 1:15, reduction.name = "umap")

p1 <- DimPlot(scobj, reduction = "umap_native", group.by = "group")
p2 <- DimPlot(scobj, reduction = "umap", group.by = "group")
p1 + p2


cobj <- FindNeighbors(scobj, reduction = "harmony", dims = 1:30)
### 设置多个resolution选择合适的resolution
scobj <- FindClusters(scobj, resolution = seq(0.2, 1.2, 0.1))

library(clustree)
clustree(scobj)

## 上述看图后 resloution 结果为0.5 差不多 后面的结果跟0.5 没有区别
scobj@meta.data$seurat_clusters <- scobj@meta.data$RNA_snn_res.0.5
Idents(scobj) <- "seurat_clusters"
DimPlot(scobj, reduction = "umap", label = T)
DimPlot(scobj, reduction = "umap", group.by = "group")
DimPlot(scobj, reduction = "umap", split.by = "group")

scobj@assays$RNA$scale.data <- as.matrix(scobj@assays$RNA$scale.data)
scobj@assays$RNA$scale.data <- matrix()

FeaturePlot(scobj, features = "MS4A1", order = TRUE, reduction = "umap")
scobj@reductions$umap_naive <- NULL
saveRDS(scobj, file = "out/harmoy.rds")

3.分群

### 先分大群, 
### http://bio-bigdata.hrbmu.edu.cn/CellMarker/
### B: "MS4A1", "CD79A"
### NK: "GNLY", "NKG7"
### T: "CD3E","CD8A","CD4","IL7R", 
### Monocyte: "CD14", "FCGR3A", "LYZ"
### DC "FCER1A"
### Megakaryocytes/Platelet: "PPBP"
### Erythrocytes: "HBB","HBA2"

marker_genes <- c("MS4A1", "CD79A", "CD19")
VlnPlot(scobj, features = marker_genes)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 2)

marker_genes <- c("CD14", "FCGR3A", "LYZ")
VlnPlot(scobj, features = marker_genes)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 2)

### 汇总画图
marker_genes <- c("MS4A1",
                  "GNLY", "NKG7",
                  "CD3E", "CD8A", "CD4", "IL7R",
                  "CD14", "FCGR3A", "LYZ",
                  "FCER1A",
                  "PPBP"
)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 5)

## cDC
marker_genes <- c("CLEC9A", "ITGAM", "ITGAE", "FCER1A")
## pDC
marker_genes <- c("IL3RA", "HLA-DRA")

## T细胞以及B细胞激活
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4110661/
marker_genes <- c("CCR7", "SELL", "CREM", "CD69")
VlnPlot(scobj, features = marker_genes)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 3)

### 不好确定的时候,换一种作图方式(上述的群不好分辨的时候)
library(Nebulosa)
marker_genes <- c("CCR7", "SELL", "CREM", "CD69")
plot_density(scobj, features = marker_genes) + plot_layout(ncol = 2)

### 找出所有的marker
### 还有哪些群不确定的找它排名靠前的marker ,查marker的文献,能查到其图谱就可以
scobj <- JoinLayers(scobj)
all_markers <- FindAllMarkers(scobj)

library(dplyr)
top_markers <- all_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice(1:15) %>%
  ungroup()

#### 拿到群的注释结果,进行marker对应的群注释
Idents(scobj) <- "seurat_clusters"
### 给每个群添加注释
scobj <- RenameIdents(scobj,
                      "0" = "CD14 Mono",
                      "1" = "CD4 Naive T",
                      "2" = "CD4 Memory T",
                      "3" = "CD16 Mono",
                      "4" = "B cell",
                      "5" = "CD8 T",
                      "6" = "NK",
                      "7" = "T activated",
                      "8" = "DC",
                      "9" = "B Activated",
                      "10" = "Mk",
                      "11" = "pDC",
                      "12" = "Mono/Mk Doublets",
                      "13" = "Eryth"
)
DimPlot(scobj, reduction = "umap", label = T)
scobj@meta.data$celltype = Idents(scobj)

### 加入筛选细胞
Idents(scobj) <- "seurat_clusters"
scobj_subset <- subset(scobj, subset = seurat_clusters != 12)
DimPlot(scobj_subset, reduction = "umap", label = T)

scobj_subset1 <- subset(scobj, subset = celltype != "Mono/Mk Doublets")
DimPlot(scobj_subset1, reduction = "umap", label = T)

## 不要小群
scobj_subset2 <- subset(scobj, subset = seurat_clusters %in% c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9))
DimPlot(scobj_subset2, reduction = "umap", label = T)

## 反选
scobj_subset3 <- subset(scobj, subset = celltype %in% c("Mono/Mk Doublets", "Mk", "Eryth"), invert = TRUE)
DimPlot(scobj_subset3, reduction = "umap", label = T)

## 自己确定 反选
`%notin%` <- Negate(`%in%`)
scobj_subset4 <- subset(scobj, celltype %notin% c("Mono/Mk Doublets", "pDC", "Mk"))
DimPlot(scobj_subset4, reduction = "umap", label = T)

## 保存注释结果
Idents(scobj) <- "celltype"
DimPlot(scobj, reduction = "umap", label = T)
#saveRDS(scobj,file = "output/hamony_annotaion.rds")

## 对分好的1群想再分
Idents(scobj) <- "seurat_clusters"
## snn是shared nearest neighbors
## nn是nearest neighbors
scobj1 <- FindSubCluster(
  scobj,
  cluster = "1",
  graph.name = "RNA_snn",
  subcluster.name = "RNA_snn_res.0.5_c1_sub",
  resolution = 0.5
)

Idents(scobj1) <- "RNA_snn_res.0.5_c1_sub"
scobj1 <- RenameIdents(scobj1,
                       "0" = "CD14 Mono",
                       "1_0" = "CD4 Naive T",
                       "1_1" = "CD4 Naive T",
                       "1_2" = "CD8 Naive T",
                       "2" = "CD4 Memory T",
                       "3" = "CD16 Mono",
                       "4" = "B cell",
                       "5" = "CD8 T",
                       "6" = "NK",
                       "7" = "T activated",
                       "8" = "DC",
                       "9" = "B Activated",
                       "10" = "Mk",
                       "11" = "pDC",
                       "12" = "Mono/Mk Doublets",
                       "13" = "Eryth"
)

DimPlot(scobj1, reduction = "umap", label = T)
scobj1@meta.data$celltype_sub = Idents(scobj1)
plot_density(scobj1, c("CD8A", "CCR7"), joint = TRUE) + plot_layout(nrow = 1)
### 要注意reduction指定的问题
### https://hbctraining.github.io/scRNA-seq_online/lessons/09_merged_SC_marker_identification.html
### https://satijalab.org/seurat/articles/integration_introduction.html
### https://satijalab.org/seurat/archive/v3.0/immune_alignment.html

4.注释结果可视化

DimPlot(scobj, reduction = "umap")
DimPlot(scobj, reduction = "umap", label = T)
DimPlot(scobj, reduction = "umap", label = T) + NoLegend()
scCustomize::DimPlot_scCustom(scobj, figure_plot = TRUE)

DimPlot(scobj, reduction = "umap", split.by = "group")
DimPlot(scobj, reduction = "umap", split.by = "group", label = T) + NoLegend()


data <- as.data.frame(table(scobj$group, scobj$celltype))
colnames(data) <- c("group", "CellType", "Freq")
library(dplyr)
df <- data %>%
  group_by(group) %>%
  mutate(Total = sum(Freq)) %>%
  ungroup() %>%
  mutate(Percent = Freq / Total) %>%
  as.data.frame()

df$CellType <- factor(df$CellType, levels = unique(df$CellType))


library(ggplot2)
p <- ggplot(df, aes(x = group, y = Percent, fill = CellType)) +
  geom_bar(position = "fill", stat = "identity", color = 'white', alpha = 1, width = 0.95) +
  #scale_fill_manual(values = mycol) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_classic()
p

### 换种形式
### facet_wrap 可以设置行列参数
ggplot(df, aes(x = group, y = Percent, fill = group)) +
  geom_bar(stat = "identity") +
  facet_wrap(~CellType, nrow = 2) +
  theme_bw() +
  NoLegend()

ggplot(df, aes(x = CellType, y = Percent, fill = group)) +
  geom_bar(stat = "identity") +
  facet_wrap(~group, nrow = 2) +
  theme_bw() +
  NoLegend()

### 筛选数据 和前面seurat的subset比较,理解泛型函数
data <- subset(df, subset = !CellType %in% c("Mono/Mk Doublets", "Mk", "Eryth", "DC", "pDC"))
ggplot(data, aes(x = group, y = Percent, fill = group)) +
  geom_bar(stat = "identity") +
  facet_wrap(~CellType, nrow = 3) +
  theme_bw() +
  NoLegend()


### 可视化展示DotPlot 和 Clustered_DotPlot
library(dplyr)
top_markers <- all_markers %>%
  group_by(cluster) %>%
  arrange(desc(avg_log2FC)) %>%
  slice(1:5) %>%
  ungroup() %>%
  pull(gene) %>%
  unique()
DotPlot(scobj1, features = top_markers)
DotPlot(scobj1, features = top_markers) + RotatedAxis()
DotPlot(scobj1, features = top_markers) +
  coord_flip() +
  RotatedAxis()
DotPlot(scobj1, features = top_markers, dot.scale = 4) +
  coord_flip() +
  RotatedAxis() +
  theme(axis.text.y = element_text(size = 8))

scCustomize::Clustered_DotPlot(scobj1, features = top_markers)

### 选择作图基因
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5",
                     "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1",
                     "GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ", "PRSS57")
### 绘图,主要时候cols参数,给了两个颜色,split.by区分多组
DotPlot(scobj1, features = markers.to.plot,
        cols = c("blue", "red"),
        dot.scale = 6,
        split.by = "group") +
  RotatedAxis() +
  theme(axis.text.y = element_text(size = 8)) +
  theme(axis.text.x = element_text(size = 8))

### marker 热图,需要scale data
DoHeatmap(scobj1, features = top_markers)

### Identity的大小修改,通过size参数
DoHeatmap(scobj1, features = top_markers, size = 3)

### 基因的大小修改theme(axis.text.y = element_text(size = 8))
DoHeatmap(scobj1, features = top_markers, size = 3) +
  theme(axis.text.y = element_text(size = 8))

### subset和downsample 可以随机取每个群的细胞数
DoHeatmap(subset(scobj1, downsample = 50), features = top_markers, size = 3) +
  theme(axis.text.y = element_text(size = 8))


library(scRNAtoolVis)
AverageHeatmap(object = scobj1, markerGene = top_markers)
## 修改基因的字号
AverageHeatmap(object = scobj1, markerGene = top_markers, fontsize = 8)
## 组间比较
AverageHeatmap(object = subset(scobj1, group == "CTRL"), markerGene = top_markers, fontsize = 8) +
  AverageHeatmap(object = subset(scobj, group == "STIM"), markerGene = top_markers, fontsize = 8)

5.差异分析、富集分析

scobj <- JoinLayers(scobj)

scobj$celltype.stim <- paste(scobj$celltype, scobj$group, sep = "_")

Idents(scobj) <- "celltype.stim"
table(scobj$celltype.stim)

sce.markers <- FindAllMarkers(object = scobj,
                              only.pos = TRUE,
                              min.pct = 0.25,
                              thresh.use = 0.25)

library(dplyr)
markers <- sce.markers %>%
  filter(p_val_adj < 0.001)

library(clusterProfiler)
### 名称转换
gid <- bitr(unique(markers$gene), 'SYMBOL', 'ENTREZID', OrgDb = 'org.Hs.eg.db')
### 交叉合并
colnames(gid)[1] <- "gene"
markers <- merge(markers, gid, by = 'gene')

library(tidyr)
markers <- markers %>%
  separate(cluster, into = c("celltype", "group"), sep = "_", remove = F) %>%
  filter(!celltype %in% c("Eryth", "Mk", "Mono/Mk Doublets"))


### 多组富集分析(组组比较)
x = compareCluster(ENTREZID ~ celltype + group, data = markers, fun = 'enrichKEGG')

### 绘图
library(ggplot2)
dotplot(x, label_format = 60, x = "group") +
  facet_grid(~celltype) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
### 换图
dotplot(x, label_format = 60, x = "celltype") +
  facet_grid(~group) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

6.基于差异分析的GSEA分析

### 基于差异分析的GSEA分析
### 同一群细胞中, 处理和对照的差异
interferon.response <- FindMarkers(scobj,
                                   ident.1 = "CD14 Mono_STIM",
                                   ident.2 = "CD14 Mono_CTRL",
                                   logfc.threshold = 0)

gene_df <- interferon.response

### https://www.gsea-msigdb.org/gsea/index.jsp
geneList <- gene_df$avg_log2FC
names(geneList) = rownames(gene_df)
geneList = sort(geneList, decreasing = TRUE)
library(clusterProfiler)
#################################################################
### 1.kegg 通路
genesets <- read.gmt("data/c2.cp.kegg.v2022.1.Hs.symbols.gmt")
y <- GSEA(geneList, TERM2GENE = genesets)
yd <- as.data.frame(y)
### 看整体分布
dotplot(y, showCategory = 12, split = ".sign") + facet_grid(~.sign)
library(enrichplot)
gseaplot2(y, "KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY", color = "red", pvalue_table = T)
gseaplot2(y, "KEGG_RIG_I_LIKE_RECEPTOR_SIGNALING_PATHWAY", color = "red", pvalue_table = T)

### 2.hallmarks gene set
genesets <- read.gmt("data/h.all.v2022.1.Hs.symbols.gmt")
### GSEA
y <- GSEA(geneList, TERM2GENE = genesets)
yd <- as.data.frame(y)
library(ggplot2)
dotplot(y, showCategory = 30, split = ".sign") + facet_grid(~.sign)
library(enrichplot)
gseaplot2(y, "HALLMARK_INTERFERON_ALPHA_RESPONSE", color = "red", pvalue_table = T)

### 3.转录因子
genesets <- read.gmt("resource/ENCODE_TF_ChIP-seq_2015.txt")
y <- GSEA(geneList, TERM2GENE = genesets)
yd <- as.data.frame(y)
dotplot(y, showCategory = 30, split = ".sign") + facet_grid(~.sign)
gseaplot2(y, "STAT2 K562 hg19", color = "red", pvalue_table = T)

### 4.免疫相关基因集
genesets <- read.gmt("data/c7.all.v2022.1.Hs.symbols.gmt")
y <- GSEA(geneList, TERM2GENE = genesets)
yd <- as.data.frame(y)
dotplot(y, showCategory = 5, split = ".sign", label_format = 60) + facet_grid(~.sign)
gseaplot2(y, "HARALAMBIEVA_PBMC_M_M_R_II_AGE_11_22YO_VACCINATED_VS_UNVACCINATED_7YR_UP", color = "red", pvalue_table = T)
### 基因集文件 https://maayanlab.cloud/Enrichr/#libraries