基于Seurat包单细胞流程
## 背景:
https://satijalab.org/seurat/articles/get_started_v5_new
https://github.com/gao-lab/GLUE/
https://github.com/carmonalab/ProjecTILs
https://pair-code.github.io/understanding-umap/
1.基本步骤
rm(list = ls())
options(BioC_mirror = "https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
library(Seurat)
scdata <- Read10X(data.dir = "./data/10x/filtered_gene_bc_matrices/hg19")
scobj <- CreateSeuratObject(counts = scdata, project = "pbmc",
min.cells = 3, min.features = 200)
library(dplyr)
scobj <- scobj %>%
NormalizeData() %>% #归一化
FindVariableFeatures() %>% #筛选特征,找高变基因
ScaleData() %>% #标准化
RunPCA() %>% #降维
RunUMAP(dims = 1:10) %>% #非线性降维
FindNeighbors() %>% #建立SNN
FindClusters(resolution = 0.5) #分群5
DimPlot(scobj, reduction = "umap", label = TRUE)
Idents(scobj) <- "seurat_clusters"
scobj <- RenameIdents(scobj,
"0" = "Naive CD4+ T",
"1" = "CD14+ Mono",
"2" = "Memory CD4+",
"3" = "B cell",
"4" = "CD8+ T",
"5" = "FCGR3A+ Mono",
"6" = "NK",
"7" = "DC",
"8" = "Platelet"
)
DimPlot(scobj, reduction = "umap", label = T, repel = T) + NoLegend()
1. 质控、筛选、找高变基因
scobj <- CreateSeuratObject(counts = scdata, project = "pbmc",
min.cells = 3, min.features = 200)
### 数据质控
### 主要PercentageFeatureSet函数计算线粒体含量
### 人类使用pattern = "^MT-",小鼠使用pattern = "^mt-"
scobj@meta.data$percent.mt = PercentageFeatureSet(scobj, pattern = "^MT-")
scobj[["percent.mt"]] <- PercentageFeatureSet(scobj, pattern = "^MT-")
scobj@meta.data
VlnPlot(scobj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
### 正式筛选,筛选的是细胞,最终细胞减少
### nFeature_RNA > 200
### nFeature_RNA < 2500
### percent.mt < 5
scobj <- subset(scobj, subset = nFeature_RNA > 200 &
nFeature_RNA < 2500 &
percent.mt < 5)
## 数据标准化: NormalizeData
### 先除以总数,再乘以系数,然后取log
scobj <- NormalizeData(scobj, normalization.method = "LogNormalize", scale.factor = 10000)
## 特征筛选(高变基因)
scobj <- FindVariableFeatures(scobj, selection.method = "vst", nfeatures = 2000)
### 高变基因可视化
### 使用VariableFeatures 函数提取高变基因
top10 <- head(VariableFeatures(scobj), 10)
### 使用VariableFeaturePlot 画图
plot1 <- VariableFeaturePlot(scobj) ## 横坐标基因平均表达量,纵坐标为变异度
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
t1 <- cbind(as.data.frame(rownames(scobj@assays$RNA@features)), scobj@assays$RNA@meta.data)
### 自己画
t1 <- cbind(as.data.frame(rownames(scobj@assays$RNA@features)), scobj@assays$RNA@meta.data)
colnames(t1)[1] <- "gene_name"
rownames(t1) <- t1$gene_name
library(ggplot2)
ggplot(t1, aes(x = vf_vst_counts_mean, y = vf_vst_counts_variance.standardized, color = vf_vst_counts_variable)) +
geom_point() +
scale_color_manual(values = c("black", "red")) +
scale_x_log10() +
theme_bw() +
labs(x = "Average Expression", y = "Standardized Variance")
##加标签
library(ggrepel)
ggplot(t1, aes(x = vf_vst_counts_mean, y = vf_vst_counts_variance.standardized, color = vf_vst_counts_variable)) +
geom_point() +
scale_color_manual(values = c("black", "red")) +
scale_x_log10() +
theme_bw() +
labs(x = "Average Expression", y = "Standardized Variance") +
geom_text_repel(data = t1[top10,], label = top10, color = "black")
2.数据缩放、降维
### 降维之前的必备操作
### 缩放的效果是,基因的平均值是0,方差是1
## 保存时把它删掉
scobj <- ScaleData(scobj, features = rownames(scobj))
scale_data <- scobj@assays$RNA@layers$scale.data
hist(apply(scale_data, 1, mean))
hist(apply(scale_data, 1, var))
hist(matrixStats::rowMeans2(scale_data))
hist(matrixStats::rowVars(scale_data))
## PCA线性降维
scobj <- RunPCA(scobj, features = VariableFeatures(object = scobj))
DimPlot(scobj, reduction = "pca")
##pca data
pca_data <- as.data.frame(scobj@reductions$pca@cell.embeddings)
ggplot(pca_data, aes(x = PC_1, y = PC_2, color = "red")) + geom_point()
### 选择合适的PCA维度
ElbowPlot(scobj) ##这里图片显示10之后pca就没啥变化了
##umap非线性降维,依赖pca结果
scobj <- RunUMAP(scobj, dims = 1:10)
umap_data <- as.data.frame(scobj@reductions$umap@cell.embeddings)
ggplot(umap_data, aes(x = umap_1, y = umap_2, color = "red")) + geom_point()
3.聚类分群
### 聚类分群
### 找紧邻,dims = 1:10 跟UMAP相同
scobj <- FindNeighbors(scobj, dims = 1:10)
### 分群(需要在找近邻后)
scobj <- FindClusters(scobj, resolution = 0.5) ### 会在metadata中增加两列数据"RNA_snn_res.0.5" "seurat_clusters"
DimPlot(scobj, reduction = "umap", label = T) ## 将umap结果分群
metdaat <- scobj@meta.data
scobj <- FindClusters(scobj, resolution = seq(0.2, 1.2, 0.2)) ## 分辨率改为0.2到1.2 ,分群结果细致
DimPlot(scobj, reduction = "umap", label = T) ## 将umap结果分群
library(clustree)
clustree(scobj) ## 看不同分辨率下哪些群被分的 更细致了
### 选择特定分辨率得到的分群此处为RNA_snn_res.0.5
scobj@meta.data$seurat_clusters <- scobj@meta.data$RNA_snn_res.0.5
DimPlot(scobj, reduction = "umap", label = T)
4.分群注释
### 分群注释
### 先分大群, marker哪里来
### http://bio-bigdata.hrbmu.edu.cn/CellMarker/ (此网站找细胞的marker基因)
### B: "MS4A1", "CD79A"
### NK: "GNLY", "NKG7"
### T: "CD3E","CD8A","CD4","IL7R",
### Monocyte: "CD14", "FCGR3A", "LYZ"
### DC "FCER1A"
### Platelet: "PPBP"
"MS4A1" %in% rownames(scobj)
"CD79A" %in% rownames(scobj)
### 使用VlnPlot画marker小提琴图
VlnPlot(scobj, features = c("MS4A1", "CD79A"))
### 使用FeaturePlot画出特征分布图
FeaturePlot(scobj, features = c("MS4A1", "CD79A"), order = TRUE, ncol = 2)
VlnPlot(scobj, features = c("GNLY", "NKG7"))
FeaturePlot(scobj, features = c("GNLY", "NKG7"), order = TRUE, ncol = 2)
VlnPlot(scobj, features = c("CD3E", "CD8A", "CD4", "IL7R"))
FeaturePlot(scobj, features = c("CD3E", "CD8A", "CD4", "IL7R"), order = TRUE, ncol = 2)
VlnPlot(scobj, features = c("CD14", "FCGR3A", "LYZ"))
FeaturePlot(scobj, features = c("CD14", "FCGR3A", "LYZ"), order = TRUE, ncol = 2)
VlnPlot(scobj, features = c("FCER1A", "PPBP"))
FeaturePlot(scobj, features = c("FCER1A", "PPBP"), order = TRUE, ncol = 2)
marker_genes <- c("MS4A1", "CD79A",
"GNLY", "NKG7",
"CD3E", "CD8A", "CD4", "IL7R",
"CD14", "FCGR3A", "LYZ",
"FCER1A",
"PPBP"
)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 5)
### 汇总画图
marker_genes <- c("MS4A1", "CD79A",
"GNLY", "NKG7",
"CD3E", "CD8A", "CD4", "IL7R",
"CD14", "FCGR3A", "LYZ",
"FCER1A",
"PPBP"
)
FeaturePlot(scobj, features = marker_genes, order = TRUE, ncol = 5)
### 再确定细节
marker_genes <- c("CD3E", "CD4", "CD8A",
"CCR7", "SELL", "CREM", "TCF7", "S100A4")
FeaturePlot(scobj, features = marker_genes, order = T, ncol = 5)
marker_genes <- c("CCR7", "SELL", "CREM", "TCF7")
FeaturePlot(scobj, features = marker_genes, order = T, ncol = 2)
marker_genes <- c("RPS12", "FASLG")
FeaturePlot(scobj, features = marker_genes, order = T, ncol = 2)
### 不好确定的时候,换一种作图方式
library(Nebulosa)
marker_genes <- c("CCR7", "SELL", "TCF7", "S100A4")
plot_density(scobj, features = marker_genes) + plot_layout(ncol = 2)
### 确定不了的时候自己分析maker
all_markers <- FindAllMarkers(object = scobj)
all_markers$rank <- all_markers$pct.1 / all_markers$pct.2
## 这里得到了差异表达矩阵,后续会写一下对它单细胞的富集分析(见下一篇)
## all_markers 这里自己找的时候找特异性强的,尽量其他群不表达
## 这里找avg_logfc差异倍数大的,pct1大一点,pct2小点,这样特异性强
marker_genes <- c("RPS12", "FASLG")
FeaturePlot(scobj, features = marker_genes, order = T, ncol = 2)
library(dplyr)
top10_markers <- all_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice(1:10) %>%
ungroup()
5. 确定注释结果
### 确认群的个数
Idents(scobj) <- "seurat_clusters"
head(Idents(scobj))
## 给每个群加注释
scobj <- RenameIdents(scobj,
"0" = "Naive CD4+ T",
"1" = "CD14+ Mono",
"2" = "Memory CD4+",
"3" = "B cell",
"4" = "CD8+ T",
"5" = "FCGR3A+ Mono",
"6" = "NK",
"7" = "DC",
"8" = "Platelet"
)
DimPlot(scobj, label = T)
head(Idents(scobj))
scobj@meta.data$celltype = Idents(scobj)
metadata <- scobj@meta.data
### 保存结果
saveRDS(scobj, file = "output/Seurat_single_sample_scobj.rds")
6.注释结果可视化
DimPlot(scobj, reduction = "umap", label = T)
DimPlot(scobj, reduction = "umap", label = T) + NoLegend()
scCustomize::DimPlot_scCustom(scobj, figure_plot = TRUE)
### 各个细胞群,画图展示
### 出现在scobj@meta.data 中的 列都可以FeaturePlot 展示
scobj@meta.data$cd14 <- ifelse(scobj@meta.data$celltype == "CD14+ Mono", 1, 0)
FeaturePlot(scobj, features = "cd14")
desgin <- model.matrix(~0 + metadata$celltype)
desgin <- as.data.frame(desgin)
colnames(desgin) <- levels(metadata$celltype)
scobj@meta.data <- cbind(scobj@meta.data, desgin)
FeaturePlot(scobj, features = levels(metadata$celltype))
### 可视化展示DotPlot
### 因子来限定细胞的顺序
Idents(scobj) <- factor(Idents(scobj), levels = c("B cell",
"NK",
"Naive CD4+ T",
"Memory CD4+",
"CD8+ T",
"CD14+ Mono",
"FCGR3A+ Mono",
"DC",
"Platelet"))
markers.to.plot <- c("MS4A1", "CD79A",
"GNLY", "NKG7",
"CD3E", "CD8A", "CD4", "IL7R",
"CD14", "FCGR3A", "LYZ",
"FCER1A",
"PPBP")
DotPlot(scobj, features = markers.to.plot, dot.scale = 8)
DotPlot(scobj, features = markers.to.plot, dot.scale = 8) + RotatedAxis()
DotPlot(scobj, features = markers.to.plot, dot.scale = 8) +
coord_flip() +
RotatedAxis()
### Clustered_DotPlot
### 可以自己挑选marker来呈现
### 也可以使用FindAllMarkers批量提取marker
all_markers <- FindAllMarkers(object = scobj)
library(dplyr)
top5_markers <- all_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice(1:5) %>%
ungroup() %>%
pull(gene) %>%
unique()
DotPlot(scobj, features = top5_markers) +
coord_flip() +
RotatedAxis()
scCustomize::Clustered_DotPlot(scobj, features = top5_markers)
### marker 热图
DoHeatmap(scobj, features = top5_markers)
### Identity的大小修改,通过size参数
DoHeatmap(scobj, features = top5_markers, size = 3)
### 基因的大小修改theme(axis.text.y = element_text(size = 8))
DoHeatmap(scobj, features = top5_markers, size = 3) +
theme(axis.text.y = element_text(size = 8))
### subset和downsample 可以随机取每个群的细胞数
DoHeatmap(subset(scobj, downsample = 50), features = top5_markers, size = 3) +
theme(axis.text.y = element_text(size = 8))