GRN 基因调控网络
## 背景
http://www.seqyuan.com/GRN.html
https://lishensuo.github.io/posts/bioinfo/010%E5%8D%95%E7%BB%86%E8%83%9E%E5%88%86%E6%9E%90%E5%B7%A5%E5%85%B7--%E8%BD%AC%E5%BD%95%E5%9B%A0%E5%AD%90pyscenic/
## 数据securt多样本数据
外源刺激 -> TF(GRN) -> 转录组的变化 -> 通路/生物学过程的变化
1.pySCENIC数据准备
## 准备pySCENIC的输入文件:
## 1. 基因表达矩阵, meta cell matrix (For grnboost, step1)
## 2. TF list, download from
### (1) 自定义
### (2) SCENIC: https://resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt
### (3) AnimalTFDB: http://bioinfo.life.hust.edu.cn/AnimalTFDB4/static/download/TF_list_final/Homo_sapiens_TF
## 3. 参考文件:
### (1) motif ranking tracks (.feather, 二进制文件, rows are motifs, columns are genes, values are ranks)
### (2) motif annotation (.tbl, txt文件, motif -> TFs)
rm(list = ls())
library(Seurat)
library(tidyverse)
makeMetaCells <- function(seu, min.cells = 10, reduction = "umap", dims = 1:2, k.param = 10, cores = 1) {
seu <- seu %>%
FindNeighbors(reduction = reduction, dims = dims, k.param = k.param) %>%
FindClusters(res = 50)
metadata <- seu@meta.data
metadata$METACELL_ID <- factor(metadata$seurat_clusters)
dge_mat <- seu[["RNA"]]@counts
dge_mat_mc <- parallel::mclapply(levels(metadata$METACELL_ID), function(xx) {
cells <- rownames(subset(metadata, METACELL_ID == xx))
Matrix::rowSums(dge_mat[, cells])
}, mc.cores = cores)
dge_mat_mc <- do.call(cbind, dge_mat_mc)
metacell_metadata <-
metadata[["METACELL_ID"]] %>%
table() %>%
as.data.frame()
colnames(metacell_metadata) <- c("METACELL_ID", "CELL_COUNT")
rownames(metacell_metadata) <- metacell_metadata[["METACELL_ID"]]
kept.cells <- subset(metacell_metadata, CELL_COUNT >= min.cells)[["METACELL_ID"]]
metacells <- list(
mat = dge_mat_mc[, kept.cells],
metadata = metacell_metadata[kept.cells,]
)
colnames(metacells$mat) <- paste0(seu@project.name, ".METACELL_", kept.cells)
rownames(metacells$metadata) <- colnames(metacells$mat)
metacells
}
### 读入数据
## clusters were annotated after batch effect removal via harmony.
seu <- readRDS("data/infb-pbmc.seurat.rds")
DimPlot(seu, group.by = "celltype", split.by = "group") & ggsci::scale_color_d3("category20")
### 创建meta cell
## 按样本处理
seu.list <- SplitObject(seu, split.by = "group")
seu.list[[1]]@project.name <- "pbmc_stim"
seu.list[[2]]@project.name <- "pbmc_ctrl"
metacells.list <- lapply(seq_along(seu.list), function(ii) {
makeMetaCells(
seu = seu.list[[ii]],
min.cells = 10,
reduction = "umap",
dims = 1:2,
k.param = 10,
cores = 1)
})
mc.mat <- lapply(metacells.list, function(mc) mc$mat) %>% Reduce(cbind, .)
mc.cellmeta <- lapply(metacells.list, function(mc) mc$metadata) %>% Reduce(rbind, .)
## meta cell
seu2 <- CreateSeuratObject(mc.mat)
seu2 <- NormalizeData(seu2)
FeatureScatter(seu, feature1 = "CD8A", feature2 = "CD8B") +
FeatureScatter(seu2, feature1 = "CD8A", feature2 = "CD8B")
### 在这里做metacell的好处就是在干扰素刺激下 STAT1和STAT2两个转录因子应该有很强的相关性,但原始数据没有,但是做了metacell就有了相关性
## 上述的代码暂时没能理解为啥做了,两个转录因子有这么强的相关性,后续过来理解下
FeatureScatter(seu, feature1 = "STAT1", feature2 = "STAT2") +
FeatureScatter(seu2, feature1 = "STAT1", feature2 = "STAT2")
## 准备pySCENIC的输入文件
## (1) TF list文件(optional),可以使用预定义的TF list,例如pySCENIC官方提供的,或者animalTFDB提供的。
motif2tfs <- data.table::fread("cisTarget_db/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl")
TFs <- sort(unique(motif2tfs$gene_name))
writeLines(TFs, "cisTarget_db/hsa_hgnc_tfs.motifs-v10.txt")
## (2) meta cell matrix (for step1): *.csv or *.loom
## (2.1) 过滤低表达基因
expr.in.cells <- rowSums(mc.mat > 0)
mc.mat <- mc.mat[expr.in.cells >= 5,]
## (2.2) 过滤不在cisTargetDB中的基因
cisdb <- arrow::read_feather("cisTarget_db/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather")
genes.use <- intersect(colnames(cisdb), rownames(mc.mat))
mc.mat <- mc.mat[genes.use,]
## 保存为loom文件
loom <- SCopeLoomR::build_loom(
file.name = "output/00-2.mc_mat_for_step1.loom",
dgem = mc.mat,
default.embedding = NULL
)
loom$close()
## 释放loom对象,允许其它文件占用loom文件
rm(loom)
gc()
2.pySCENIC流程
## inputs
f_loom_grn = .. / output / 00 - 2.mc_mat_for_step1.loom
## outputs
grn_output=../output/step1_adj.tsv
ctx_output=../output/step2_reg.tsv
## reference
f_tfs=../cisTarget_db/hsa_hgnc_tfs.motifs-v10.txt
f_motif_path=../cisTarget_db/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl
f_db_500bp=../cisTarget_db/hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather
f_db_10kb=../cisTarget_db/hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather
#### 1. build GRN
pyscenic grn \
--seed 777 \
--num_workers 10 \
--method grnboost2 \
-o $grn_output \
$f_loom_grn \
/cisdb/$f_tfs
# time arboreto_with_multiprocessing.py \
# $f_loom_grn \
# $f_tfs \
# --method grnboost2 \
# --output $grn_output \
# --num_workers 10 \
# --seed 777
#### 2. cisTarget
time pyscenic ctx \
$grn_output \
$f_db_500bp $f_db_10kb \
--annotations_fname$f_motif_path\
--expression_mtx_fname $f_loom_grn\
--output$ctx_output\
--num_workers10
###### 这里推荐用 singularity ,后续学一下这个容器用法
#### 上述结果转换格式
import sys
import pandas as pd
from pyscenic.cli.utils import load_signatures
# pyscenic ctx的输出文件
regulon_file = "/data/step2_reg.tsv"
project = "/data/ifnb_pbmc"
# 最小的regulon基因数,用于过滤regulon
min_regulon_size = 10
def get_motif_logo(regulon):
base_url = "http://motifcollections.aertslab.org/v10nr_clust/logos/"
for elem in regulon.context:
if elem.endswith('.png'):
return (base_url + elem)
sys.stdout.flush()
regulons = load_signatures(regulon_file)
select_cols = [i.name for i in regulons if len(i.genes) >= min_regulon_size]
gmt_file = project + ".regulons.gmt"
txt_file = project + ".regulons.txt"
fo1 = open(gmt_file, 'w')
fo2 = open(txt_file, 'w')
for i in regulons:
if i.name in select_cols:
motif = get_motif_logo(i)
genes = "\t".join(i.genes)
tf = "%s(%sg)" % (i.transcription_factor, len(i.genes))
fo1.write("%s\t%s\t%s\n" % (tf, motif, genes))
fo2.write("%s\t%s\t%s\n" % (tf, motif, genes.replace("\t", ",")))
3.pySCENIC的结果探索:用各细胞的转录因子分析
(1)刺激后影响最大的细胞类型是什么
library(Seurat)
library(tidyverse)
library(patchwork)
seu <- subset(seu, celltype %in% c("Mono/Mk Doublets", "Eryth"), invert = TRUE)
DimPlot(seu, group.by = "celltype", reduction = "umap", label = T)
celltype.levels <- c("CD14 Mono", "CD16 Mono", "DC", "pDC", "B cell", "B Activated",
"NK", "CD8 T", "CD4 Memory T", "T activated", "CD4 Naive T", "Mk")
seu$celltype <- factor(seu$celltype, levels = celltype.levels)
## 导入regulon (gene list)
regulons <- clusterProfiler::read.gmt("output/02-ifnb_pbmc.regulons.gmt")
## data.frame -> list, list中的每个元素为一个gene set
rg.names <- unique(regulons$term)
regulon.list <- lapply(rg.names, function(rg) {
subset(regulons, term == rg)$gene
})
names(regulon.list) <- sub("[0-9]+g", "\\+", rg.names)
summary(sapply(regulon.list, length))
print(regulon.list[1])
ComputeModuleScore <- function(x, ...) UseMethod('ComputeModuleScore')
ComputeModuleScore.default <- function(x, gene.sets, min.size = 20, batch.size = 500, cores = 1, ...) {
if (!is.list(gene.sets)) {
stop("'gene.sets' should be a list or data.frame!")
}
gene.sets <- gene.sets[sapply(gene.sets, length) >= min.size]
n.cells <- ncol(x)
batches <- floor((1:n.cells - 1) / batch.size)
batch.levels <- unique(batches)
aucell <- function(i) {
dge.tmp <- x[, batches == i]
cr <- AUCell::AUCell_buildRankings(dge.tmp, nCores = 1, plotStats = F, verbose = F)
auc <- AUCell::AUCell_calcAUC(gene.sets, cr, nCores = 1, verbose = F)
AUCell::getAUC(auc)
}
auc_scores <- parallel::mclapply(batch.levels, aucell, mc.cores = cores)
do.call(cbind, auc_scores)
}
ComputeModuleScore.Seurat <- function(x, gene.sets, min.size = 20, batch.size = 500, cores = 1, assay = Seurat::DefaultAssay(x), ...) {
dge <- x[[assay]]@counts
ras_mat <- ComputeModuleScore.default(x = dge, gene.sets, min.size, batch.size, cores)
x[["AUCell"]] <- Seurat::CreateAssayObject(data = ras_mat)
return(x)
}
## 用AUCell计算RAS matrix
## RAS = regulon activity score
seu <- ComputeModuleScore(seu, gene.sets = regulon.list, min.size = 10, cores = 1)
seu
DefaultAssay(seu) <- "AUCell"
p1 <- FeaturePlot(seu, features = "STAT2(+)", split.by = "group")
p2 <- FeaturePlot(seu, features = "STAT2", split.by = "group")
(p1 / p2) & scale_color_viridis_c()
VlnPlot(seu, group.by = "celltype", features = "STAT2(+)", pt.size = 0,
split.by = "group", split.plot = TRUE, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = "STAT2", pt.size = 0,
split.by = "group", split.plot = TRUE, cols = c("blue", "red"))
## 用RAS matrix计算UMAP
seu <- RunUMAP(object = seu,
features = rownames(seu),
metric = "correlation", # 注意这里用correlation效果最好
reduction.name = "umapRAS",
reduction.key = "umapRAS_")
## 可视化:UMAP on harmony
p1 <- DimPlot(seu, reduction = "umap", group.by = "celltype") +
ggsci::scale_color_d3("category20") +
NoLegend()
p2 <- DimPlot(seu, reduction = "umap", group.by = "group") + NoLegend()
## 可视化:UMAP on RAS
##(批次矫正后一定程度掩盖了组间差别,此方法在批次矫正后也一定程度上显示了组间的差别)
## 这里确保组间的差异为什么不是批次,因为基因打分是天然去批次,给的是基因名,而不是表达量
p3 <- DimPlot(seu, reduction = "umapRAS", group.by = "celltype") + ggsci::scale_color_d3("category20")
p4 <- DimPlot(seu, reduction = "umapRAS", group.by = "group")
(p1 + p3) / (p2 + p4)
## 推测:INFB对髓系细胞的影响更大(cd14 和 cd16)
## 换一种方式:PCA
DefaultAssay(seu) <- "AUCell"
seu <- ScaleData(seu)
seu <- RunPCA(object = seu,
features = rownames(seu),
reduction.name = "pcaRAS",
reduction.key = "pcaRAS_")
## 可视化:PCA on RAS
p3 <- DimPlot(seu, reduction = "pcaRAS", group.by = "celltype") + ggsci::scale_color_d3("category20")
p4 <- DimPlot(seu, reduction = "pcaRAS", group.by = "group")
p3 + p4
## PC1 encoding the regulons related to cell type
## PC2 encoding the regulons affected by INFB treatment
## The INFB induced transcriptome shift is orthogonal to the cell identity transcriptional programs.
VlnPlot(seu, group.by = "celltype", features = "pcaRAS_1", pt.size = 0,
split.by = "group", split.plot = TRUE, cols = c("blue", "red"))
VlnPlot(seu, group.by = "celltype", features = "pcaRAS_2", pt.size = 0,
split.by = "group", split.plot = TRUE, cols = c("blue", "red"))
(2)哪些转录因子(TF)响应了外源刺激后导致样本的的转录组变化?
A:鉴定出差异的调控的转录因子,其在组间和在哪种细胞类型中变化最大
library(Seurat)
library(tidyverse)
library(patchwork)
#### Method 1: feature loadings of PCA ####
p1 <- DimPlot(seu, reduction = "pcaRAS", group.by = "celltype") + ggsci::scale_color_d3("category20")
p2 <- DimPlot(seu, reduction = "pcaRAS", group.by = "group")
p1 + p2
VlnPlot(seu, group.by = "celltype", features = "pcaRAS_2", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F, cols = c("blue", "red"))
## PC2-high-loading regulons
## gene expression matrix (cells x genes) = cell embeddings (cells x nPCs) %*% feature loadings (nPCs x genes)
## Here, only the high variable genes were involved.
## If you want to fetch all genes loadings, you should perform ProjectDim first
# seu <- ProjectDim(seu, reduction = "pcaRAS", do.center = T)
# We didn't need do this because we use all regulons to perform PCA.
Loadings(seu, reduction = "pcaRAS")
VizDimLoadings(seu, reduction = "pcaRAS", dims = 2, balanced = T, projected = F)
VlnPlot(seu, group.by = "celltype", features = "FOS(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F, cols = c("blue", "red")) + ylab("TF activity")
#### Method 2: Variance Decomposition ####
VarDecompose <- function(data, meta.data, vd.vars, genes = "all", cores = -1) {
## check params
if (missing(data) ||
missing(meta.data) ||
missing(vd.vars)) {
stop("Must provide 'data', 'meta.data', and 'vd.vars'.")
}
if (is.null(colnames(meta.data)) || is.null(rownames(meta.data))) {
stop("The row and column names of 'meta.data' should be provided.")
}
if (is.null(colnames(data)) || is.null(rownames(data))) {
stop("The row and column names of 'data' should be provided.")
}
if (!all(rownames(data) == rownames(meta.data))) {
stop("The row names of 'data' and 'meta.data' should be matched.")
}
if (!all(vd.vars %in% colnames(meta.data))) {
vd.vars.404 <- setdiff(vd.vars, colnames(meta.data))
stop(paste("vd.vars:", vd.vars.404, "is(are) not found in 'meta.data'"))
}
if (length(genes) == 1) {
if (genes == "all") {
genes <- colnames(data)
} else {
stop("'genes' should be 'all' or a vector.")
}
} else {
genes.404 <- setdiff(genes, colnames(data))
if (length(genes.404) > 0) {
warning(paste(length(genes.404), "gene(s) are not found in 'data'."))
genes <- setdiff(genes, genes.404)
}
}
cores <- ifelse(cores > 0, cores, parallel::detectCores())
## prepare data for VD
vd.vars.str <- sapply(vd.vars, function(xx) sprintf("(1|%s)", xx))
modelFormulaStr <- paste("expression ~", paste(vd.vars.str, collapse = "+"))
data.use <- cbind(data[, genes], meta.data)
## exe VD
vd.res <- do.call(rbind, parallel::mclapply(genes, function(genename) {
data.model <- data.use[, c(vd.vars, genename)]
colnames(data.model) <- c(vd.vars, "expression")
tryCatch({
model <- suppressWarnings(lme4::lmer(stats::as.formula(modelFormulaStr), data = data.model, REML = TRUE, verbose = FALSE))
results <- as.data.frame(lme4::VarCorr(model))
rownames(results) <- results$grp
results <- results[c(vd.vars, "Residual"),]
frac.var <- results$vcov / sum(results$vcov)
res.tmp <- c("OK", frac.var)
},
error = function(e) {
print(e)
res.tmp <- c("FAIL", rep(-1, length(vd.vars) + 1))
})
names(res.tmp) <- c("status", vd.vars, "residual")
as.data.frame(as.list(res.tmp)) # return
}, mc.cores = cores)
)
rownames(vd.res) <- genes
vd.res %<>% as.data.frame()
vd.res <- vd.res %>% dplyr::mutate(gene = rownames(vd.res), .before = 1)
# vd.res <- vd.res %>% as.data.frame() %>% dplyr::mutate(gene = rownames(.), .before=1)
for (i in 3:ncol(vd.res)) {
vd.res[[i]] %<>% as.numeric()
}
return(vd.res)
}
DefaultAssay(seu) <- "AUCell"
vd.vars <- c("celltype", "group")
meta.data <- seu@meta.data[, vd.vars]
ras.data <- FetchData(seu, vars = rownames(seu))
vd.res <- VarDecompose(data = ras.data, meta.data = meta.data, vd.vars = vd.vars, cores = 5)
## PCA和VD结果的一致性
vd.res$PC1.loadings <- Loadings(seu, reduction = "pcaRAS")[vd.res$gene, 1]
vd.res$PC2.loadings <- Loadings(seu, reduction = "pcaRAS")[vd.res$gene, 2]
to.label <- subset(vd.res, celltype > 0.25 & group > 0.15)
ggplot(vd.res, aes(celltype, group, color = abs(PC1.loadings))) +
geom_point(size = 2) +
geom_hline(yintercept = 0.15, color = "blue", linetype = "dashed") +
geom_vline(xintercept = 0.25, color = "blue", linetype = "dashed") +
ggrepel::geom_text_repel(inherit.aes = F, data = to.label, aes(celltype, group, label = gene)) +
xlab("Fraction of variance by cell type") +
ylab("Fraction of variance by group") +
scale_color_viridis_c() +
theme_bw(base_size = 15)
ggplot(vd.res, aes(celltype, group, color = abs(PC2.loadings))) +
geom_point(size = 2) +
xlab("Fraction of variance by cell type") +
ylab("Fraction of variance by group") +
scale_color_viridis_c() +
theme_bw(base_size = 15)
ggplot(vd.res, aes(group)) +
geom_density()
to.label <- subset(vd.res, group > 0.4)
ggplot(vd.res, aes(PC2.loadings, group)) +
geom_point(size = 2) +
ggrepel::geom_text_repel(inherit.aes = F, data = to.label, aes(PC2.loadings, group, label = gene)) +
ylab("Fraction of variance by group") +
theme_bw(base_size = 15)
VlnPlot(seu, group.by = "celltype", features = "HESX1(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F, cols = c("blue", "red")) + ylab("TF activity")
FeaturePlot(seu, reduction = "umap", features = "HESX1(+)", split.by = "group")
VlnPlot(seu, group.by = "celltype", features = "ETV6(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F, cols = c("blue", "red")) + ylab("TF activity")
FeaturePlot(seu, reduction = "umap", features = "ETV6(+)", split.by = "group")
VlnPlot(seu, group.by = "celltype", features = "IRF7(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F, cols = c("blue", "red")) + ylab("TF activity")
FeaturePlot(seu, reduction = "umap", features = "IRF7(+)", split.by = "group")
#### Method 3: DE test ####
DERegulon <- function(seu, celltype, group, test.use = "wilcox") {
seu$new.group <- paste(seu[[celltype, drop = T]], seu[[group, drop = T]], sep = "_")
Idents(seu) <- factor(seu$new.group)
celltypes <- levels(seu[[celltype, drop = T]])
groups <- levels(seu[[group, drop = T]])
seu2 <- subset(seu, group == groups[1])
Idents(seu2) <- seu2[[celltype, drop = T]]
baseline.levels <- AverageExpression(seu2, assays = "AUCell")$AUCell
de.list <- lapply(celltypes, function(ct) {
message(glue::glue("processing {ct} ..."))
ct1 <- paste(ct, groups[1], sep = "_")
ct2 <- paste(ct, groups[2], sep = "_")
de <- FindMarkers(seu, ident.1 = ct1, ident.2 = ct2, test.use = "wilcox", fc.name = "avg_diff", logfc.threshold = 0)
de$change <- ifelse(de$avg_diff > 0,
paste("up in", groups[1]),
paste("up in", groups[2]))
de$avg_diff <- -de$avg_diff
de$diff_rate <- de$avg_diff / baseline.levels[rownames(de), ct]
de$group <- ct
return(de)
})
names(de.list) <- celltypes
de.list
}
#' regulon activity ~ (0,1)
format_change.df <- . %>%
mutate(change = case_when(p_val_adj < 1e-6 & avg_diff > 0.3 ~ '+++',
p_val_adj < 1e-6 &
avg_diff > 0.1 &
avg_diff <= 0.3 ~ '++',
p_val_adj < 1e-6 &
avg_diff > 0.05 &
avg_diff <= 0.1 ~ '+',
p_val_adj < 1e-6 & avg_diff < -0.3 ~ '---',
p_val_adj < 1e-6 &
avg_diff < -0.1 &
avg_diff >= -0.3 ~ '--',
p_val_adj < 1e-6 &
avg_diff < -0.05 &
avg_diff >= -0.1 ~ '-',
TRUE ~ "")) %>%
mutate(gene = rownames(.)) %>%
dplyr::select(gene, change, group)
#'
format_change <- function(mylist, celltype.levels) {
f.change <- lapply(mylist, format_change.df) %>%
Reduce(rbind, .) %>%
pivot_wider(names_from = "group", values_from = "change")
f.change <- f.change %>% dplyr::select(c("gene", all_of(celltype.levels)))
f.change[is.na(f.change)] <- ""
f.change
}
DefaultAssay(seu) <- "AUCell"
# should be factors
# change = STIM / CTRL
seu$group <- factor(seu$group, levels = c("CTRL", "STIM"))
de.list <- DERegulon(seu, celltype = "celltype", group = "group", test.use = "wilcox")
# avg_diff = STIM - CTRL
View(de.list[[1]])
summary(de.list[[1]]$avg_diff)
ggplot(de.list[[1]], aes(diff_rate, -log10(p_val_adj))) +
geom_point()
de.regulons <- format_change(mylist = de.list, celltype.levels = levels(seu$celltype))
data.use <- full_join(vd.res, de.regulons, by = "gene")
## check results
VlnPlot(seu, group.by = "celltype", features = "THRB(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F)
FeaturePlot(seu, reduction = "umap", split.by = "group", features = "THRB(+)") &
scale_color_viridis_c()
FeaturePlot(seu, reduction = "umap", split.by = "group", features = "THRB") &
scale_color_viridis_c()
B:将这些影响的转录因子 聚类 起来,让其之间有相关性,变成哪些转录因子网络受到影响变化
### network_plot
`%notin%` <- Negate(`%in%`)
RegNetVis <- function(regulators, edge.list, nodes.show = NULL, size.by = "num_target", topN = 10) {
regulators <- arrange(regulators, cluster, desc(num_target))
regulators$rank <- factor(regulators$TF, levels = regulators$TF)
regulators <- regulators %>%
group_by(cluster) %>%
mutate(rank_within_cluster = rank(-num_target, ties.method = "first")) %>%
ungroup()
to_label <- as.vector(regulators[regulators$rank_within_cluster <= topN,]$TF)
regulators.plot <- subset(regulators, !is.na(regulators$cluster))
edge.list.plot <- edge.list
if (!is.null(nodes.show)) {
regulators.plot <- subset(regulators.plot, TF %in% nodes.show)
edge.list.plot <- subset(edge.list.plot, TF %in% nodes.show & target %in% nodes.show)
}
ggplot() +
geom_segment(data = edge.list.plot,
mapping = aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
alpha = 0.05) +
geom_point(data = regulators.plot,
mapping = aes(x = fr1, y = fr2, color = cluster, size = get(size.by))) +
guides(size = guide_legend(title = size.by)) +
ggrepel::geom_text_repel(data = regulators.plot %>% subset(TF %in% to_label),
mapping = aes(x = fr1, y = fr2, color = cluster, label = TF),
max.overlaps = Inf, show.legend = F) +
ggsci::scale_fill_d3() +
ggsci::scale_color_d3() +
scale_size_area(max_size = 4) +
coord_fixed() +
theme_void() +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank())
}
RegModuleBar <- function(regulators, topN = 10) {
regulators <- arrange(regulators, cluster, desc(num_target))
regulators$rank <- factor(regulators$TF, levels = regulators$TF)
regulators.plot <- regulators %>%
group_by(cluster) %>%
slice_head(n = topN) %>%
ungroup()
regulators.plot <- subset(regulators.plot, !is.na(regulators.plot$cluster))
ggplot(regulators.plot) +
geom_bar(aes(x = rev(rank), y = num_target, fill = cluster), stat = "identity") +
geom_text(aes(x = rev(rank), y = 1, label = TF), hjust = 0) +
coord_flip() +
expand_limits(y = -100) +
labs(x = "") +
ggsci::scale_fill_d3() +
theme_classic(base_size = 15) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.line.y = element_blank())
}
RegulonGraphVis <- function(data, tf.show, targets.show, edge.alpha = .6, colors = c("grey", "orange"),
edge.color = "lightblue", layout = "stress", prop = 0.05, n = NULL) {
edges <- data[, c("TF", "target", "importance")]
names(edges)[1:2] <- c("from", "to")
edges <- edges %>%
filter(from %in% tf.show) %>%
group_by(from) %>%
arrange(desc(importance))
if (!is.null(prop)) {
edges <- edges %>% slice_head(prop = prop)
} else {
edges <- edges %>% slice_head(n = n)
}
targets.show.df <- edges %>%
filter(to %in% targets.show)
nodes <- data.frame(name = unique(union(edges$from, edges$to)))
nodes$annot <- ifelse(nodes$name %in% unique(data$TF), "TF", "target")
g <- tbl_graph(nodes = nodes, edges = edges)
lw.breaks <- quantile(edges$importance, c(.01, .5, .99))
lw.ranges <- c(.3, 2)
ggraph(g, layout = layout) +
geom_edge_link(aes(width = importance), alpha = edge.alpha, color = edge.color) +
geom_node_point(aes(filter = name %in% targets.show.df$to), size = 5, color = "black") +
geom_node_point(aes(color = annot, size = annot)) +
geom_node_text(aes(filter = annot == "TF" & name %notin% targets.show, label = name), size = 4, color = "black", repel = T) +
geom_node_text(aes(filter = annot == "TF" & name %in% targets.show, label = name), size = 4, color = "red", repel = T) +
geom_node_text(aes(filter = name %in% targets.show.df$to & name %notin% tf.show, label = name), size = 3, color = "red", repel = T) +
scale_color_manual(values = colors) +
scale_size_manual(values = c(4, 10)) +
scale_edge_width_continuous(breaks = lw.breaks, labels = round(lw.breaks, 1), range = lw.ranges) +
guides(color = guide_legend(title = "", override.aes = list(size = 4, alpha = 1)),
size = guide_legend(title = "")) + # not work yet
theme_graph()
}
VDPlot <- function(vd.res, y, group.by, group.show) {
vd.res <- arrange(vd.res, desc(get(y)))
vd.res <- subset(vd.res, !is.na(cluster))
vd.plots <- vd.res %>%
mutate(pt.size = ifelse(get(group.by) == group.show, 2, .5),
type = ifelse(get(group.by) == group.show, group.show, "others"),
rank = 1:nrow(.))
ggplot(vd.plots, aes(rank, get(y))) +
geom_point(aes(color = type), size = vd.plots$pt.size) +
ggrepel::geom_text_repel(data = subset(vd.plots, get(group.by) == group.show),
aes(rank, get(y), label = gene), color = "red") +
scale_color_manual(values = c("red", "grey")) +
guides(color = guide_legend(override.aes = list(size = 3))) +
labs(x = "Rank", y = sprintf("Fraction of variance across %s", y)) +
theme_classic(base_size = 16) +
theme(
legend.title = element_blank(),
legend.position = c(1, 1),
legend.justification = c(1, 1),
axis.text = element_text(color = "black")
)
}
library(tidyverse)
#### Method 1: regulatory graph based clustering ####
## regulatory graph: nodes - tfs, edges - regulatory links (only focus on TFs)
## Hypothesis: the TFs regulating each other play roles together (The positive feedback).
## 导入TF-target的调控信息
tf2target <- clusterProfiler::read.gmt("output/02-ifnb_pbmc.regulons.gmt")
tf2target$TF <- sub("\\([0-9]+g\\)", "", tf2target$term)
colnames(tf2target)[1:2] <- c("regulon", "target")
tf2target$id <- paste0(tf2target$TF, "-", tf2target$target)
## 导入TF-target的可信度 (valued by feature importance)
adj <- data.table::fread("output/step1_adj.tsv", sep = "\t", header = T)
adj$id <- paste0(adj$TF, "-", adj$target)
adj <- subset(adj, id %in% tf2target$id)
## 合并调控信息和相关性信息
data.use <- left_join(adj, tf2target, by = c("id", "TF", "target"))
summary(data.use$importance)
data.use <- subset(data.use, importance >= 1) # cut.off by importance
## 统计每个TF有多少个target
regulators <- data.use %>%
group_by(TF) %>%
summarise(num_target = n()) %>%
arrange(desc(num_target)) %>%
as.data.frame()
rownames(regulators) <- regulators$TF
head(regulators)
## 建立TF之间的调控关系图
hub.genes <- regulators$TF ## TFs
edge.list <- data.use %>% subset(target %in% hub.genes & TF %in% hub.genes)
edge.igraph <- igraph::make_undirected_graph(
edges = mapply(c, edge.list$TF, edge.list$target, SIMPLIFY = F) %>% Reduce(f = c)
)
## 降维:force directed layout
set.seed(1024)
fr.layout <- igraph::layout_with_fr(edge.igraph)
nodes.in.grpah <- igraph::vertex_attr(edge.igraph)[[1]]
regulators[nodes.in.grpah, "fr1"] = fr.layout[, 1]
regulators[nodes.in.grpah, "fr2"] = fr.layout[, 2]
head(regulators)
edge.list$x_start = regulators[edge.list$TF, "fr1"]
edge.list$y_start = regulators[edge.list$TF, "fr2"]
edge.list$x_end = regulators[edge.list$target, "fr1"]
edge.list$y_end = regulators[edge.list$target, "fr2"]
## 图聚类
## Louvain算法
set.seed(1024)
clusters <- igraph::cluster_louvain(edge.igraph, resolution = 1)
regulators[clusters$names, "Louvain_cluster"] <- LETTERS[clusters$membership]
## Leiden算法
clusters <- leiden::leiden(edge.igraph, resolution_parameter = 1, seed = 1024)
regulators[nodes.in.grpah, "Leiden_cluster"] <- LETTERS[clusters]
regulators$cluster <- regulators$Louvain_cluster # 可视化必须的
## Network plot
RegNetVis(regulators = regulators, edge.list = edge.list, size.by = "num_target", topN = 5)
RegModuleBar(regulators = regulators, topN = 5)
## Combine the variance decomposition results
source("R/vd_plot.R")
vd.res <- readRDS("output/VD_res.rds")
vd.res$TF <- sub("\\(\\+\\)", "", vd.res$gene)
vd.res$cluster <- regulators[vd.res$TF,]$cluster
rownames(vd.res) <- vd.res$TF
regulators$var.group <- vd.res[rownames(regulators),]$group
RegNetVis(regulators = regulators, edge.list = edge.list, size.by = "var.group", topN = 5)
plot.list <- lapply(LETTERS[1:8], function(clu) {
VDPlot(vd.res, y = "group", group.by = "cluster", group.show = clu)
})
cowplot::plot_grid(plotlist = plot.list, ncol = 4)
## check results
library(Seurat)
seu <- qs::qread("output/seurat.aucell.qs")
VlnPlot(seu, group.by = "celltype", features = "MXD1(+)", split.by = "group",
pt.size = 0, split.plot = TRUE, sort = F)
## focus on cluster F
nodes.show <- subset(regulators, cluster == "F")$TF
RegNetVis(regulators = regulators, edge.list = edge.list, nodes.show = nodes.show, topN = 10)
nodes.show <- subset(vd.res, group > 0.2)$TF # 响应IFNB的regulon
RegNetVis(regulators = regulators, edge.list = edge.list, nodes.show = nodes.show, topN = 10)
#### Method 2: TF activity similarity based clustering ####
library(Seurat)
CSI <- function(r1, r2) {
delta <- pccMat[r1, r2]
r.others <- setdiff(colnames(pccMat), c(r1, r2))
N <- sum(pccMat[r1, r.others] < delta) + sum(pccMat[r2, r.others] < delta)
M <- length(r.others) * 2
return(N / M)
}
rasMat <- seu[["AUCell"]]@data
rasMat <- t(rasMat)
pccMat <- cor(rasMat) # 对列计算相关性
csiMat <- pbapply::pblapply(rownames(pccMat), function(i) sapply(colnames(pccMat), function(j) CSI(i, j)))
csiMat <- do.call(rbind, csiMat)
rownames(csiMat) <- rownames(pccMat)
## 找到一个合适的h for cut tree
library(dendextend)
library(ggsci)
h = 5
row_dend = as.dendrogram(hclust(dist(pccMat), method = "complete"))
clusters <- dendextend::cutree(row_dend, h = h) # dendextend::cutree()
row_dend = color_branches(row_dend, h = h, col = pal_d3("category20")(20))
plot(row_dend)
## 聚类可视化
library(ComplexHeatmap)
library(circlize)
col_range = c(0.7, 1)
col_fun <- colorRamp2(col_range, c("#FCF8DE", "#253177"))
# for(i in nrow(csiMat)) {
# csiMat[i, i] <- 0
# }
ht <- Heatmap(
matrix = csiMat,
col = col_fun,
name = "ht1",
cluster_rows = TRUE,
cluster_columns = TRUE,
show_column_names = FALSE,
show_row_names = FALSE,
show_heatmap_legend = FALSE
)
lgd <- Legend(
col_fun = col_fun,
title = "",
at = col_range,
labels = c("low", "high"),
direction = "horizontal",
legend_width = unit(1, "in"),
border = FALSE
)
{
draw(ht, heatmap_legend_list = list(lgd), heatmap_legend_side = c("bottom"))
decorate_heatmap_body("ht1", {
tree = column_dend(ht)
ind = cutree(as.hclust(tree), h = h)[order.dendrogram(tree)]
first_index = function(l) which(l)[1]
last_index = function(l) { x = which(l); x[length(x)] }
clusters <- names(table(ind))
x1 = sapply(clusters, function(x) first_index(ind == x)) - 1
x2 = sapply(clusters, function(x) last_index(ind == x))
x1 = x1 / length(ind)
x2 = x2 / length(ind)
grid.rect(x = x1, width = (x2 - x1), y = 1 - x1, height = (x1 - x2),
hjust = 0, vjust = 0, default.units = "npc",
gp = gpar(fill = NA, col = "#FCB800", lwd = 3))
grid.text(label = paste0("M", clusters),
x = x2 - length(clusters) / length(ind), y = 1 - x1 - (x2 - x1) / 2,
default.units = "npc",
hjust = 1, vjust = 0.5,
gp = gpar(fontsize = 12, fontface = "bold"))
})
decorate_column_dend("ht1", {
tree = column_dend(ht)
ind = cutree(as.hclust(tree), h = h)[order.dendrogram(tree)]
first_index = function(l) which(l)[1]
last_index = function(l) { x = which(l); x[length(x)] }
clusters <- names(table(ind))
x1 = sapply(clusters, function(x) first_index(ind == x)) - 1
x2 = sapply(clusters, function(x) last_index(ind == x))
grid.rect(x = x1 / length(ind), width = (x2 - x1) / length(ind), just = "left",
default.units = "npc", gp = gpar(fill = pal_d3("category20")(20), alpha = .5, col = NA))
})
}
tree <- column_dend(ht)
ind <- cutree(as.hclust(tree), h = h)[order.dendrogram(tree)]
clusters <- names(table(ind))
regulon.clusters <- data.frame(regulon = names(ind), cluster = paste0("M", ind))
table(regulon.clusters$cluster)
regulon.clusters
# 绘制每个regulon-module的平均活性
k = length(clusters)
cell.info <- seu@meta.data
moduleRasMat <- lapply(paste0("M", 1:k), function(x) {
regulon.use <- subset(regulon.clusters, cluster == x)$regulon
rowMeans(rasMat[, regulon.use, drop = FALSE])
})
names(moduleRasMat) <- paste0("M", 1:k)
moduleRasMat <- do.call(cbind, moduleRasMat)
cell.info <- cbind(cell.info, moduleRasMat[rownames(cell.info),])
cell.info <- cbind(cell.info, FetchData(seu, vars = paste0("umap_", 1:2)))
p.list <- lapply(paste0("M", 1:k), function(module) {
data.use <- cell.info
expression.color <- c("darkblue", "lightblue", "green", "yellow", "red")
max.val <- quantile(data.use[, module], 0.99)
low.val <- quantile(data.use[, module], 0.1)
data.use[, module] <- ifelse(data.use[, module] > max.val, max.val, data.use[, module])
ggplot(data.use, aes(umap_1, umap_2, color = get(module))) +
geom_point(size = 0.05) +
theme_bw(base_size = 15) +
ggtitle(module) +
facet_wrap(~group) +
scale_color_gradientn(name = NULL, colors = expression.color) +
theme(legend.position = "right",
legend.title = element_blank(),
plot.title = element_text(hjust = .5, face = "bold", size = 20),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black")
)
})
cowplot::plot_grid(plotlist = p.list, ncol = 3)
## VD plot
vd.res[regulon.clusters$regulon, "cluster"] <- regulon.clusters$cluster
plot.list <- lapply(paste0("M", 1:11), function(clu) {
VDPlot(vd.res, y = "group", group.by = "cluster", group.show = clu)
})
cowplot::plot_grid(plotlist = plot.list, ncol = 4)
C:去除响应的 组别
### moudle F 响应了 IFNB
### 去除moudle F的 reguton
### 后面作图发现 去掉相关模块后,前面grn找到的组间差异消失了,又向去除批次效应后融合在一起
library(Seurat)
seu <- qs::qread("output/seurat.aucell.qs")
#### 1. remove module F ####
regulon.cluster <- readRDS("output/regulon_clusters_network.rds")
regulons.F <- subset(regulon.cluster, cluster == "F")$TF
regulons.F <- paste0(regulons.F, "(+)")
features.use <- setdiff(rownames(seu), regulons.F)
## 用RAS matrix计算UMAP
seu <- RunUMAP(object = seu,
features = features.use,
metric = "correlation", # 注意这里用correlation效果最好
reduction.name = "umapRASrmF",
reduction.key = "umapRASrmF_")
## 可视化:UMAP on harmony
p1 <- DimPlot(seu, reduction = "umap", group.by = "celltype") +
ggsci::scale_color_d3("category20") +
NoLegend()
p2 <- DimPlot(seu, reduction = "umap", group.by = "group") + NoLegend()
## 可视化:UMAP on RAS
p3 <- DimPlot(seu, reduction = "umapRAS", group.by = "celltype") +
ggsci::scale_color_d3("category20") +
NoLegend()
p4 <- DimPlot(seu, reduction = "umapRAS", group.by = "group") + NoLegend()
## 可视化:UMAP on RAS (remove module F)
p5 <- DimPlot(seu, reduction = "umapRASrmF", group.by = "celltype") + ggsci::scale_color_d3("category20")
p6 <- DimPlot(seu, reduction = "umapRASrmF", group.by = "group")
(p1 | p3 | p5) / (p2 | p4 | p6)
#### 2. remove M5,6,11 ####
regulon.cluster <- readRDS("output/clusters_activity_similarity.rds")
## 去除M5,6,11的regulon
regulons.rm <- subset(regulon.cluster, cluster %in% paste0("M", c(5, 6, 11)))$regulon
features.use <- setdiff(rownames(seu), regulons.rm)
## 用RAS matrix计算UMAP
seu <- RunUMAP(object = seu,
features = features.use,
metric = "correlation", # 注意这里用correlation效果最好
reduction.name = "umapRASrm",
reduction.key = "umapRASrm_")
## 可视化:UMAP on RAS (remove M5,6,11)
p5 <- DimPlot(seu, reduction = "umapRASrm", group.by = "celltype") + ggsci::scale_color_d3("category20")
p6 <- DimPlot(seu, reduction = "umapRASrm", group.by = "group")
(p1 | p3 | p5) / (p2 | p4 | p6)
#### 3. remove group specific regulons ####
## 去group特异的regulon
regulons.rm <- subset(vd.res, group > 0.1)$gene
features.use <- setdiff(rownames(seu), regulons.rm)
## 用RAS matrix计算UMAP
seu <- RunUMAP(object = seu,
features = features.use,
metric = "correlation", # 注意这里用correlation效果最好
reduction.name = "umapRASrm",
reduction.key = "umapRASrm_")
## 可视化:UMAP on RAS (remove group > 0.1)
p5 <- DimPlot(seu, reduction = "umapRASrm", group.by = "celltype") + ggsci::scale_color_d3("category20")
p6 <- DimPlot(seu, reduction = "umapRASrm", group.by = "group")
(p1 | p3 | p5) / (p2 | p4 | p6)
#### 4. remove celltype specific regulons ####
regulons.rm <- subset(vd.res, celltype > 0.1)$gene
features.use <- setdiff(rownames(seu), regulons.rm)
## 用RAS matrix计算UMAP
seu <- RunUMAP(object = seu,
features = features.use,
metric = "correlation", # 注意这里用correlation效果最好
reduction.name = "umapRASrmct",
reduction.key = "umapRASrmct_")
## 可视化:UMAP on RAS (remove celltype > 0.1)
p7 <- DimPlot(seu, reduction = "umapRASrmct", group.by = "celltype") + ggsci::scale_color_d3("category20")
p8 <- DimPlot(seu, reduction = "umapRASrmct", group.by = "group")
(p1 | p3 | p5 | p7) / (p2 | p4 | p6 | p8)
每种细胞的差异基因 到 差异的通路,在从通路到 转录因子
library(tidyverse)
library(Seurat)
library(clusterProfiler)
library(enrichplot)
#### Q1: CD14 Monocyte在IFNB刺激后哪些通路发生了变化? ####
seu <- qs::qread("output/ifnb_pbmc.seurat.aucell.qs")
DefaultAssay(seu) <- "RNA"
seu$celltype.stim <- paste(seu$celltype, seu$group, sep = "_")
Idents(seu) <- "celltype.stim"
### 基于差异分析的GSEA分析策略
## 注意,fold change = ident.1 / ident.2
interferon.response <- FindMarkers(seu,
ident.1 = "CD14 Mono_STIM",
ident.2 = "CD14 Mono_CTRL",
logfc.threshold = 0)
### GSEA 分析
gene_df <- interferon.response
geneList <- gene_df$avg_log2FC
names(geneList) = rownames(gene_df)
geneList = sort(geneList, decreasing = TRUE)
### hallmarks gene set
hallmarks <- read.gmt("resource/h.all.v2022.1.Hs.symbols.gmt")
### 主程序GSEA
y <- GSEA(geneList, TERM2GENE = hallmarks)
yd <- as.data.frame(y)
dotplot(y, showCategory = 30, split = ".sign") + facet_grid(~.sign)
## activated
gseaplot2(y, "HALLMARK_INTERFERON_GAMMA_RESPONSE", color = "red", pvalue_table = T)
## repressed
gseaplot2(y, "HALLMARK_OXIDATIVE_PHOSPHORYLATION", color = "red", pvalue_table = T)
#### Q2: 这些变化的通路是由哪些转录因子介导的? ####
regulon.list <- readRDS("output/ifnb_pbmc.regulons.rds")
sapply(regulon.list, length) %>% summary()
## regulon的基因集大小范围: 10~4068
regulons <- read.gmt("output/ifnb_pbmc.regulons.gmt")
head(regulons)
## query 1: HALLMARK_INTERFERON_GAMMA_RESPONSE (activated)
genes <- subset(hallmarks, term == "HALLMARK_INTERFERON_GAMMA_RESPONSE")$gene
## 注意minGSSize和maxGSSize和regulon基因集大小的范围对应
e.regulon <- enricher(gene = genes, TERM2GENE = regulons, minGSSize = 10, maxGSSize = 5000)
dotplot(e.regulon)
## 结合regulon活性进一步筛选:
DefaultAssay(seu) <- "AUCell"
VlnPlot(seu, group.by = "celltype", features = c("IRF7(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("STAT1(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("KLF6(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
## 这里是一个阳性的例子,我们已经明确知道IRF7, STAT1, KLF6是干扰素应答的转录因子
##
## query 2: HALLMARK_OXIDATIVE_PHOSPHORYLATION (repressed)
genes <- subset(hallmarks, term == "HALLMARK_OXIDATIVE_PHOSPHORYLATION")$gene
e.regulon <- enricher(gene = genes, TERM2GENE = regulons, minGSSize = 10, maxGSSize = 5000)
dotplot(e.regulon)
## 结合regulon活性进一步筛选:
DefaultAssay(seu) <- "AUCell"
VlnPlot(seu, group.by = "celltype", features = c("SPI1(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("ILF2(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("IRF3(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
## 对数据的解释如下:
## IFNB刺激后,ILF2的活性下降(数据支持),氧化磷酸化被抑制(数据支持)
## 氧化磷酸化通路的基因富集了ILF2调控的靶基因(相关性证据)
## ILF2促进氧化磷酸化(文献支持) (扰动的证据,因果关系)
## PMID: 31908894 Figure 2 H446/H82细胞系(小细胞肺癌细胞系)
## 结论:IFNB刺激导致的CD14 monocyte的氧化磷酸化被抑制可能是ILF2介导的
(4) 可视化
library(Seurat)
library(tidyverse)
library(ggraph)
library(tidygraph)
LoadpySCENICOutput <- function(regulon.gmt, adj.mat.file) {
message(glue::glue("Loading {regulon.gmt} ..."))
tf2target <- clusterProfiler::read.gmt(regulon.gmt)
tf2target$TF <- sub("\\([0-9]+g\\)", "", tf2target$term)
colnames(tf2target)[1:2] <- c("regulon", "target")
tf2target$id <- paste0(tf2target$TF, "-", tf2target$target)
head(tf2target)
message(glue::glue("Loading {adj.mat.file} ..."))
adj <- data.table::fread(adj.mat.file, sep = "\t", header = T)
adj$id <- paste0(adj$TF, "-", adj$target)
adj <- subset(adj, id %in% tf2target$id)
data <- left_join(adj, tf2target, by = c("id", "TF", "target"))
return(data)
}
data <- LoadpySCENICOutput(regulon.gmt = "output/02-ifnb_pbmc.regulons.gmt",
adj.mat.file = "output/01-step1_adj.tsv")
summary(data$importance)
data <- subset(data, importance > 1)
### 展示调控IFN-gamma通路的转录调控网络
hallmarks <- clusterProfiler::read.gmt("resource/h.all.v2022.1.Hs.symbols.gmt")
genes <- subset(hallmarks, term == "HALLMARK_INTERFERON_GAMMA_RESPONSE")$gene
RegulonGraphVis(data, tf.show = c("STAT2", "STAT1", "IRF7", "IRF2", "IRF8", "ETV7", "ELF1"),
targets.show = genes)
RegulonGraphVis(data, tf.show = c("STAT2", "STAT1", "IRF7", "IRF2", "IRF8", "ETV7", "ELF1"),
targets.show = genes, layout = "circle")
RegulonGraphVis(data, tf.show = c("STAT2", "STAT1", "IRF7", "IRF2", "IRF8", "ETV7", "ELF1"),
targets.show = genes, prop = 0.01)
RegulonGraphVis(data, tf.show = c("STAT2", "STAT1", "IRF7", "IRF2", "IRF8", "ETV7", "ELF1"),
targets.show = genes, prop = NULL, n = 20)
拓展
1.特定细胞的身份是哪些TFs维持的
options(stringsAsFactors = F)
library(Seurat)
library(tidyverse)
library(data.table)
library(ggrepel)
# http://blog.fens.me/r-entropy/
library(philentropy)
calIndMat <- function(celltype.vector) {
cell.types <- as.character(unique(celltype.vector))
ctMat <- lapply(cell.types, function(ii) {
as.numeric(celltype.vector == ii)
}) %>% do.call(cbind, .)
colnames(ctMat) <- cell.types
rownames(ctMat) <- names(celltype.vector)
return(ctMat)
}
calRSSMat <- function(rasMat, ctMat) {
rssMat <- pbapply::pblapply(colnames(rasMat), function(i) {
sapply(colnames(ctMat), function(j) {
suppressMessages(
1 - philentropy::JSD(rbind(rasMat[, i], ctMat[, j]), unit = 'log2', est.prob = "empirical")
)
})
}) %>% do.call(rbind, .)
rownames(rssMat) <- colnames(rasMat)
colnames(rssMat) <- colnames(ctMat)
return(rssMat)
}
PlotRegulonRank <- function(rssMat, cell.type, topn = 5) {
data <- data.frame(
Regulons = 1:nrow(rssMat),
RSS = sort(rssMat[, cell.type], decreasing = T),
label = sub("(+)", "", names(sort(rssMat[, cell.type], decreasing = T)), fixed = T)
)
data$pt.col <- ifelse(data$Regulons <= topn, "#007D9B", "#BECEE3")
data <- head(data, n = 200)
data.label <- head(data, n = topn)
ggplot(data, aes(Regulons, RSS)) +
geom_point(size = 3, color = data$pt.col) +
ggrepel::geom_text_repel(inherit.aes = FALSE, data = data.label, aes(Regulons, RSS, label = label), size = 4) +
ggtitle(cell.type) +
ylab("Specificity score") +
theme_bw(base_size = 12) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black"),
axis.ticks = element_line(color = "black"),
axis.text = element_text(color = "black"),
plot.title = element_text(hjust = .5)
)
}
DimPlot2 <- function(seu, reduction = "umap", group.by = "celltype", group.highlight = NULL, regulon = NULL, threshold = NULL, text.size = 4) {
dim.1 <- paste(reduction, 1, sep = "_")
dim.2 <- paste(reduction, 2, sep = "_")
if (is.null(regulon)) {
data <- FetchData(seu, vars = c(dim.1, dim.2, group.by))
data$pt.col <- ifelse(data[[group.by]] == group.highlight, "red", "#DFDFDF")
data$pt.size <- ifelse(data[[group.by]] == group.highlight, 0.2, 0.1)
title <- paste0(group.highlight)
col.title <- "red"
} else {
data <- FetchData(seu, vars = c(dim.1, dim.2, regulon))
mean.val <- mean(data[, regulon])
sd.val <- sd(data[, regulon])
if (is.null(threshold)) {
threshold <- mean.val + 2 * sd.val
threshold <- round(threshold, 2)
}
data$pt.col <- ifelse(data[, regulon] > threshold, "#006464", "#DFDFDF")
data$pt.size <- ifelse(data[, regulon] > threshold, 0.2, 0.1)
title <- paste0("Regulon: ", regulon)
title <- glue::glue("{title}\nThreshold: {threshold}")
col.title = "#006464"
}
ggplot(data, aes(get(dim.1), get(dim.2))) +
geom_point(size = data$pt.size, color = data$pt.col) +
theme_bw(base_size = 12) +
ggtitle("") +
xlab(dim.1) +
ylab(dim.2) +
annotate("text", x = Inf, y = Inf, hjust = 1.1, vjust = 1.5, label = title, color = col.title, size = text.size) +
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "black")
)
}
## 注意,这里我们使用对照组来计算,简化对数据的解释
seu <- qs::qread("output/ifnb_pbmc.seurat.aucell.qs")
seu <- subset(seu, group == "CTRL")
DefaultAssay(seu) <- "AUCell"
# rows are cells and columns are features (regulons)
rasMat <- t(seu[["AUCell"]]@data)
dim(rasMat)
## 2.2 计算cell type indicate matrix
ctMat <- calIndMat(seu$celltype)
## 2.3 计算RSS matrix
rssMat <- calRSSMat(rasMat, ctMat)
## 2.4 可视化
## Overall
plot.list <- lapply(levels(seu$celltype), function(xx) {
PlotRegulonRank(rssMat, xx)
})
cowplot::plot_grid(plotlist = plot.list, ncol = 6)
## Specific cases
PlotRegulonRank(rssMat, "DC")
FeaturePlot(seu, reduction = "umap", features = "RUNX2(+)")
DimPlot2(seu, reduction = "umap", group.by = "celltype", group.highlight = "DC") +
DimPlot2(seu, reduction = "umap", regulon = "RUNX2(+)")
DimPlot2(seu, reduction = "umap", regulon = "RUNX2(+)", threshold = 0.03)
PlotRegulonRank(rssMat, "B cell")
FeaturePlot(seu, reduction = "umap", features = "EBF1(+)")
DimPlot2(seu, reduction = "umap", group.by = "celltype", group.highlight = "B cell") +
DimPlot2(seu, reduction = "umap", regulon = "EBF1(+)")
2.给定任意基因(包括miRNA, lncRNA等等不在cisTarget DB中收录的基因),(任意基因:有变化的(在你的实验组里显著上调的))找到它上游的TF
## input基因, TSS(regulatory region) GTF文件
## chr4 76944450 76945150 # 上游500bp 下游200bp
## input区间,找到这个区间里所有的TFBS
library(Seurat)
library(tidyverse)
## 500 TFs
header <- c("chrom", "start", "end", "motif", "score", "strand", "TF")
jaspar.tfs <- read.table("output/CXCL10.candidate_regulators.bed", sep = "\t", header = F, col.names = header)
length(unique(jaspar.tfs$TF))
jaspar.tfs <- jaspar.tfs %>%
group_by(TF) %>%
summarise(nTFs = n()) %>%
arrange(desc(nTFs)) %>%
mutate(keep = nTFs > 5)
## 169 TFs (JASPAR) motif DNA
jaspar.tfs <- toupper(subset(jaspar.tfs, keep)$TF)
jaspar.tfs <- lapply(jaspar.tfs, function(xx) {
unlist(strsplit(xx, split = "::"))
}) %>% do.call(c, .) %>% unique()
##
vd.res <- read_tsv("output/de_regulons.tsv")
vd.res$gene <- sub("\\(\\+\\)", "", vd.res$gene)
head(vd.res)
## 71 TFs (SCENIC) context-dependent(表达的信息)
scenic.tfs <- subset(vd.res, group > 0.1)$gene
###################################################
## universal (JASPAR) + context-dependent (SCENIC)
###################################################
## 71 -> 12 TFs !!
candidate.tfs <- intersect(scenic.tfs, jaspar.tfs)
length(candidate.tfs)
## 12 -> 4 TFs !! # 判断变化的方向,靶基因处理组上调,正调控的上游TF,TF活性上调
vd.res.2 <- subset(vd.res, gene %in% candidate.tfs)
header <- c("chrom", "start", "end", "motif", "score", "strand", "TF")
input.bed <- read.table("output/10-1.CXCL10.candidate_regulators.bed", sep = "\t", header = F, col.names = header)
output.bed <- subset(input.bed, TF %in% c("Stat2", "STAT2", "STAT1", "Stat1",
"HESX1", "Hesx1", "ATF3", "Atf3"))
## 检查表达信息
seu <- qs::qread("output/ifnb_pbmc.seurat.aucell.qs")
DefaultAssay(seu) <- "AUCell"
VlnPlot(seu, group.by = "celltype", features = c("CXCL10"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red"))
VlnPlot(seu, group.by = "celltype", features = c("STAT2(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("STAT1(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("HESX1(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")
VlnPlot(seu, group.by = "celltype", features = c("ATF3(+)"), split.by = "group",
split.plot = TRUE, pt.size = 0, cols = c("blue", "red")) + ylab("TF activity")