vst 算法R语言手工实现 | Seurat 筛选高变基因的算法
1. 算法描述
> ?FindVariableFeatures
vst:
- First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess).
- Then standardizes the feature values using the observed mean and expected variance (given by the fitted line).
- Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
vst steps: 目的是在var~mean曲线中,不同mean值区域都能挑选var较大的基因
- 使用局部多项式拟合(loess) log(variance) 和log(mean) 平滑曲线模型
- 获取模型计算的值作为y=var.exp值
- 截取最大之后,var.standarlized = get variance after feature standardization:
(每个基因 - mean)/sd 后 取var(). 注意sd=sqrt(var.exp) - 按照 var.standarlized 降序排列,取前n(比如2000)个基因作为高变基因。
2. 加载数据及Seurat vst 结果
使用pbmc 3k数据,走标准Seurat4,选取top 2000 HVG。
library(Seurat)
library(ggplot2)
library(dplyr)pbmc=readRDS("d:\\code_R\\filtered_gene_bc_matrices\\pbmc3k.final.Rds")
DimPlot(pbmc, label=T)# pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)# 0. 获取Seurat包计算的HVG ----
top10 <- head(VariableFeatures(pbmc), 10)
p1=VariableFeaturePlot(pbmc); p1
LabelPoints(plot = p1, points = top10, repel = TRUE)genelist1=VariableFeatures(pbmc)
length(genelist1)
head(genelist1)head( pbmc@assays$RNA@meta.features )
3. 手工 HVG
(1) LOESS fit y~x
log(variance) ~ log(mean)
# use raw data: vst 的直接输入的是 counts,所以直接算的 行平均数,作为基因表达值
counts = pbmc@assays$RNA@counts# 计算每行均值
hvf.info <- data.frame(mean = rowMeans(x = counts, na.rm = T))# 计算每行方差
hvf.info$variance = apply(counts, 1, function(x){var(x, na.rm = T)
})head(hvf.info)
dim(hvf.info)if(0){#pdf( paste0(outputRoot, keyword, "_01_4B.HVG.pdf"), width=5.5, height =4.8)plot(hvf.info$mean, hvf.info$variance, pch=19, cex=0.3)plot(log10(hvf.info$mean), hvf.info$variance, pch=19, cex=0.3)plot(log10(hvf.info$mean), log10(hvf.info$variance), pch=19, cex=0.3) #looks good#dev.off()
}# 通过loess拟合,计算期望方差和标准化的方差
hvf.info$variance.expected <- 0
hvf.info$variance.standardized <- 0not.const <- (hvf.info$variance >0) & (!is.na(hvf.info$variance)) & (!is.na(hvf.info$mean))
table(not.const)
#TRUE
#13714# 拟合 y~x
loess.span=0.3 #default in Seurat4
fit <- loess(formula = log10(x = variance) ~ log10(x = mean),data = hvf.info[not.const, ],span = loess.span
)dim(hvf.info[not.const, ]) #13714 4
#str(fit)
(2). 获取模型给出的期望y值
# 期望y值:使用模型计算的值
hvf.info$variance.expected[not.const] <- 10 ^ fit$fitted
(3). 截取极端值后,计算标准化后的方差
#clip.max == 'auto',则自动设置为 列数(细胞数)的开方
clip.max <- sqrt(x = ncol(x = counts))
clip.max #51.9# 计算feature标准化( (counts - mean)/sd )后的方差,注意sd=sqrt(var)
# 求方差前截取极大值
hvf.info$variance.standardized[not.const]=(function(){result=c()for(i in 1:nrow(counts)){row.var=NAif(not.const[i]){# clip to a maximumrow.counts.std = (counts[i, ] - hvf.info$mean[i]) / sqrt(hvf.info$variance.expected[i])row.counts.std[row.counts.std>clip.max]=clip.max# 计算方差row.var=var( row.counts.std, na.rm = T )}# 返回结果result=c(result, row.var)}return(result)
})() #2min
4. 结果比较
(1) 结果检查1:基因列表一致
手工计算的和Seurat4的HVG gene list结果完全一致。
# Check 1: HVG gene list
hvf.info=hvf.info[order(-hvf.info$variance.standardized),]
head(hvf.info)
tail(hvf.info)
#top.features=head( rownames(hvf.info), n=250)
top.features_2=head( rownames(hvf.info), n=2000)
length(top.features_2)
head(top.features_2)
setdiff(top.features_2, genelist1)
setdiff(genelist1, top.features_2)
#
if(0){# Check: gene and their parampbmc@assays$RNA@meta.features[c(setdiff(genelist1, top.features_2)),]hvf.info[c(setdiff(genelist1, top.features_2)),]#pbmc@assays$RNA@meta.features[c(setdiff(top.features_2, genelist1)),]hvf.info[c(setdiff(top.features_2, genelist1)),]
}
(2) 结果检查2:基因参数一致
手工计算的和Seurat4的HVG gene 参数完全一致。
# check2: HVG and its params
# 1.
dim(pbmc@assays$RNA@meta.features)
head( pbmc@assays$RNA@meta.features )
# vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
#AL627309.1 0.003333333 0.003323453 0.003575582 0.9294859 FALSE
#AP006222.2 0.001111111 0.001110288 0.001112798 0.9977442 FALSE
#RP11-206L10.2 0.001851852 0.001849107 0.001921811 0.9621691 FALSE
#RP11-206L10.9 0.001111111 0.001110288 0.001112798 0.9977442 FALSE
#LINC00115 0.006666667 0.006624676 0.007342308 0.9022607 FALSE
#NOC2L 0.106666667 0.158310485 0.203482316 0.7780061 FALSE# 2.
dim(hvf.info)
hvf.info=hvf.info[rownames(pbmc@assays$RNA@meta.features),]
head(hvf.info)
# mean variance variance.expected variance.standardized
#AL627309.1 0.003333333 0.003323453 0.003575582 0.9294859
#AP006222.2 0.001111111 0.001110288 0.001112798 0.9977442
#RP11-206L10.2 0.001851852 0.001849107 0.001921811 0.9621691
#RP11-206L10.9 0.001111111 0.001110288 0.001112798 0.9977442
#LINC00115 0.006666667 0.006624676 0.007342308 0.9022607
#NOC2L 0.106666667 0.158310485 0.203482316 0.7780061
比较高变基因的参数:
> hvf.info[genelist1|> head(), ]mean variance variance.expected variance.standardized
PPBP 0.2451852 9.577506 0.5888573 11.171153
S100A9 6.0466667 278.681037 34.8969051 7.985838
IGLL5 0.2792593 8.894938 0.6929479 7.964379
LYZ 10.2466667 564.108825 70.8492711 7.962098
GNLY 1.5740741 45.239046 6.0378423 7.492585
FTL 27.6674074 2008.688897 278.9968379 7.199683
> pbmc@assays$RNA@meta.features[genelist1|> head(), ]vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
PPBP 0.2451852 9.577506 0.5888573 11.172765 TRUE
S100A9 6.0466667 278.681037 34.8969051 7.985838 TRUE
IGLL5 0.2792593 8.894938 0.6929479 7.965360 TRUE
LYZ 10.2466667 564.108825 70.8492711 7.962098 TRUE
GNLY 1.5740741 45.239046 6.0378423 7.492585 TRUE
FTL 27.6674074 2008.688897 278.9968379 7.199683 TRUE> table(abs(hvf.info[genelist1, 4] - pbmc@assays$RNA@meta.features[genelist1, 4])<0.005)
FALSE TRUE 3 1997# 差别不大,差异的绝对值大于0.005的共三个:
> keep2=abs(hvf.info[genelist1, 4] - pbmc@assays$RNA@meta.features[genelist1, 4])>0.005
> table(keep2)
keep2
FALSE TRUE 1997 3 > hvf.info[genelist1, ][keep2, ]mean variance variance.expected variance.standardized
IGJ 0.16777778 16.822896 0.36540081 3.481455
SLC48A1 0.03370370 0.871409 0.04618194 2.215032
NAPA-AS1 0.02962963 1.050622 0.03932270 1.274513
> pbmc@assays$RNA@meta.features[genelist1, ][keep2, ]vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
IGJ 0.16777778 16.822896 0.36540081 3.498952 TRUE
SLC48A1 0.03370370 0.871409 0.04618194 2.220942 TRUE
NAPA-AS1 0.02962963 1.050622 0.03932270 1.280866 TRUE
最后一列略有区别:
#
all( abs(hvf.info[,1] - pbmc@assays$RNA@meta.features[,1]) < 1e-6) #T
all( abs(hvf.info[,2] - pbmc@assays$RNA@meta.features[,2]) < 1e-6) #T
all( abs(hvf.info[,3] - pbmc@assays$RNA@meta.features[,3]) < 1e-6) #Ttable( abs(hvf.info[,4] - pbmc@assays$RNA@meta.features[,4]) < 1e-6) #not all T
table( abs(hvf.info[,4] - pbmc@assays$RNA@meta.features[,4]) < 0.01)
#FALSE TRUE
# 1 13713keep = abs(hvf.info[,4] - pbmc@assays$RNA@meta.features[,4]) > 1e-2> table(keep)
keep
FALSE TRUE
13713 1
> hvf.info[keep, ]mean variance variance.expected variance.standardized
IGJ 0.1677778 16.8229 0.3654008 3.481455
> pbmc@assays$RNA@meta.features[keep, ]vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable
IGJ 0.1677778 16.8229 0.3654008 3.498952 TRUE
(3) 绘图比较
(a) var~avg with top 2000 HVG selected by vst;
(b) std.var ~ avg with top 2000 HVG selected by vst;
( c) same as (b), but draw by Seurat functions.
# plot1
plot(log10(hvf.info$mean), log10(hvf.info$variance), pch=19, cex=0.3, main="vst manully #1", mgp=c(2,1,0))
points(log10(hvf.info[top.features_2, ]$mean), log10(hvf.info[top.features_2, ]$variance), pch=19, cex=0.3, col="red")# plot2
plot(log10(hvf.info$mean), hvf.info$variance.standardized, pch=19, cex=0.3, main="vst manully #2", mgp=c(2,1,0))
points(log10(hvf.info[top.features_2,]$mean), (hvf.info[top.features_2,]$variance.standardized), pch=19, cex=0.3, col="red")# plot3: Seurat
LabelPoints(plot = p1, points = top10, repel = TRUE)
#
Ref:
- https://medium.com/byte-sized-machine-learning/selection-of-highly-variable-genes-hvgs-in-scrna-seq-647c8eee3845
- https://zhuanlan.zhihu.com/p/479549742