上一节我们研究了如何对单样本进行分析,这节我们就着重来研究一下如何对多样本整合进行研究分析!
library(Seurat)
library(tidyverse)
library(patchwork)
# 导入单样本文件
dir = c(
'~/Desktop/diversity intergration/scRNA_26-0_filtered_feature_bc_matrix',
'~/Desktop/diversity intergration/scRNA_27-2_filtered_feature_bc_matrix',
'~/Desktop/diversity intergration/scRNA_28-0_filtered_feature_bc_matrix'
)
# 导入数据
counts <- Read10X(data.dir = dir)
# 转换为seurat对象
scRNA1 <- CreateSeuratObject(counts = counts,min.cells = 3,min.features = 200)
dim(scRNA1)
# 查看每个样本的细胞
table(scRNA1@meta.data$orig.ident)
# healthy1 healthy2 healthy3
# 6037 6343 11657
scRNAlist <- list() # 创建一个空的列表,其中包含三个seurat对象
names(dir) = c('healthy1','healthy2','healthy3')
sample_name = c('healthy1','healthy2','healthy3')
for (i in 1:length(dir)){
counts <- Read10X(data.dir = dir[i])
scRNAlist[[i]] <- CreateSeuratObject(counts = counts,min.cells = 3,min.features = 200)
scRNAlist[[i]] <- RenameCells(scRNAlist[[i]],add.cell.id = sample_name[i])
}
scRNA <- merge(scRNAlist[[1]],y = c(scRNAlist[[2]],scRNAlist[[3]]))
dim(scRNA)
# [1] 18037 24037
# 每一个样本分别进行数据标准化和提取高变基因
for (i in 1:length(scRNAlist)){
scRNAlist[[i]] <- NormalizeData(scRNAlist[[i]])
scRNAlist[[i]] <- FindVariableFeatures(scRNAlist[[i]],selection.method = 'vst',nfeatures = 2000) # 这里我们只需要2000的基因
}
# 以variableFeatures为基础寻找锚点
scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist)
dim(scRNA.anchors)
# 利用锚点整合数据
scRNA.integrated <- IntegrateData(anchorset = scRNA.anchors)
dim(scRNA.integrated)
# [1] 2000 24037
# 设置默认的分析方法为integrated
# ScaleData 函数的作用是对数据进行标准化或归一化处理,以确保不同特征之间具有可比性,并减少数据的偏差和方差。通过缩放数据,可以使后续的分析更加稳定和可靠。
DefaultAssay(scRNA.integrated) <- 'integrated'
scRNA.integrated <-ScaleData(scRNA.integrated,verbose = FALSE)
# scRNA.integrated 对象中的数据将被缩放,并可用于后续的分析步骤,如降维、聚类、差异表达分析等。
scRNA.integrated <- RunPCA(scRNA.integrated,features=VariableFeatures(scRNA.integrated))
DimPlot(scRNA.integrated,reduction = 'pca',group.by = 'orig.ident')
ElbowPlot(scRNA.integrated,ndims = 30,reduction = 'pca')
# 从图中可以看出我们最好应该选择前20个维度的数据
scRNA <- FindNeighbors(scRNA.integrated,dims = 1:20)
scRNA <- FindClusters(scRNA,resolution = 0.1) # 0.5聚类的结果太多
table(scRNA@meta.data$seurat_clusters)
metadata <- scRNA@meta.data
# 单独将数据提取出来
cell_cluster <- data.frame(cell_ID=rownames(metadata),cluster_ID=metadata$seurat_clusters)
scRNA <- RunUMAP(scRNA,dims = 1:20)
embed_umap <- Embeddings(scRNA,'umap')
DimPlot(scRNA,reduction = 'umap')
DimPlot(scRNA,reduction = 'umap',group.by = 'orig.ident')
scRNA <- RunTSNE(scRNA,dims = 1:20)
embed_tsne <- Embeddings(scRNA,'tsne')
DimPlot(scRNA,reduction = 'tsne')
DimPlot(scRNA,reduction = 'umap',group.by = 'orig.ident')
UMAP(Uniform Manifold Approximation and Projection)和 t-SNE(t-Distributed Stochastic Neighbor Embedding)都是用于降维和数据可视化的技术,它们有一些区别和联系:
区别:
联系:
# 切换数据集
DefaultAssay(scRNA) <- 'RNA'
# 计算线粒体和红细胞的基因比例
scRNA[['percent.mt']] <- PercentageFeatureSet(scRNA,pattern = 'MT-')
# 通常是指与血红蛋白(Hemoglobin)相关的基因
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB_m <- match(HB.genes, rownames(scRNA@assays$RNA)) # 匹配已经拥有的基因,返回一个含有下标的向量
HB.genes <- rownames(scRNA@assays$RNA)[HB_m]
scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes)
##meta.data添加信息
proj_name <- data.frame(proj_name=rep("demo2",ncol(scRNA)))
rownames(proj_name) <- row.names(scRNA@meta.data)
scRNA <- AddMetaData(scRNA, proj_name)
# 绘制小提琴图
# 所有样本一个小提琴图用group.by="proj_name",每个样本一个小提琴图用group.by="orig.ident"
plot7 <-VlnPlot(scRNA, group.by = "proj_name", raster=FALSE,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
pt.size = 0, #不需要显示点,可以设置pt.size = 0
) + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) # 将x轴的标题,文本和刻度线都设置为空,这样x轴就不会显示任何内容
plot1 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.HB")
pearplot <- CombinePlots(plots = list(plot1, plot2), nrow=1, legend="none")
pearplot
# 去除细胞特征过高和过低的细胞
scRNA <- subset(scRNA, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 10)
# 数据归一化
scRNA <- NormalizeData(scRNA, normalization.method = "LogNormalize", scale.factor = 10000)
# 鉴定高变基因
# 这一步的目的是鉴定出细胞与细胞之间表达相差很大的基因,用于后续鉴定细胞类型
# 我们使用默认参数,用vst方法选出2000个高变基因
scRNA <- FindVariableFeatures(scRNA,selection.method = 'vst',nfeatures = 2000)
dim(scRNA) # 但是这里跑程序的时候基因的数量不对,还没有找到原因
# [1] 18037 21402
# 前十个高变基因
top10 <- head(VariableFeatures(scRNA),10)
top10
# [1] "PTGDS" "S100A9" "S100A8" "CST3" "TRBV11-2" "HLA-DQA1" "HLA-DRA" "C1QA" "LILRA4" "LYZ"
可视化
plot1 <- VariableFeaturePlot(scRNA)
plot2 <- LabelPoints(plot = plot1,points = top10,repel = TRUE,xnudge = 0,ynudge = 0)
Idents(scRNA) <- 'integrated_snn_res.0.5'
plot3 = DimPlot(scRNA, reduction = "umap", label=T)
# 鉴定细胞类型
# 为了后续分析的方便,我们先用singleR来预测每个cluster的细胞类型
library(celldex)
library(SingleR)
cg <- ImmGenData() # 选定一个参考集数据,ImmGenData是一个免疫细胞的数据集
cellpred <- SingleR(test=testdata,ref=cg, labels=cg$label.main)
table(cellpred$labels) # 看看都注释到了哪些细胞
# B cells Basophils Endothelial cells Eosinophils Epithelial cells
# 20337 1 224 595 31
# Fibroblasts ILC Macrophages Mast cells
# 3 205 5 1
# 由得到的结果可以看出,用SingerR注释的结果太拉胯了,虽然手动注释比较麻烦,但是为了数据的准确性还是手动注释吧
cellType=data.frame(seurat=scRNA@meta.data$seurat_clusters,
predict=cellpred$labels)
table(cellType[,1:2]) # 访问celltyple的2~3列
####细胞注释
scRNA.sub <- subset(scRNA, idents = c(1,3,4,7), invert = TRUE) # 挑选需要的簇
scRNA.sub
new.cluster.ids <- c(
"1" = "CD4 T",
"2" = "Monocyte",
"3" = "CD4 T",
"4" = "NK",
"5" = "Monocyte",
"6" = "CD8 T",
"7" = "B",
"8" = "CD8 T",
"9" ="Monocyte",
"11"="NK",
"13" = "Monocyte",
"14" = "Monocyte",
"16" = "CD4 T",
"17" = "DC",
"18" = "B",
"19"="NK",
"20"="Monocyte",
"21"="DC",
"22" = "B",
"25"="NK"
)
scRNA.sub <- RenameIdents(scRNA.sub, new.cluster.ids)
options(repr.plot.height = 6, repr.plot.width = 8)
scRNA.sub$cell_type <- Idents(scRNA.sub)
Idents(scRNA.sub) <- "cell_type"
DimPlot(scRNA.sub, reduction = "umap", label = TRUE,raster=FALSE)