关注微信:生信小博士
选择最优聚类有多种方式,今天着重于Silhouette Method。
Before we do the actual clustering, we need to identity the Optimal number of clusters (k) for this data set of wholesale customers. The popular way of determining number of clusters are
Elbow and Silhouette methods are direct methods and gap statistic method is the statistics method.
In this demonstration, we are going to see how silhouette method is used.
如果你已经准备好前期的数据,此步可以省略。
这一步的目的:会得到一个名为inttegtaed_data的seurat对象。
- slide_files <- list.files("~/silicosis/spatial/prcessed_visum_for_progeny/data/",recursive = TRUE,
- all.files = TRUE,full.names = TRUE,pattern = "_")
-
-
-
-
- integrated_data <- map(slide_files, readRDS)
- print("You managed to load everything")
- print("Object size")
- print(object.size(integrated_data))
-
-
-
-
-
- # Calculate HVG per sample - Here we assume that batch and patient effects aren't as strong
- # since cell-types and niches should be greater than the number of batches
-
- Assays(integrated_data [[1]])
- str(integrated_data [[1]])
-
-
-
- def_assay="Spatial"
-
-
- hvg_list <- map(integrated_data, function(x) {
-
- DefaultAssay(x) <- def_assay
-
- x <- FindVariableFeatures(x, selection.method = "vst",
- nfeatures = 3000)
-
- x@assays[[def_assay]]@var.features
-
- }) %>% unlist()
-
- hvg_list <- table(hvg_list) %>%
- sort(decreasing = TRUE)
-
-
-
-
- gene_selection_plt <- hvg_list %>% enframe() %>%
- group_by(value) %>%
- mutate(value = as.numeric(value)) %>%
- summarize(ngenes = length(name)) %>%
- ggplot(aes(x = value, y = ngenes)) +
- geom_bar(stat = "identity")
-
- gene_selection <- hvg_list[1:3000] %>% names()
-
-
- #########
- # Create merged object ---------------------------------
- integrated_data <- purrr::reduce(integrated_data,
- merge,
- merge.data = TRUE)
-
- print("You managed to merge everything")
- print("Object size")
- print(object.size(integrated_data))
-
- # Default assay ---------------------------------------
- #DefaultAssay(integrated_data) <- def_assay
-
-
-
- # Process it before integration -----------------------
- integrated_data <- integrated_data %>%
- ScaleData(verbose = FALSE) %>%
- RunPCA(features = gene_selection,
- npcs = 30,
- verbose = FALSE)
-
- original_pca_plt <- DimPlot(object = integrated_data,
- reduction = "pca",
- pt.size = .1,
- group.by = "orig.ident")
-
- head(integrated_data@meta.data)
-
- batch_covars="orig.ident"
- # Integrate the data -----------------------
- integrated_data <- RunHarmony(integrated_data,
- group.by.vars = batch_covars,
- plot_convergence = TRUE,
- assay.use = def_assay,
- max.iter.harmony = 20)
-
-
-
- # Corrected dimensions -----------------------
- corrected_pca_plt <- DimPlot(object = integrated_data,
- reduction = "harmony",
- pt.size = .1,
- group.by = "orig.ident")
-
-
- # Create the UMAP with new reduction -----------
- integrated_data <- integrated_data %>%
- RunUMAP(reduction = "harmony", dims = 1:30,
- reduction.name = "umap_harmony") %>%
- RunUMAP(reduction = "pca", dims = 1:30,
- reduction.name = "umap_original")
-
- integrated_data <- FindNeighbors(integrated_data,
- reduction = "harmony",
- dims = 1:30)
-
- head(integrated_data@meta.data)
-
- colnames(integrated_data@meta.data)=gsub(pattern ="_snn_res.",replacement = "_before",x = colnames(integrated_data@meta.data) )
准备好的数据如下。这里的数据你可以使用seurat的pbmc教程中的数据,作为练习。

然后进行下一步
全代码如下
- optimize=TRUE
- ################opt cluster--------------
- if(optimize) {
-
- # Clustering and optimization -------------------------
- print("Optimizing clustering")
-
- seq_res <- seq(0.5, 1.5, 0.1)
-
- # seq_res=1
- integrated_data <- FindClusters(integrated_data,
- resolution = seq_res,
- verbose = F)
-
- clustree_plt <- clustree::clustree(integrated_data,
- prefix = paste0(DefaultAssay(integrated_data), "_snn_res."))
-
- # Optimize clustering ------------------------------------------------------
- cell_dists <- dist(integrated_data@reductions$harmony@cell.embeddings,
- method = "euclidean")
- head(cell_dists)
-
-
- cluster_info <- integrated_data@meta.data[,grepl(paste0(DefaultAssay(integrated_data),"_snn_res"),
- colnames(integrated_data@meta.data))] %>%
- dplyr::mutate_all(as.character) %>%
- dplyr::mutate_all(as.numeric)
-
- head(cluster_info)[,1:9]
- # head(cell_dists)[,1:9]
- si= silhouette(cluster_info[,1], cell_dists) %>%head()
- si
-
- silhouette_res <- apply(cluster_info, 2, function(x){
- si <- silhouette(x, cell_dists)
-
- if(!any(is.na(si))) {
- mean(si[, 'sil_width'])
- } else {
- NA
- }
- })
- head(silhouette_res)
-
- integrated_data[["opt_clust_integrated"]] <- integrated_data[[names(which.max(silhouette_res))]]
-
- Idents(integrated_data) = "opt_clust_integrated"
上述代码的部分运行结果:


全代码如下
- # Reduce meta-data -------------------------------------------------------------------------
- spam_cols <- grepl(paste0(DefaultAssay(integrated_data), "_snn_res"),
- colnames(integrated_data@meta.data)) |
- grepl("seurat_clusters",colnames(integrated_data@meta.data))
-
- integrated_data@meta.data <- integrated_data@meta.data[,!spam_cols]
-
- } else {
-
- print("Not Optimizing clustering")
-
- seq_res <- seq(0.2, 1.6, 0.2)
-
- integrated_data <- FindClusters(integrated_data,
- resolution = seq_res,
- verbose = F)
-
- clustree_plt <- clustree(integrated_data,
- prefix = paste0(DefaultAssay(integrated_data),
- "_snn_res."))
-
- integrated_data <- FindClusters(integrated_data,
- resolution = default_resolution,
- verbose = F)
-
- integrated_data[["opt_clust_integrated"]] <- integrated_data[["seurat_clusters"]]
-
- spam_cols <- grepl(paste0(DefaultAssay(integrated_data), "_snn_res"),
- colnames(integrated_data@meta.data)) |
- grepl("seurat_clusters",colnames(integrated_data@meta.data))
-
- integrated_data@meta.data <- integrated_data@meta.data[,!spam_cols]
-
- }
4 保存对象
- # Save object ------------------------------------------------------
- saveRDS(integrated_data, file ="~/silicosis/spatial/integrated_slides/integrated_slides.rds" )
关注微信:生信小博士
参考: