• Multimodal-intersection-analysis-MIA-/ github


    在这里插入代码片
    
    • 1
    ---
    title: "Multimodal intersection analysis (MIA) tutorial"
    author: "Reuben Moncada"
    date: "11/3/2020"
    output: html_document
    ---
    
    
    ```{r setup, include = FALSE}
    knitr::opts_chunk$set(echo = TRUE)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10

    This tutorial will walk through multimodal intersection analysis (MIA) for the integration of single-cell RNA-seq (scRNA-seq) and microarray-based spatial transcriptomics (ST) data, as described in Moncada et al. Nature Biotechnology (2020). We will start with the scRNA-seq and ST for the PDAC-A sample.


    Let’s begin by loading the filtered scRNA-seq data for PDAC-A (GEO accession GSE111672) and converting the data to a Seurat object for analysis.
    The scRNA-seq expression matrix from GEO already has cluster annotations as the column header, so we won’t need to cluster the data.
    library(Seurat)
    library(dplyr)
    library(stringi)
    library(ggplot2)
    library(reshape2)
    library(scales)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    # Load PDAC-A scRNA-seq expression matrix
    sc <- read.table('GSE111672_PDAC-A-indrop-filtered-expMat.txt', sep = '\t', header = FALSE)
    cluster_assignments <- as.character(sc[1,])[-(1:1)] # Extract cluster annotations
    genes <- sc[-(1:1),1]                               # Extract genes
    sc <- sc[-(1:1),-(1:1)]                             # Remove genes and cluster annotations from expression matrix
    rownames(sc) <- make.names(genes, unique = TRUE)    # Set rownames of expression matrix to the list of genes
    sc <- CreateSeuratObject(sc, assay = 'RNA')         # Create Seurat object
    Idents(sc) <- cluster_assignments                   # Use the original cluster annotations as the cell identities
    # Collapse subpopulations into broader cell type annotations
    sc <- RenameIdents(object = sc, 'Macrophages A' = 'Macrophages')
    sc <- RenameIdents(object = sc, 'Macrophages B' = 'Macrophages')
    sc <- RenameIdents(object = sc, 'Ductal - terminal ductal like' = 'Ductal')
    sc <- RenameIdents(object = sc, 'Ductal - APOL1 high/hypoxic' = 'Ductal')
    sc <- RenameIdents(object = sc, 'Ductal - MHC Class II' = 'Ductal')
    sc <- RenameIdents(object = sc, 'Ductal - CRISP3 high/centroacinar like' = 'Ductal')
    sc <- RenameIdents(object = sc, 'mDCs A' = 'mDCs')
    sc <- RenameIdents(object = sc, 'mDCs B' = 'mDCs')
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    Now we’ll load the PDAC-A-ST1 ST data. The column headers of the data contain the tissue coordinates of each ST spot.
    # Load PDAC-A-ST1 expression matrix:
    st <- read.table('GSM3036911_PDAC-A-ST1-filtered.txt', sep = '\t', header = TRUE)
    genes <- st[,1]                               
    st <- st[,-(1:1)]
    rownames(st) <- make.names(genes, unique = TRUE)
    st <- CreateSeuratObject(st, assay = 'Spatial')
    # Create data frame of spot tissue_coordinates
    tissue_coord <- data.frame(row.names = colnames(st))
    for (i in colnames(st)) {
      tissue_coord[i,'X'] <- (strsplit(sub('X', '', as.name(i)), 'x') %>% unlist())[1] %>% as.integer()
      tissue_coord[i,'Y'] <- (strsplit(sub('X', '', as.name(i)), 'x') %>% unlist())[2] %>% as.integer()
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    Let’s normalize our scRNA-seq data prior to marker gene identification, and visualize our cell types with tSNE:
    sc <- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = median(sc$nCount_RNA), verbose = FALSE)
    sc <- ScaleData(sc, features = rownames(sc), verbose = FALSE)
    sc <- FindVariableFeatures(sc, selection.method = "vst", nfeatures = 1000, verbose = FALSE)
    sc <- RunPCA(sc, features = VariableFeatures(object = sc), verbose = FALSE)
    sc <- RunTSNE(sc, dims = 1:30, verbose = FALSE)
    # Visualize cell types
    DimPlot(sc, reduction = 'tsne', label = TRUE, pt.size = 0.5)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

    Now let’s identify marker genes for each of our cell types:

    sc.markers <- FindAllMarkers(sc, only.pos = TRUE, test.use = 't', min.pct = 0.25, logfc.threshold = 0.25, verbose = FALSE)
    sc.markers['cluster'] %>% summary(maxsum=50) 
    
    • 1
    • 2
    Switching back to the ST data:
    Since we don’t have cluster annotations, let’s cluster the ST data:
    st <- NormalizeData(st, normalization.method = "LogNormalize", scale.factor = median(st$nCount_Spatial), verbose = FALSE)
    st <- ScaleData(st, features = rownames(st), verbose = FALSE)
    st <- FindVariableFeatures(st, selection.method = "vst", nfeatures = 1000, verbose = FALSE)
    st <- RunPCA(st, features = VariableFeatures(object = st), verbose = FALSE)
    st <- FindNeighbors(st, dims = 1:10, verbose = FALSE)
    st <- FindClusters(st, resolution = 0.5, verbose = FALSE)
    # Add clustering assignments to tissue tissue_coordinate data frame
    tissue_coord$clusters <- Idents(st) %>% unlist() %>% as.character()
    # Visualize clusters
    ggplot(tissue_coord, aes(x=X, y=Y, color=clusters)) + 
      geom_point(shape = 15, size = 5.5) + 
      theme_minimal() +
      theme(axis.text.x=element_blank(), axis.text.y=element_blank(),
            axis.title.x = element_blank(), axis.title.y = element_blank(),
            panel.grid.major = element_blank(), panel.grid.minor = element_blank())
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    The clusters generally match the cluster assignments in the manuscript, so we can rename the clusters based on our prior knowledge:
    # Cluster 0 is stroma, cluster 1 is the duct epithelium, cluster 2 is the cancer region, and cluster 3 is pancreatic tissue.
    new.cluster.ids <- c('Stroma','Duct epithelium','Cancer','Pancreatic tissue')
    names(new.cluster.ids) <- levels(st)
    st <- RenameIdents(st, new.cluster.ids)
    tissue_coord$clusters <- Idents(st) %>% unlist() %>% as.character()
    ggplot(tissue_coord, aes(x=X, y=Y, color=clusters)) + 
      geom_point(shape = 15, size = 5.5) + 
      theme_minimal() +
      theme(axis.text.x=element_blank(), axis.text.y=element_blank(),
            axis.title.x = element_blank(), axis.title.y = element_blank(),
            panel.grid.major = element_blank(), panel.grid.minor = element_blank())
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11

    Now let’s identify marker genes for each ST region:**

    st.markers <- FindAllMarkers(st, only.pos = TRUE, logfc.threshold = 0.1, test.use = 't',verbose = FALSE)
    st.markers['cluster'] %>% summary()
    
    • 1
    • 2

    Multimodal Intersection Analysis (MIA)

    Now that we have a set of marker genes for the cell types and for the ST tissue regions, we can now integrate both datasets with MIA to determine where the scRNA-seq defined cell types localize in the tissue.
    Briefly, MIA uses the hypergeometric cumulative distribution to measure the enrichment of cell-type specific marker genes (from scRNA-seq) within a list of tissue-region specific genes (from ST). We use this as a proxy for estimating the likelihood each cell type is localized to a given tissue region.
    First, let’s compile a list of marker genes for both the ST and scRNA-seq data:
    # Create a list object containing the marker genes for each ST region:
    st.clusts <- Idents(st) %>% levels()
    N <- length(st.clusts)
    st.marker.list <- vector(mode = 'list', length = N)
    names(st.marker.list) <- st.clusts
    for(i in st.clusts) {
        st.marker.list[[i]] <- st.markers[st.markers$cluster == i,'gene']
    }
    # Create a list object containing the marker genes for each cell type:
    sc.clusts <- Idents(sc) %>% levels()
    M <- length(sc.clusts)
    sc.marker.list <- vector(mode = 'list', length = M)
    names(sc.marker.list) <- sc.clusts
    for (i in sc.clusts) {
      sc.marker.list[[i]] <- sc.markers[sc.markers$cluster == i,'gene']
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    Let’s first demonstrate MIA with a single example: fibroblasts (scRNA-seq) and the cancer region (ST):

    在这里插入图片描述

    gene.universe         <- length(rownames(st))                    
    cell.type.markers     <- sc.marker.list[["Fibroblasts"]] # Genes specific to fibroblasts
    tissue.region.markers <- st.marker.list[["Cancer"]]      # Genes specific to the cancer region
    common.genes          <- intersect(cell.type.markers,    # Common genes between these two gene sets
                                       tissue.region.markers)
    # MIA:
    A <- length(common.genes)
    B <- length(cell.type.markers)
    C <- length(tissue.region.markers)
    enr <- -log10(phyper(A, B, gene.universe-B, C, lower.tail = FALSE))
    dep <- -log10(1-phyper(A, B, gene.universe-B, C, lower.tail = FALSE))
    if (enr < dep) {
      MIA.results <- -dep
    } else {
      MIA.results <- enr
    }
    print(MIA.results)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    What this signifies is that the fibroblast genes are significantly enriched within the genes specific to the cancer region of the tissue.
    With a -log10(p-value) of ~13, one way to think about this is that there’s approximately a 0.00000000000001% chance we observed such an overlap between the cell-type and tissue-region genes simply by chance.
    In parallel we also test for cell type depletion by computing -log10(1-p). Here, we are estimating the likelihood for an observed lack of overlap between gene sets.
    Let’s use MIA to determine the cell type enrichments for all cell types and tissue regions:
    # Initialize a dataframe for us to store values in:
    N <- length(st.clusts) ; M <- length(sc.clusts)
    MIA.results <- matrix(0,nrow = M, ncol = N)
    row.names(MIA.results) <- sc.clusts
    colnames(MIA.results) <- st.clusts
    # Gene universe
    gene.universe <- length(rownames(st))
      # Loop over ST clusters
    for (i in 1:N) {
      # Then loop over SC clusters
      for (j in 1:M) {
        genes1 <- st.marker.list[[st.clusts[i]]]
        genes2 <- sc.marker.list[[sc.clusts[j]]]
        
        # Hypergeometric    
        A <- length(intersect(genes1,genes2))
        B <- length(genes1)
        C <- length(genes2)
        enr <- -log10(phyper(A, B, gene.universe-B, C, lower.tail = FALSE))
        dep <- -log10(1-phyper(A, B, gene.universe-B, C, lower.tail = FALSE))
        if (enr < dep) {
          MIA.results[j,i] = -dep
        } else {
          MIA.results[j,i] = enr
        }
      }
    }
    # Some results were -Inf...check why this is the case...
    MIA.results[is.infinite(MIA.results)] <- 0
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    # Visualize as heatmap
    heatmap_df <- data.frame('Cell types' = melt(MIA.results)[,1],
                             'Tissue regions' = melt(MIA.results)[,2],
                             enrichment = melt(MIA.results)[,3])
    ggplot(data = heatmap_df, aes(x = Tissue.regions, y = Cell.types, fill = enrichment)) +
      geom_tile() + 
      scale_fill_gradient2(low = "navyblue", high = "red", mid = "white", 
                           midpoint = 0, limit = c(-10,10), space = "Lab", 
                           oob=squish, name="Enrichment \n -log10(p)") +
      ylim(heatmap_df$Cell.types %>% levels() %>% sort() %>% rev()) + 
      theme_minimal()
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    
    ```css
    在这里插入代码片
    
    • 1
    • 2
    • 3
  • 相关阅读:
    系统性能测试
    Flutter笔记:拖拽手势
    计算机网络——数据链路层介质访问控制
    前端使用 Konva 实现可视化设计器(10)- 对齐线
    完美且简要,如此输出风控中的重要数据指标曲线(如KS等)
    Spring AOP 底层实现原理
    DotNetGuide新增C#/.NET/.NET Core充电站(让你学习不迷路)
    延时任务(三)-基于redis zset的完整实现
    php7 改为从栈上分配内在的思路
    数据结构|基础知识定义
  • 原文地址:https://blog.csdn.net/qq_52813185/article/details/126728179