• GEO生信数据挖掘(七)差异基因分析


    上节,我们使用结核病基因数据,做了一个数据预处理的实操案例。例子中结核类型,包括结核,潜隐进展,对照和潜隐,四个类别。本节延续上个数据,进行了差异分析。

    差异分析 计算差异指标step12

    加载数据

    load("dataset_TB_LTBI_step8.Rdata")

    构建差异比较矩阵

    1. #样本列表
    2. group_list=group_data_TB_LTBI$group_more
    3. #构建分组
    4. design=model.matrix(~0+factor(group_list))
    5. colnames(design)=levels(factor(group_list))
    6. #head(dataset_TB_LTBI)
    7. row.names(design)=colnames(dataset_TB_LTBI)
    8. design #得到分组矩阵:0代表不是,1代表是
    9. #str(design)
    10. library(limma)
    11. ##差异比较矩阵
    12. contrast_matrix=makeContrasts(paste0(c('LTBI','TB'),collapse = '-'),levels = design)

    计算差异基因指标

    1. #step:lmFit
    2. fit=lmFit(dataset_TB_LTBI,design)
    3. fit2=contrasts.fit(fit,contrast_matrix)
    4. #step:eBayes
    5. fit3=eBayes(fit2)
    6. #step3:topTable
    7. tempoutput=topTable(fit3,coef = 1,n=Inf)
    8. DEG_M=na.omit(tempoutput) #得到差异分析矩阵,重点看logFC和P值
    9. head(DEG_M) #查看数据
    10. '''
    11. logFC AveExpr t P.Value adj.P.Val B
    12. ASPHD2 -1.452777 8.415563 -12.38370 5.885193e-22 5.868863e-18 39.30255
    13. C1QC -3.978887 5.971935 -12.34993 6.954041e-22 5.868863e-18 39.14037
    14. GBP1P1 -4.075057 5.607978 -12.24397 1.174622e-21 6.608814e-18 38.63087
    15. GBP6 -3.225604 4.393248 -11.93968 5.320543e-21 1.692866e-17 37.16200
    16. SDC3 -2.374911 7.388880 -11.92896 5.612049e-21 1.692866e-17 37.11012
    17. LHFPL2 -1.705514 8.411180 -11.91494 6.017652e-21 1.692866e-17 37.04225
    18. '''

    #绘制前40个基因在不同样本之间的热图

    1. library(pheatmap)
    2. #绘制前40个基因在不同样本之间的热图
    3. f40_gene=head(rownames(DEG_M),40)
    4. f40_subset_matrix=dataset_TB_LTBI[f40_gene,]
    5. head(f40_subset_matrix)
    6. f40_subset_matrixx=t(scale(t(f40_subset_matrix))) #数据标准化。。。数据标准化和归一化的区别:平移和压缩
    7. pheatmap(f40_subset_matrixx) #出图

    差异分析 结果过滤筛选step13

    1. res = DEG[,c("logFC","P.Value","adj.P.Val")]
    2. colnames(res)<-c("logFC","PValue","padj")
    3. colnames(res)
    4. library(dplyr)
    5. FC_filter =0.585
    6. P_filter=0.05
    7. all_diff =res %>% filter(abs(logFC)>FC_filter) %>% filter(padj<P_filter)
    8. res$id = rownames(res)
    9. res=select(res,id,everything())
    10. #write.table(res,'all_diff.txt',sep='\t',quote=F)
    11. up_diff=res %>% filter(logFC>FC_filter) %>% filter(padj<P_filter)
    12. up_diff$id = rownames(up_diff)
    13. up_diff=select(up_diff,id,everything())
    14. #write.table(up_diff,'up_diff.txt',sep='\t',quote=F)
    15. down_diff=res %>% filter(logFC< -FC_filter ) %>% filter(padj<P_filter)
    16. down_diff$id = rownames(down_diff)
    17. down_diff=select(down_diff,id,everything())
    18. #write.table(down_diff,'down_diff.txt',sep='\t',quote=F)
    19. group_data_clean <-function(data){
    20. # colnames(data)[c(9,10,11)] =c("logFC","PValue","padj")
    21. data[which(data$padj %in% NA),'sig'] <- 'no diff'
    22. data[which(data$logFC >= FC_filter & data$padj < 0.05),'sig'] <- 'up'
    23. data[which(data$logFC <= -FC_filter & data$padj < 0.05),'sig'] <- 'down'
    24. data[which(abs(data$logFC) < FC_filter | data$padj >= 0.05),'sig'] <- 'no diff'
    25. cat(" 上调",nrow(data[data$sig %in% "up", ]))
    26. cat(" 下调",nrow(data[data$sig %in% "down", ]))
    27. cat(" no fiff",nrow(data[data$sig %in% "no diff", ]))
    28. # filter_data = subset(data, data$sig == 'up' | data$sig == 'down')
    29. # filter_data$Geneid <- rownames(filter_data)
    30. return(data)
    31. }
    32. limma_clean_res = group_data_clean(res)
    33. #上调 1381 下调 1432 no fiff 14066
    34. rownames(all_diff)
    35. dataset_TB_LTBI_DEG = dataset_TB_LTBI[rownames(all_diff),]
    36. dim(dataset_TB_LTBI_DEG) #[1] 2813 102
    37. #&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    38. #+&&&&&&&&&&&&&&&&&&数据保存&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    39. #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    40. save(DEG,res,all_diff,limma_clean_res,dataset_TB_LTBI_DEG,file = "DEG_TB_LTBI_step13.Rdata")
    41. #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    42. #+&&&&&&&&&&&&&&&&&&数据保存&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    43. #&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

    差异分析 绘制火山图step14

    1. library(ggplot2)
    2. data <- limma_clean_res
    3. #################
    4. # ggplot2绘制火山图
    5. data$label <- c(rownames(data)[1:10],rep(NA,nrow(data) - 10))
    6. #sizeGrWindow(12, 9)
    7. pdf(file="差异基因火山图step14.pdf", width = 9, height = 6)
    8. ggplot(data,aes(logFC,-log10(PValue),color = sig)) +
    9. xlab("log2FC") +
    10. geom_point(size = 0.6) +
    11. scale_color_manual(values=c("#00AFBB","#999999","#FC4E07")) +
    12. geom_vline(xintercept = c(-1,1), linetype ="dashed") +
    13. geom_hline(yintercept = -log10(0.05), linetype ="dashed") +
    14. theme(title = element_text(size = 15), text = element_text(size = 15)) +
    15. theme_classic() +
    16. geom_text(aes(label = label),size = 3, vjust = 1,hjust = -0.1)
    17. dev.off()

    1. #&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    2. #+&&&&&&&&&&&&&&&&&&数据保存&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    3. #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    4. save(DEG,res,all_diff,limma_clean_res,dataset_TB_LTBI_DEG,file = "DEG_TB_LTBI_step13.Rdata")
    5. #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    6. #+&&&&&&&&&&&&&&&&&&数据保存&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    7. #&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

    差异基因分析完毕,下面我们可以观察一下,这些基因富集在哪些通路之上。

  • 相关阅读:
    《OpenDRIVE1.6规格文档》6
    GO结构体
    Electron 从基础到实战笔记 - Electron App对象及其事件
    mysql索引部分总结
    java编译时指定classpath
    CSDN博客运营团队2022年H1总结
    tomcat 端口 8005 被 windows 系统服务占用导致启动闪退的问题
    深入理解Windows句柄
    Redis入门完整教程:问题定位与优化
    day11-Servlet01
  • 原文地址:https://blog.csdn.net/zzh1464501547/article/details/133752831