• 批量从bam文件获取指定位置的碱基


    批量从bam文件获取指定位置(RS.xlsx)的碱基后输出到一个excel,按照编号进行排序

    1. #!/user/bin/python3
    2. #coding:utf-8
    3. import pandas as pd
    4. import pysam
    5. import os,glob
    6. hetRatioCutoff=0.35
    7. primaryRatioCutoff=0.90
    8. baseRefL=['A', 'C', 'T', 'G']
    9. sampleGT={}
    10. def getGT(bam, chr_, pos_):
    11. bamfile = pysam.AlignmentFile(bam, "rb")
    12. depth={}
    13. totalDepth=0
    14. for pileupcolumn in bamfile.pileup(chr_, pos_-1, pos_+1):
    15. if pileupcolumn.reference_pos+1
    16. continue
    17. elif pileupcolumn.reference_pos+1>pos_:
    18. break
    19. for pileupread in pileupcolumn.pileups:
    20. if not pileupread.is_del and not pileupread.is_refskip:
    21. base=pileupread.alignment.query_sequence[pileupread.query_position].upper()
    22. depth[base]=1+depth[base] if base in depth else 1
    23. totalDepth+=1
    24. if totalDepth<=10: return 'NA' #低于10x时为No Call
    25. depth=dict(sorted(depth.items(), key=lambda item:item[1], reverse=True))
    26. baseL=list(depth.keys())
    27. faL=[float(depth[i])/totalDepth for i in depth]
    28. if len(baseL)==1:
    29. return baseL[0]+baseL[0]
    30. else:
    31. if max(faL)>=primaryRatioCutoff:
    32. return baseL[faL.index(max(faL))]+baseL[faL.index(max(faL))]
    33. elif max(faL)>= hetRatioCutoff and max(faL) <=primaryRatioCutoff:
    34. faL_sort = sorted(faL,reverse=True)
    35. primary=faL_sort[0]+faL_sort[1]
    36. if primary>= primaryRatioCutoff:
    37. return baseL[faL.index(faL_sort[1])]+baseL[faL.index(faL_sort[0])] # 获取深度最大和第二大的碱基
    38. else:
    39. return 'NA'
    40. else:
    41. return 'NA'
    42. # read position
    43. df = pd.read_excel('RS.xlsx',header = 0,encoding = 'utf-8')
    44. df2 = []
    45. def output(bamname):
    46. barcode = os.path.splitext(bamname)[0]
    47. outfilename = barcode +'.xlsx'
    48. df1=df.to_dict(orient='records')
    49. for i in range(len(df1)):
    50. chr_ = df.iloc[i]['chr']
    51. pos_ = int(df.iloc[i]['pos'])
    52. rs = df.iloc[i]['rs']
    53. sampleGT[rs]=getGT(bamname, chr_, pos_)
    54. gt = 'gt_'+ barcode
    55. genetype = 'genetype_'+ barcode
    56. df1[i][gt] = sampleGT[rs]
    57. gt_list = list(df1[i][gt])
    58. if len(set(gt_list))== 1:
    59. if ''.join(set(gt_list)) == df.iloc[i]['ref']:
    60. df1[i][genetype]= "野生型"
    61. else:
    62. df1[i][genetype] = "纯合子"
    63. else:
    64. df1[i][genetype] = "杂合子"
    65. outDf=pd.DataFrame(df1)
    66. tempdf = outDf.loc[:,['rs',gt,genetype]]
    67. df2.append(tempdf)
    68. #outDf.to_excel(outfilename, index=False, columns=['rs','chr','pos','ref','alt',gt,genetype], encoding='utf-8')
    69. # 批量读取bam文件的前缀
    70. bamlist = glob.glob('*.bam')
    71. for bam in bamlist:
    72. output(bam)
    73. df3 = pd.concat(df2,axis=1, join='inner',sort = False) #concat合并可以是多个表作为一个list,merge合并是两个表
    74. df3 = df3.drop_duplicates().T.drop_duplicates().T # 去掉重复列'rs'
    75. outDf_result = pd.merge(df,df3,how='left',on='rs')
    76. outDf_result.to_excel('output.xlsx', index=False, encoding='utf-8')

    以上办法是用pysam包处理,对于indel,质量不好等不一定准确,最好的还是用专业的软件处理,bcftools 和gatk。bacftools call snp后输出vcf文件后,再使用bacftools query -f命令处理vcf文件。

    此处的test文件就是指定的位置,获取多个bam文件中这些位置的碱基

    bcftools mpileup -R test -f /media/gsadmin/vd2/tmp/database/GATK/library/hg19/ucsc.hg19.fasta -q 10 -Q 10 --ff UNMAP -Ou *.bam | bcftools call -m >variants.vcf 

    bcftools call  -mv表示call为变异的位置碱基,如果不加v,就可以call出未变异的。

  • 相关阅读:
    3D Gaussian Splatting Windows安装
    【C++从0到王者】第三十八站:位图和布隆过滤器
    QT实现后台服务,linux下使用systemd管理QT后台服务
    python趣味编程-5分钟实现一个F1 赛车公路游戏(含源码、步骤讲解)
    这些傻白甜的Linux命令,不会有人教你!
    闭包问题
    OpenCV4.60安装教程-OpenCV怎么下载安装?OpenCV怎么配置?
    VUE快速入门-5
    JAVA计算机毕业设计小型企业员工工资管理系统(附源码、数据库)
    在 Spring 生态中玩转 RocketMQ
  • 原文地址:https://blog.csdn.net/Cassiel60/article/details/133760509