批量从bam文件获取指定位置(RS.xlsx)的碱基后输出到一个excel,按照编号进行排序
- #!/user/bin/python3
- #coding:utf-8
-
- import pandas as pd
- import pysam
- import os,glob
-
-
- hetRatioCutoff=0.35
- primaryRatioCutoff=0.90
- baseRefL=['A', 'C', 'T', 'G']
- sampleGT={}
-
-
- def getGT(bam, chr_, pos_):
- bamfile = pysam.AlignmentFile(bam, "rb")
- depth={}
- totalDepth=0
- for pileupcolumn in bamfile.pileup(chr_, pos_-1, pos_+1):
- if pileupcolumn.reference_pos+1
- continue
- elif pileupcolumn.reference_pos+1>pos_:
- break
- for pileupread in pileupcolumn.pileups:
- if not pileupread.is_del and not pileupread.is_refskip:
- base=pileupread.alignment.query_sequence[pileupread.query_position].upper()
- depth[base]=1+depth[base] if base in depth else 1
- totalDepth+=1
-
- if totalDepth<=10: return 'NA' #低于10x时为No Call
- depth=dict(sorted(depth.items(), key=lambda item:item[1], reverse=True))
- baseL=list(depth.keys())
- faL=[float(depth[i])/totalDepth for i in depth]
- if len(baseL)==1:
- return baseL[0]+baseL[0]
- else:
- if max(faL)>=primaryRatioCutoff:
- return baseL[faL.index(max(faL))]+baseL[faL.index(max(faL))]
- elif max(faL)>= hetRatioCutoff and max(faL) <=primaryRatioCutoff:
- faL_sort = sorted(faL,reverse=True)
- primary=faL_sort[0]+faL_sort[1]
- if primary>= primaryRatioCutoff:
- return baseL[faL.index(faL_sort[1])]+baseL[faL.index(faL_sort[0])] # 获取深度最大和第二大的碱基
- else:
- return 'NA'
- else:
- return 'NA'
-
-
- # read position
- df = pd.read_excel('RS.xlsx',header = 0,encoding = 'utf-8')
- df2 = []
- def output(bamname):
- barcode = os.path.splitext(bamname)[0]
- outfilename = barcode +'.xlsx'
- df1=df.to_dict(orient='records')
- for i in range(len(df1)):
- chr_ = df.iloc[i]['chr']
- pos_ = int(df.iloc[i]['pos'])
- rs = df.iloc[i]['rs']
- sampleGT[rs]=getGT(bamname, chr_, pos_)
- gt = 'gt_'+ barcode
- genetype = 'genetype_'+ barcode
- df1[i][gt] = sampleGT[rs]
- gt_list = list(df1[i][gt])
- if len(set(gt_list))== 1:
- if ''.join(set(gt_list)) == df.iloc[i]['ref']:
- df1[i][genetype]= "野生型"
- else:
- df1[i][genetype] = "纯合子"
- else:
- df1[i][genetype] = "杂合子"
-
-
- outDf=pd.DataFrame(df1)
- tempdf = outDf.loc[:,['rs',gt,genetype]]
- df2.append(tempdf)
-
- #outDf.to_excel(outfilename, index=False, columns=['rs','chr','pos','ref','alt',gt,genetype], encoding='utf-8')
-
- # 批量读取bam文件的前缀
- bamlist = glob.glob('*.bam')
- for bam in bamlist:
- output(bam)
-
- df3 = pd.concat(df2,axis=1, join='inner',sort = False) #concat合并可以是多个表作为一个list,merge合并是两个表
- df3 = df3.drop_duplicates().T.drop_duplicates().T # 去掉重复列'rs'
- outDf_result = pd.merge(df,df3,how='left',on='rs')
- outDf_result.to_excel('output.xlsx', index=False, encoding='utf-8')
以上办法是用pysam包处理,对于indel,质量不好等不一定准确,最好的还是用专业的软件处理,bcftools 和gatk。bacftools call snp后输出vcf文件后,再使用bacftools query -f命令处理vcf文件。
此处的test文件就是指定的位置,获取多个bam文件中这些位置的碱基
bcftools mpileup -R test -f /media/gsadmin/vd2/tmp/database/GATK/library/hg19/ucsc.hg19.fasta -q 10 -Q 10 --ff UNMAP -Ou *.bam | bcftools call -m >variants.vcf
bcftools call -mv表示call为变异的位置碱基,如果不加v,就可以call出未变异的。