|
发表于
2020-1-15 12:45:10
|
显示全部楼层
本帖最后由 liyujie 于 2020-1-15 13:49 编辑
- #!usr/bin/env python
- #-*- coding:utf-8 -*-
- # learning wen https://github.com/biopython/biopython/blob/master/Bio/SeqIO/QualityIO.py
- #(1)计算每条reads的平均碱基质量值
- #(2)计算得到不同碱基质量值的碱基数
- #(3)总结如下统计内容:
- #fastq文件的N含量
- #碱基质量值低于5的比例
- #计算GC含量
- #计算Q20的比例
- #计算Q30的比例
- def fileopen(fq):
- if fq.endswith(".gz"):
- return gzip.open(fq,"rU")
- else:
- return open(fq,"rU")
- def trimfq(fq,start,end):
- with fileopen(fq) as Infq, open("trim_"+str(start)+"_"+str(end)+"_"+os.path.basename(fq)):
- for record in SeqIO.parse(Infq, "fastq"):
- subfq = record[start:end]
- outfq.write(subfq.format("fastq"))
- def randomfq(fq):
- pass
- def qualstat(qualDict,qual,L,up):
- quals=0
- for k,v in qualDict.items():
- if up == "T":
- if int(k) > int(qual):
- quals+=int(v)
- elif up == "F":
- if int(k) < int(qual):
- quals+=int(v)
- else:
- print "wrong1"
- return round(float(quals)/L,4) * 100
- def statfq(fq):
- with fileopen(fq) as Infq:
- A,G,C,T,N,L,n = 0,0,0,0,0,0,0
- qual_dict = {}
- for record in SeqIO.parse(Infq, "fastq"):
- n+=1
- seqfq=record.seq
- qualfqL=record.letter_annotations["phred_quality"]
- A += seqfq.count("A")
- C += seqfq.count("C")
- T += seqfq.count("T")
- G += seqfq.count("G")
- N += seqfq.count("N")
- for qual in qualfqL:
- qual_dict[qual] = qual_dict.get(qual,0) + 1
- L += len(seqfq)
- GCstat = round(float(G+C)/L,4) * 100
- Nstat = round(float(N)/L,4) * 100
- Q20 = qualstat(qual_dict,20,L,"T")
- Q30 = qualstat(qual_dict,30,L,"T")
- Q5low = qualstat(qual_dict,15,L,"F")
- total = n
- return GCstat,Nstat,Q20,Q30,Q5low,qual_dict,total
- if __name__ == '__main__' :
- from Bio import SeqIO
- import gzip
- import sys,os
- fq1 = sys.argv[1]
- fq2 = sys.argv[2]
- fq1GC,fq1N,fq1Q20,fq1Q30,fq1Q5,fq1qual,fq1total = statfq(fq1)
- fq2GC,fq2N,fq2Q20,fq2Q30,fq2Q5,fq2qual,fq2total = statfq(fq2)
-
- print "total reads of read1 / read2: ",fq1total,"\t",fq2total
- print "GC of read1 / read2: ",fq1GC,"\t",fq2GC
- print "N of read1 / read2: ",fq1N,"\t",fq2N
- print "Q20 of read1 / read2: ",fq1Q20,"\t",fq2Q20
- print "Q30 of read1 / read2: ",fq1Q30,"\t",fq2Q30
- print "below Q5 of read1 / read2: ",fq1Q5,"\t",fq2Q5
- for k1,v1 in fq1qual.items():
- print "Qual of read1: ",k1,"\t",v1
- for k2,v2 in fq2qual.items():
- print "Qual of read2: ",k2,"\t",v2
复制代码
|
|