嗨,我正在运行这个 Python 脚本,想删除 fastq 文件中的过度代表序列(overrepresented sequences),但一直报错。我是生物信息学的新手,一直按照一套固定的流程(pipeline)进行序列组装。我想用这个脚本来删除过度代表序列。
python /home/TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py -1 R1_1.fq -2 R1_2.fq
下面是报错信息:
Traceback (most recent call last):
File "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py", line 46, in <module>
leftseqs=ParseFastqcLog(opts.l_fastqc)
File "TranscriptomeAssemblyTools/RemoveFastqcOverrepSequenceReads.py", line 33, in ParseFastqcLog
with open(fastqclog) as fp:
TypeError: coercing to Unicode: need string or buffer, NoneType found
import sys
import gzip
from os.path import basename
import argparse
import re
from itertools import izip,izip_longest
def seqsmatch(overreplist, read):
    """Return True if any over-represented sequence occurs in *read*.

    Parameters:
        overreplist: list of sequence strings to look for; an empty list
            (or None) means there is nothing to filter and yields False.
        read: the read sequence (string) to search in.

    Returns:
        bool: True as soon as one entry of *overreplist* is found as a
        substring of *read*, otherwise False.
    """
    if not overreplist:
        return False
    # any() stops at the first hit, like the original flag/break loop.
    return any(seq in read for seq in overreplist)
def _open_fastq(path):
    """Open one fastq file for reading text, transparently handling gzip."""
    if path.endswith('gz'):
        # On Python 3, 'rb' would yield bytes, which cannot be compared with
        # the str sequences parsed from the FastQC report; 'rt' decodes them.
        # On Python 2, 'rb' already yields str.
        mode = 'rt' if sys.version_info[0] >= 3 else 'rb'
        return gzip.open(path, mode)
    return open(path, 'r')


def get_input_streams(r1file, r2file):
    """Open the R1 and R2 fastq files and return their handles.

    Each file is checked separately: a name ending in 'gz' is opened with
    gzip, anything else as a plain text file.

    Parameters:
        r1file: path to the R1 (left) fastq file.
        r2file: path to the R2 (right) fastq file.

    Returns:
        tuple: (r1handle, r2handle), both open for reading. The caller is
        responsible for closing them (both work as context managers).

    Raises:
        IOError/OSError: if either file cannot be opened; in that case no
        handle is left open.
    """
    r1handle = _open_fastq(r1file)
    try:
        r2handle = _open_fastq(r2file)
    except Exception:
        # Do not leak the first handle when the second file is unusable.
        r1handle.close()
        raise
    return r1handle, r2handle
def FastqIterate(iterable, fillvalue=None):
    """Grab one 4-line fastq read at a time.

    Parameters:
        iterable: any iterable of lines, e.g. an open fastq file handle.
        fillvalue: value used to pad the last group when the number of
            lines is not a multiple of four (default None).

    Returns:
        An iterator of 4-tuples, each holding four consecutive items of
        *iterable*.
    """
    # izip_longest only exists on Python 2; Python 3 calls it zip_longest.
    try:
        from itertools import izip_longest as zip_longest
    except ImportError:
        from itertools import zip_longest
    # Four references to the SAME iterator: each tuple consumes four
    # consecutive lines.
    args = [iter(iterable)] * 4
    return zip_longest(*args, fillvalue=fillvalue)
def ParseFastqcLog(fastqclog):
    """Extract the over-represented sequences from a FastQC data file.

    The text between 'Overrepresented sequences' and the next 'END_MODULE'
    is taken; its first two lines (rest of the module header line and the
    column header) and its last line (the start of the END_MODULE marker)
    are dropped, and the first tab-separated field of every remaining line
    is collected.

    Parameters:
        fastqclog: path to the FastQC text report (fastqc_data.txt).

    Returns:
        list: the sequences found; an empty list when the report has no
        such module or the module lists no sequences.

    Raises:
        TypeError: if *fastqclog* is None (no report path was supplied).
        IOError/OSError: if the file cannot be opened.
    """
    if fastqclog is None:
        # Same exception type open(None) would raise, but with a message
        # that says what is actually missing.
        raise TypeError(
            'no FastQC report given: a path to fastqc_data.txt is required '
            '(got None)')
    with open(fastqclog) as fp:
        report = fp.read()
    seqs = []
    for result in re.findall(r'Overrepresented sequences(.*?)END_MODULE', report, re.S):
        for line in result.split('\n')[2:-1]:
            seq = line.split('\t')[0]
            # An empty string is a substring of every read; never keep it.
            if seq:
                seqs.append(seq)
    return seqs
def main(argv=None):
    """Remove read pairs that contain FastQC over-represented sequences.

    Reads the paired fastq files given with -1/-2 and the FastQC text
    reports given with -fql/-fqr. Every pair in which neither mate contains
    one of the reported sequences is written to 'rmoverrep_<input name>'
    (with any '.gz' removed from the name) in the current directory, and
    summary counts are printed at the end.

    Parameters:
        argv: list of command-line arguments; None means sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description="options for removing reads with over-represented sequences")
    # All four inputs are mandatory. Leaving out -fql/-fqr used to surface
    # only later, as a TypeError from open(None) inside ParseFastqcLog.
    parser.add_argument('-1','--left_reads',dest='leftreads',type=str,required=True,help='R1 fastq file')
    parser.add_argument('-2','--right_reads',dest='rightreads',type=str,required=True,help='R2 fastq file')
    parser.add_argument('-fql','--fastqc_left',dest='l_fastqc',type=str,required=True,help='fastqc text file for R1')
    parser.add_argument('-fqr','--fastqc_right',dest='r_fastqc',type=str,required=True,help='fastqc text file for R2')
    opts = parser.parse_args(argv)

    leftseqs = ParseFastqcLog(opts.l_fastqc)
    rightseqs = ParseFastqcLog(opts.r_fastqc)

    r1_outname = 'rmoverrep_' + basename(opts.leftreads).replace('.gz', '')
    r2_outname = 'rmoverrep_' + basename(opts.rightreads).replace('.gz', '')

    counter = 0
    failcounter = 0
    r1_stream, r2_stream = get_input_streams(opts.leftreads, opts.rightreads)
    # Context managers close all four files even if processing aborts.
    with r1_stream as f1, r2_stream as f2:
        with open(r1_outname, 'w') as r1_out, open(r2_outname, 'w') as r2_out:
            R1 = FastqIterate(f1)
            R2 = FastqIterate(f2)
            for entry in R1:
                counter += 1
                if counter % 100000 == 0:
                    print("%s reads processed" % counter)
                mate = next(R2, None)
                if mate is None:
                    sys.exit('error: %s ended before %s (at read %s)'
                             % (opts.rightreads, opts.leftreads, counter))
                # FastqIterate pads an incomplete final record with None.
                if None in entry or None in mate:
                    sys.exit('error: truncated fastq record at read %s' % counter)
                head1, seq1, _, qual1 = [i.strip() for i in entry]
                head2, seq2, _, qual2 = [j.strip() for j in mate]
                # A pair is discarded when EITHER mate carries a match.
                if seqsmatch(leftseqs, seq1) or seqsmatch(rightseqs, seq2):
                    failcounter += 1
                else:
                    r1_out.write('%s\n' % '\n'.join([head1, seq1, '+', qual1]))
                    r2_out.write('%s\n' % '\n'.join([head2, seq2, '+', qual2]))
            if next(R2, None) is not None:
                sys.stderr.write('warning: %s has more reads than %s; the extra reads were ignored\n'
                                 % (opts.rightreads, opts.leftreads))

    print('total # of reads evaluated = %s' % counter)
    print('number of reads retained = %s' % (counter - failcounter))
    print('number of PE reads filtered = %s' % failcounter)


if __name__ == "__main__":
    main()
也许你已经解决了这个问题。我也遇到过同样的错误,不过现在已经可以正常运行了。希望以下步骤对你有帮助:
(1)我们需要的文件见脚本的用法说明:usage: RemoveFastqcOverrepSequenceReads.py [-h] [-1 LEFTREADS] [-2 RIGHTREADS] [-fql L_FASTQC] [-fqr R_FASTQC]
(2)指定 FastQC 输出中的 fastqc_data.txt 文件(先把 FastQC 的输出压缩包解压到输出目录下),对应脚本中的这两个参数:
'-fql','--fastqc_left',dest='l_fastqc',type=str,help='fastqc text file for R1'
'-fqr','--fastqc_right',dest='r_fastqc',type=str,help='fastqc text file for R2'
(3)将 reads 文件(fastq)和 fastqc_data 文本文件保存在同一个目录下。
(4)在每个文件前指定路径位置:python RemoveFastqcOverrepSequenceReads.py -1 ./bicho.fq.1.gz -2 ./bicho.fq.2.gz -fql ./fastqc_data_bicho_1.txt -fqr ./fastqc_data_bicho_2.txt
(5)运行! :)