blasttab+结果写成class
先每一行解析一下
class LastHit(object):
    """One alignment record parsed from a tab-separated BlastTab+ line.

    The line must contain at least 15 tab-separated fields; they are read
    positionally and converted to ``str``/``int``/``float`` attributes.

    Args:
        line: A single result line (a trailing newline is allowed).

    Raises:
        IndexError: If the line has fewer than 15 fields.
        ValueError: If a numeric field cannot be converted.
    """

    def __init__(self, line):
        fields = line.rstrip().split("\t")
        self.query_sequence_id = fields[0]
        self.subject_sequence_id = fields[1]
        self.identity = float(fields[2])
        self.alignment_length = int(fields[3])
        self.mismatchs = int(fields[4])
        self.gaps = int(fields[5])
        self.qstart = int(fields[6])  # start of alignment in query
        self.qend = int(fields[7])  # end of alignment in query
        self.sstart = int(fields[8])  # start of alignment in subject
        self.send = int(fields[9])  # end of alignment in subject
        self.evalue = float(fields[10])  # expect value
        self.score = float(fields[11])
        self.query_length = int(fields[12])
        self.subject_length = int(fields[13])
        # NOTE(review): LAST's documentation appears to describe the last
        # BlastTab+ column as the raw score -- confirm before renaming.
        # The historical (misspelled) attribute name is kept for existing
        # callers; ``unidentify`` is a correctly spelled alias.
        self.undientify = int(fields[14])
        self.unidentify = self.undientify
        # Normalise the query interval so that qstart <= qend. The subject
        # coordinates are intentionally left exactly as reported.
        if self.qstart > self.qend:
            self.qstart, self.qend = self.qend, self.qstart
逐行解析后发现单个对象不便于整体操作,于是把所有结果行组成的列表再封装成一个类对象,并借助pandas模块(DataFrame)让筛选更方便
class LastHits(object):
    """A collection of BlastTab+ result lines, kept in three forms.

    Attributes set by ``__init__``:
        last_hits: One ``LastHit`` object per non-blank input line.
        last_result_pd_list: The same lines, each split on tabs into a list
            of raw strings.
        last_result_pd: A pandas ``DataFrame`` built from
            ``last_result_pd_list``; the two id columns stay as strings and
            every other column is converted to ``float``.

    Args:
        last_result_list: Iterable of tab-separated result lines. Blank
            lines are skipped.
    """

    def __init__(self, last_result_list):
        # Blank lines (e.g. the single '' produced by splitting empty
        # program output) carry no hit and would make LastHit fail.
        lines = [line for line in last_result_list if line.strip()]
        self.last_hits = [LastHit(line) for line in lines]
        self.last_result_pd_list = [line.split("\t") for line in lines]
        columns_names = [
            'query_sequence_id', 'subject_sequence_id', 'identity',
            'alignment_length', 'mismatchs', 'gaps', 'qstart', 'qend',
            'sstart', 'send', 'evalue', 'score', 'query_length',
            'subject_length', 'unidentify',
        ]
        frame = pd.DataFrame(self.last_result_pd_list, columns=columns_names)
        # Only the columns after the two sequence ids are numeric. Forcing
        # dtype=float on the whole table either fails or leaves the numbers
        # as strings, depending on the pandas version.
        numeric_dtypes = dict((name, float) for name in columns_names[2:])
        self.last_result_pd = frame.astype(numeric_dtypes)

    def filter_result_pd(self, min_identity=90, min_alignment_length=400):
        """Return the rows of ``last_result_pd`` that pass both thresholds.

        ``self.last_result_pd`` itself is not modified.

        Args:
            min_identity: Keep rows whose ``identity`` is strictly greater
                than this value; ``None`` disables the check.
            min_alignment_length: Keep rows whose ``alignment_length`` is
                strictly greater than this value; ``None`` disables the
                check.

        Returns:
            The filtered ``DataFrame``. When both thresholds are ``None``
            the stored frame is returned as-is.
        """
        filtered = self.last_result_pd
        if min_identity is not None:
            filtered = filtered[filtered['identity'] > min_identity]
        if min_alignment_length is not None:
            filtered = filtered[filtered['alignment_length'] > min_alignment_length]
        return filtered
这样,运行下面代码
import subprocess

# Run lastal against AMR.db and collect its BlastTab+ output.
# The scoring-related options below can be adjusted as needed.
# The command is passed as an argument list (no shell), so the input path is
# handed to lastal verbatim, even if it contains spaces or shell
# metacharacters.
# NOTE(review): -Q is given twice ("-Q1" and "-Q 0"), exactly as in the
# original command line -- confirm which value is intended.
cmd = [
    "lastal", "-Q1", "-P", "4", "-q", "1", "-b", "1", "-Q", "0",
    "-a", "1", "-e", "45", "-f", "BlastTab+", "AMR.db", input_fastq_file,
]
# check_output raises CalledProcessError on a non-zero exit status instead of
# silently yielding empty output.
raw_output = subprocess.check_output(cmd, universal_newlines=True)
# One element per result line; '#' comment lines (previously removed with
# grep) and blank lines are dropped.
last_result_list = [
    line for line in raw_output.splitlines()
    if line and not line.startswith("#")
]
然后把结果解析成LastHits对象
# Wrap the raw result lines in a LastHits object.
last_hits = LastHits(last_result_list)
LastHits对象就拥有了last_hits、last_result_pd_list、last_result_pd属性,并且拥有了filter_result_pd()过滤方法,后面就是不断添加方法
这种思路就是python面向对象的编程思路,我就做个简单演示