blasttab+结果写成class
先每一行解析一下
class LastHit(object):
    """One parsed hit line from LAST's BlastTab+ output.

    BlastTab+ extends the 12 standard BLAST tabular columns with query
    length, subject length and one extra count, giving 15 tab-separated
    fields per line.
    """

    def __init__(self, line):
        fields = line.rstrip().split("\t")
        self.query_sequence_id = fields[0]
        self.subject_sequence_id = fields[1]
        self.identity = float(fields[2])        # percent identity
        self.alignment_length = int(fields[3])
        self.mismatchs = int(fields[4])         # misspelling of "mismatches" kept for backward compatibility
        self.gaps = int(fields[5])
        self.qstart = int(fields[6])            # qstart: start of alignment in query
        self.qend = int(fields[7])              # qend: end of alignment in query
        self.sstart = int(fields[8])            # sstart: start of alignment in subject
        self.send = int(fields[9])              # send: end of alignment in subject
        self.evalue = float(fields[10])         # evalue: expect value
        self.score = float(fields[11])
        self.query_length = int(fields[12])
        self.subject_length = int(fields[13])
        # Misspelled attribute kept for backward compatibility; the
        # correctly spelled alias below should be preferred.
        self.undientify = int(fields[14])
        self.unidentify = self.undientify
        # Normalise query coordinates so that qstart <= qend.
        if self.qstart >= self.qend:
            self.qstart, self.qend = self.qend, self.qstart
发现不是很方便,那就把每一行形成一个列表写成类对象,用了pandas模块方便一点
class LastHits(object):
    """A collection of BlastTab+ hit lines, kept both as LastHit objects
    and as a pandas DataFrame for vectorised filtering/counting."""

    def __init__(self, last_result_list):
        """*last_result_list* is a list of raw tab-separated hit lines."""
        # Per-line parsed objects.
        self.last_hits = [LastHit(line) for line in last_result_list]
        # Raw split fields, one list per line (kept for backward compatibility).
        self.last_result_pd_list = [line.split("\t") for line in last_result_list]
        columns_names = ['query_sequence_id', 'subject_sequence_id', 'identity',
                         'alignment_length', 'mismatchs', 'gaps', 'qstart', 'qend',
                         'sstart', 'send', 'evalue', 'score', 'query_length',
                         'subject_length', 'unidentify']
        df = pd.DataFrame(self.last_result_pd_list, columns=columns_names)
        # Cast only the numeric columns: DataFrame(..., dtype=float) raises on
        # the string ID columns in modern pandas.
        numeric_cols = [c for c in columns_names
                        if c not in ('query_sequence_id', 'subject_sequence_id')]
        df[numeric_cols] = df[numeric_cols].astype(float)
        self.last_result_pd = df

    def _filter_result_pd(self, result_pd, min_identity, min_alignment_length):
        """Return the rows of *result_pd* passing both thresholds (strictly
        greater than), keeping for each query only the row(s) with the
        highest identity (ties retained). Either threshold may be None to
        skip that filter."""
        filtered = result_pd
        if min_identity is not None:
            filtered = filtered[filtered['identity'] > min_identity]
        if min_alignment_length is not None:
            # BUGFIX: previously this line referenced an undefined variable
            # when min_identity was None.
            filtered = filtered[filtered['alignment_length'] > min_alignment_length]
        # Per query_sequence_id, keep the row(s) whose identity equals the
        # group maximum.
        best = filtered.groupby('query_sequence_id')['identity'].transform('max')
        return filtered[filtered['identity'] == best]

    def count_filter_result_pd(self, min_identity=90, min_alignment_length=400):
        """Aggregate the filtered hits per subject sequence.

        Returns a DataFrame indexed by subject_sequence_id with columns
        Depth (hit count), Ave_Identity, Ave_Length (mean query length),
        Ave_Score and Gene_names, rounded to 2 decimals.
        """
        filtered = self._filter_result_pd(self.last_result_pd, min_identity,
                                          min_alignment_length)
        # BUGFIX: the original renamed the aggregated columns in the wrong
        # order (the identity mean became "Depth" and the row count became
        # "Ave_Score"). Named aggregation makes the mapping explicit.
        counts = filtered.groupby('subject_sequence_id').agg(
            Depth=('identity', 'size'),
            Ave_Identity=('identity', 'mean'),
            Ave_Length=('query_length', 'mean'),
            Ave_Score=('score', 'mean'),
        )
        counts["Gene_names"] = counts.index
        return round(counts, 2)
这样,运行下面代码
# Run lastal against the AMR reference DB, emitting BlastTab+ lines.
# NOTE(review): both -Q1 and a later -Q 0 are passed; the last occurrence
# wins — confirm which input-quality encoding is actually intended.
# NOTE(review): the command runs through a shell, so input_fastq_file must
# come from a trusted source (or be shell-escaped).
cmd = 'lastal -Q1 -P 4 -q 1 -b 1 -Q 0 -a 1 -e 45 -f BlastTab+ AMR.db {}'.format(input_fastq_file)  # scoring-matrix parameters can be tuned here
cmd += " | grep -v ^# -"  # strip '#' comment/header lines from the output
last_hits = []
# BUGFIX: the popen handle was never closed; the with-block closes it and
# reaps the child process.
with os.popen(cmd) as f:
    last_result_list = f.read().rstrip().split("\n")  # one hit line per list element
然后把结果解析成LastHits对象
# Parse the raw lastal output lines into a single LastHits object
# (per-line objects plus a pandas DataFrame view).
last_hits = LastHits(last_result_list)
LastHits对象就拥有了last_hits、last_result_pd_list、last_result_pd属性,并且拥有了count_filter_result_pd()过滤统计方法,后面就是不断添加方法。
这种思路就是python面向对象的编程思路,我就做个简单演示
reference也可以做同样的处理
class ReferenceFasta(object):
    """Reference FASTA loaded as {header: sequence} plus a zero-initialised
    per-gene count table used as the template for depth reporting."""

    def __init__(self, reference_fasta_file):
        """Parse *reference_fasta_file*.

        Builds:
          self.ref_fa_dict                 -- header (without '>') -> sequence
          self.ref_fa_names                -- DataFrame with a Gene_names column
          self.ref_fa_count_initialization -- Gene_names plus zeroed Depth,
                                              Ave_Length, Ave_Score, Ave_Identity
        """
        seq_parts = {}  # header -> list of sequence chunks, joined once below
        read_name = None
        with open(reference_fasta_file, "r") as ref_fa:
            for raw in ref_fa:
                line = raw.strip()  # drop surrounding whitespace/newline only
                if not line:
                    continue
                if line.startswith('>'):
                    read_name = line.lstrip('>')
                    # NOTE(review): the original computed
                    # re.sub(r'\..*', '', read_name) here — apparently to
                    # shorten NCBI-style headers (e.g. 'NC_002944.2 ...' ->
                    # 'NC_002944') for BED-style labels — but never used the
                    # result. The full header is kept as the key, matching
                    # the original behaviour; confirm which key downstream
                    # consumers expect.
                    seq_parts[read_name] = []
                else:
                    if read_name is None:
                        raise ValueError(
                            "FASTA sequence data found before the first '>' header")
                    seq_parts[read_name].append(line)
        # Join chunks once per record instead of quadratic '+=' concatenation.
        self.ref_fa_dict = {name: ''.join(parts) for name, parts in seq_parts.items()}
        self.ref_fa_names = pd.DataFrame(list(self.ref_fa_dict.keys()),
                                         columns=["Gene_names"], dtype=object)
        counts = self.ref_fa_names.copy(deep=True)
        for col in ('Depth', 'Ave_Length', 'Ave_Score', 'Ave_Identity'):
            counts[col] = 0.0
        self.ref_fa_count_initialization = counts

    def count_result_depth(self, filter_result_count_pd):
        """Copy the zero-initialised table and fill in the rows for every
        gene present in *filter_result_count_pd* (indexed by gene name,
        with Gene_names/Depth/Ave_Length/Ave_Score/Ave_Identity columns).
        Genes absent from the input keep their zero rows."""
        result_depth_count = self.ref_fa_count_initialization.copy(deep=True)
        for gene in filter_result_count_pd['Gene_names'].to_list():
            mask = result_depth_count['Gene_names'] == gene  # row mask computed once per gene
            for col in ('Depth', 'Ave_Length', 'Ave_Score', 'Ave_Identity'):
                result_depth_count.loc[mask, col] = filter_result_count_pd.loc[gene, col]
        return result_depth_count
瞬间觉得可重复了