import sys
from Bio import GenBank
from Bio import SeqIO
# Convert every CDS feature of a GenBank flat file into a protein FASTA record.
#
# Each FASTA header has the form
#   ><locus_tag>, from <record name> protein_id=<protein_id> product=<product>
# and is followed by the amino-acid sequence stored in the CDS feature's
# /translation qualifier.
#
# Edit the two paths below to point at your own input and output files.
gbk_filename = "GCA_000299965.1_ASM29996v1_genomic.gbff"
faa_filename = "gbff_converted.faa"

# Context managers guarantee both files are closed even if parsing or
# formatting raises part-way through.
with open(gbk_filename, "r") as input_handle, \
        open(faa_filename, "w") as output_handle:
    for seq_record in SeqIO.parse(input_handle, "genbank"):
        print("Dealing with GenBank record %s" % seq_record.id)
        for seq_feature in seq_record.features:
            if seq_feature.type != "CDS":
                continue

            qualifiers = seq_feature.qualifiers
            # Descriptive qualifiers are optional in practice; fall back to a
            # placeholder instead of aborting the whole run with a KeyError.
            locus_tag = qualifiers.get('locus_tag', ['unknown'])[0]
            protein_id = qualifiers.get('protein_id', ['unknown'])[0]
            product = qualifiers.get('product', ['unknown'])[0]

            translations = qualifiers.get('translation')
            if not translations:
                # Without a /translation there is no protein sequence to
                # write (e.g. CDS features annotated as pseudogenes may lack
                # one), so report the feature and move on.
                sys.stderr.write(
                    "Skipping CDS %s in %s: no translation qualifier\n"
                    % (locus_tag, seq_record.id))
                continue
            # Explicit check rather than `assert`, which is stripped under -O.
            if len(translations) != 1:
                raise ValueError(
                    "CDS %s in %s has %d translation qualifiers, expected 1"
                    % (locus_tag, seq_record.id, len(translations)))

            output_handle.write(">%s, from %s protein_id=%s product=%s\n%s\n" % (
                locus_tag,
                seq_record.name,
                protein_id,
                product,
                translations[0]))
print("Done")
上面的脚本从 gbk 文件中提取出来的信息如下图所示。
包含的信息有:
1.locus_tag,
2.protein_id (可在 NCBI 的 Batch Entrez 中搜索的蛋白 accession 号)
3.蛋白质产物
4.对应序列
可根据需求修改脚本,比如输入和输出的文件名;主要通过上面的 seq_feature.qualifiers 进行提取,可按需添加或删减要输出的 feature 限定符(qualifier)。
参考地址