通过计算edist距离,如果小于3个碱基错配,则认为识别到引物。
import numpy as np
def reverse_complement(t):
"""反向互补序列"""
completement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C','N': 'N'}
s = ''
for base in t:
s = completement[base] + s
return s
def edist(p,t):
"""
p: primer
t: target sequence
calculate edist distance
"""
p_len = len(p)
t_len = len(t)
data = np.zeros(shape=(p_len+1,t_len+1), dtype=int)
# i for row(primer), j for col(target sequence)
for i in range(p_len+1):
data[i][0] = i
# edist distance
for i in range(1,p_len+1):
for j in range(1,t_len+1):
hor = data[i][j-1] + 1 # horizontal
ver = data[i-1][j] + 1 # vertical
# diag
if p[i-1] == t[j-1]:
delta = 0
else:
delta = 1
dia = data[i-1][j-1] + delta
data[i][j] = min(ver, hor, dia)
return(data, min(data[p_len]))
def primer_align(p,t):
"""
align a single primer to a sequence.
t: target sequence
"""
edist1, forward_dist1 = edist(p, t)
edist2, forward_dist2 = edist(p, reverse_complement(t))
if forward_dist1 < forward_dist2:
return edist1, forward_dist1
else:
return edist2, forward_dist2
def primers_align(f, r, t, mismatch=3):
"""
align a pair of primers to a sequence
"""
# 1/3 region
_, forward_dist = primer_align(f, t[:int(len(t)/3)])
_, reverse_dist = primer_align(reverse_complement(r), t[int(len(t)/3*2):-1])
if forward_dist <= mismatch or reverse_dist <= mismatch:
return 'aligned'
else:
return 'unaligned'
if __name__ == '__main__':
forward = 'ATCG'
fa = 'ACTCA'
print(edist(forward, fa))
(array([[0, 0, 0, 0, 0, 0],
[1, 0, 1, 1, 1, 0],
[2, 1, 1, 1, 2, 1],
[3, 2, 1, 2, 1, 2],
[4, 3, 2, 2, 2, 2]]), 2)