s代表句子,w代表句子中的单词,z代表单词的词性。推导式中用了一些基本的概率公式,有些地方简写了。对于维特比与HMM的概念就不再赘述。
python代码主要分为三步来实现:
1、对数据进行处理;
2、计算HMM三元组的PI、A、B;
3、维特比算法的实现。
数据集可联系私发,代码如下:
# --- Build word/tag id dictionaries from the training corpus ---
# Each line of dataset.txt is "word/tag"; ids are assigned in first-seen order.
tag2id, id2tag = {}, {}
word2id, id2word = {}, {}
res = []  # NOTE(review): unused in this file; kept as a module attribute for compatibility
with open('dataset.txt', 'r', encoding='UTF-8') as f:  # with-block closes the file
    for line in f:
        temp = line.rstrip().split("/")
        if len(temp) == 2:  # skip blank / malformed lines
            word, tag = temp
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word[len(id2word)] = word
            if tag not in tag2id:
                tag2id[tag] = len(tag2id)
                id2tag[len(id2tag)] = tag
M = len(word2id)  # vocabulary size
N = len(tag2id)   # number of distinct tags
# --- Estimate the HMM triple (PI, A, B) by counting, then normalizing ---
import numpy as np

PI = np.zeros(N)      # initial distribution: P(tag at sentence start)
A = np.zeros((N, M))  # emission matrix: A[i][j] = P(word j | tag i)
B = np.zeros((N, N))  # transition matrix: B[i][j] = P(tag j | previous tag i)
pre_tag = ""          # empty string marks "we are at a sentence start"
with open("dataset.txt", "r", encoding='UTF-8') as f:
    for line in f:
        items = line.rstrip().split("/")
        if len(items) != 2:  # skip blank/malformed lines, same as the first pass
            continue
        wordid, tagid = word2id[items[0]], tag2id[items[1]]
        if pre_tag == "":            # first token of a sentence
            PI[tagid] += 1           # count, normalized to a probability below
            A[tagid][wordid] += 1
        else:                        # mid-sentence token
            A[tagid][wordid] += 1
            B[tag2id[pre_tag]][tagid] += 1
        # Sentence-ending punctuation resets the chain for the next token.
        if items[0] in ["。","!","?"]:
            pre_tag = ""
        else:
            pre_tag = items[1]
# Turn the raw counts into probabilities (row-normalize).
PI = PI / PI.sum()
for i in range(N):
    A[i] /= A[i].sum()
    B[i] /= B[i].sum()
# Safe log: avoid -inf from log(0) by substituting a tiny epsilon.
def log(ele):
    """Return the natural log of ``ele``, using log(1e-6) when ``ele`` is 0."""
    return np.log(0.000001) if ele == 0 else np.log(ele)
# Segment a sentence with greedy forward maximum matching.
def forwardMaxMatch(strng, vocab=None, max_len=5):
    """Segment ``strng`` by forward maximum matching.

    At each position, try the longest window (up to ``max_len`` characters)
    and shrink it until it is a known word or a single character remains.

    Args:
        strng: the raw sentence to segment.
        vocab: iterable of known words; defaults to the training vocabulary
            (keys of the module-level ``word2id``).
        max_len: maximum window length in characters (default 5, as before).

    Returns:
        The segments joined by '/'.
    """
    # A set gives O(1) membership tests (the original scanned a list).
    words = set(word2id) if vocab is None else set(vocab)
    result = []
    while strng:
        sub = strng[:max_len]  # slicing past the end is safe in Python
        # Shrink until we hit a dictionary word or a single character.
        while sub not in words and len(sub) > 1:
            sub = sub[:-1]
        result.append(sub)
        strng = strng[len(sub):]
    return "/".join(result)
# The main event: Viterbi decoding for POS tagging.
def vertebi(x):
    """POS-tag sentence ``x`` using the Viterbi algorithm over the trained HMM.

    The sentence is first segmented by forward maximum matching, then the
    most probable tag sequence is recovered by dynamic programming over the
    lattice (dp holds log-probabilities, ptr holds back-pointers).

    Returns:
        "word/tag word/tag ... " (with a trailing space, as the original did).

    Raises:
        KeyError: if a segmented token is not in the training vocabulary.
    """
    words = forwardMaxMatch(x).split("/")
    obs = [word2id[w] for w in words]  # observation ids; KeyError on OOV tokens
    T = len(obs)
    ptr = np.zeros((T, N), dtype=int)  # back-pointer lattice (tag indices)
    dp = np.zeros((T, N))              # dp[i][j]: best log-prob ending at tag j
    # First column: start probability + emission of the first word.
    for j in range(N):
        dp[0][j] = log(PI[j]) + log(A[j][obs[0]])
    for i in range(1, T):       # each word
        for j in range(N):      # each candidate tag for word i
            dp[i][j] = -np.inf  # -inf, not a magic large-negative constant
            for k in range(N):  # each predecessor tag
                # path ending at k + transition k->j + emission of word i by tag j
                score = dp[i - 1][k] + log(B[k][j]) + log(A[j][obs[i]])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k  # best path into (i, j) came from tag k
    # Backtrack from the best final tag to recover the sequence.
    best_seq = [0] * T
    best_seq[T - 1] = int(np.argmax(dp[T - 1]))
    for i in range(T - 2, -1, -1):
        best_seq[i] = int(ptr[i + 1][best_seq[i + 1]])
    return "".join(words[i] + '/' + id2tag[best_seq[i]] + " " for i in range(T))
# Only prompt for input when run as a script, not when imported as a module.
if __name__ == "__main__":
    sentence = input("请输入您想要进行词性标注的句子:")
    print(vertebi(sentence))