维特比算法实现词性标注

s代表句子，w代表句子中的单词，z代表单词的词性。推导式中用了一些基本的概率公式，有些地方简写了。对于维特比与HMM的概念就不再赘述。
python代码主要分为三步来实现：
1、对数据进行处理；
2、计算HMM三元组的PI、A、B；
3、维特比算法的实现。
数据集可联系私发，代码如下：

#整理数据集
tag2id,id2tag = {},{}
word2id,id2word = {},{}
res = []
for line in open('dataset.txt','r', encoding='UTF-8'):
    temp = line.rstrip().split("/")
    if len(temp)==2:
        word, tag = temp[0], temp[1]
        if word not in word2id:
            word2id[word] = len(word2id)
            id2word[len(id2word)] = word
        if tag not in tag2id:
            tag2id[tag] = len(tag2id)
            id2tag[len(id2tag)] = tag
M = len(word2id)
N = len(tag2id)


#计算PI 、A、B
import numpy as np
PI = np.zeros(N) #初始矩阵；每个词性出现在句首的概率
A = np.zeros((N,M)) #发射矩阵；A[i][j]：给定tag[i] 出现单词j的概率
B = np.zeros((N,N)) #状态转移矩阵；B[i][j]:从前一状态i转移到当前状态j的概率
pre_tag = ""
for line in open("dataset.txt","r",encoding='UTF-8'):
    items = line.rstrip().split("/")
    wordid,tagid= word2id[items[0]],tag2id[items[1]]
    if pre_tag=="": #意味着句子的开始
        PI[tagid] += 1 #先计算出现的次数
        A[tagid][wordid] += 1
    else: #如果不是句子的开头
        A[tagid][wordid] += 1
        B[tag2id[pre_tag]][tagid] += 1
    if items[0] in ["。","！","？"]: #下一个单词是新句子的开始
        pre_tag = ""
    else:
        pre_tag = items[1]

# 把频次转成概率
PI = PI/sum(PI)
for i in range(N):
    A[i] /= sum(A[i])
    B[i] /= sum(B[i])  #到此为止计算完了各个参数

#计算log值，防止出现log0
def log(ele):
    if ele == 0:
        return np.log(ele + 0.000001)
    return np.log(ele)

#利用前向最大匹配进行分词
def forwardMaxMatch(strng):
    dict = [x for x in word2id.keys()]
    result = []
    while len(strng):
        maxlength = 5 if len(strng)>=5 else len(strng)
        sub = strng[:maxlength]
        while sub not in dict and len(sub)>1:
            sub = sub[:len(sub)-1]
        result.append(sub)
        strng = strng[len(sub):]
    return "/".join(result)

#重头戏：利用维特比进行词性标注
def vertebi(x):
    x = forwardMaxMatch(x).split("/")
    res = [word2id[word] for word in x]
    T = len(res)
    ptr = np.zeros((T,N),dtype=int) #定义一个维特比篱笆网，存放下标，故指定为整型
    dp = np.zeros((T,N))
    for j in range(N):  #从前往后计算，所以要首先填充第一列
        dp[0][j] = log(PI[j])+log(A[j][res[0]]) #第一个tag是PI[j]的概率 + 词性是A[j]的情况下是单词x[0]的概率
    for i in range(1,T):  #每个单词
        for j in range(N):  #每个词性
            """"
            dp[i][j],ptr[i][j] = max(dp[i-1][k] + log(B[k][j]) + log(A[j][res[i]],k) for k in range(N))   
            这一行可代替下面六行        
            """
            dp[i][j] = -9999999999
            for k in range(N): #从前一项的每一个k可达到当前的词性j
                scores = dp[i-1][k] + log(B[k][j]) + log(A[j][res[i]]) #前一列的第k个转移过来的概率+状态转移概率（k->j）+词性为A[j]时单词正好为x[i]的概率
                if scores > dp[i][j]:
                    dp[i][j] = scores
                    ptr[i][j] = k #填充篱笆网络  当前最好路径是从k过来的
    # 把最好的tag sequence打印出来（从后到前）
    best_seq= [0]*T #找出篱笆网络最后一列中最大的值得下标
    best_seq[T-1] = np.argmax(dp[T-1])
    for i in range(T-2,-1,-1): #从后往前其次求出每个单词的词性
        best_seq[i]=ptr[i+1][best_seq[i+1]]
    result = []
    for i in range(len(best_seq)):
        result.append(x[i]+'/'+id2tag[best_seq[i]]+" ")
    return "".join(result)

sentence = input("请输入您想要进行词性标注的句子：")
print(vertebi(sentence))

维特比算法实现词性标注

推荐阅读更多精彩内容