1. Steps: word segmentation, POS tagging, and named entity recognition
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import sys
# Python 2 workaround: make UTF-8 the default string encoding
reload(sys)
sys.setdefaultencoding('utf8')
# To make sure everything runs, refer to the complete code appendix at the end
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import SementicRoleLabeller  # note: this spelling is pyltp's own class name
from pyltp import NamedEntityRecognizer
from pyltp import Parser
class extractEntity:
    def __init__(self):
        # sets avoid duplicate entries
        self.persons = set()
        self.locations = set()
        self.institutions = set()
    # Sentence splitting: cut a piece of text into individual sentences
    def sentence_splitter(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
        sents = SentenceSplitter.split(sentence)  # split into sentences
        # print('\n'.join(sents))
        return sents
    # Word segmentation test
    def segmentor(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
        segmentor = Segmentor()  # initialize the instance
        segmentor.load('/Users/Zd/Downloads/ltp_data_v3.4.0/cws.model')  # load the model
        words = segmentor.segment(sentence)  # segment the sentence
        # the result can also be printed directly:
        # print('\t'.join(words))
        words_list = list(words)  # convert to a plain list before releasing the model
        segmentor.release()  # release the model
        return words_list
    # POS tagging
    def posttagger(self, words):
        postagger = Postagger()  # initialize the instance
        postagger.load('/Users/Zd/Downloads/ltp_data_v3.4.0/pos.model')  # load the model
        postags = list(postagger.postag(words))  # POS tagging; copy to a list before release
        # for word, tag in zip(words, postags):
        #     print(word + '/' + tag)
        postagger.release()  # release the model
        return postags
    # Named entity recognition
    def ner(self, words, postags):
        recognizer = NamedEntityRecognizer()  # initialize the instance
        recognizer.load('/Users/Zd/Downloads/ltp_data_v3.4.0/ner.model')  # load the model
        netags = list(recognizer.recognize(words, postags))  # run NER; copy before release
        for word, ntag in zip(words, netags):
            # print(word + '/' + ntag)
            if ntag == 'S-Nh':    # single-token person name
                self.persons.add(word)
            elif ntag == 'S-Ns':  # single-token place name
                self.locations.add(word)
            elif ntag == 'S-Ni':  # single-token institution name
                self.institutions.add(word)
        recognizer.release()  # release the model
        return netags
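    # Note: the loop above keeps only single-token entities (the S- tags).
    # LTP's NE tag set also marks multi-token entities with B-/I-/E- prefixes
    # (begin/inside/end of a span), so e.g. an institution name tagged
    # B-Ni, E-Ni is silently dropped. A minimal span-merging sketch follows;
    # ner_with_spans is a hypothetical helper, not part of the original pipeline.
    def ner_with_spans(self, words, netags):
        target = {'Nh': self.persons, 'Ns': self.locations, 'Ni': self.institutions}
        buf = []  # tokens of the multi-token span currently being assembled
        for word, ntag in zip(words, netags):
            if ntag.startswith('S-'):            # complete single-token entity
                target[ntag[2:]].add(word)
            elif ntag.startswith('B-') or ntag.startswith('I-'):
                buf.append(word)                 # a span begins or continues
            elif ntag.startswith('E-'):          # the span ends: join and record it
                buf.append(word)
                target[ntag[2:]].add(''.join(buf))
                buf = []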
    # Dependency parsing
    def parse(self, words, postags):
        parser = Parser()  # initialize the instance
        parser.load('/Users/Zd/Downloads/ltp_data_v3.4.0/parser.model')  # load the model
        arcs = parser.parse(words, postags)  # dependency parsing
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        parser.release()  # release the model
        return arcs
    # Semantic role labeling
    def role_label(self, words, postags, netags, arcs):
        labeller = SementicRoleLabeller()  # initialize the instance
        labeller.load('/Users/Zd/Downloads/ltp_data_v3.4.0/srl')  # load the model
        roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
        for role in roles:
            print("%d %s" % (role.index, "".join(
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments)))
        labeller.release()  # release the model
        return roles
tree = ET.parse('/Users/Zd/Desktop/newsSpider/newsSpider/extractEntity/scrawlToXML.xml')
root = tree.getroot()
# instantiate the extractor
ee = extractEntity()
# For each <head> and <content> element, attach <persons>, <locations> and
# <institutions> child nodes and fill them with the recognized entities.
def annotate(tag):
    for element in root.iter(tag):
        # add child elements: persons, locations, institutions
        p = ET.SubElement(element, 'persons')
        p.text = ''
        l = ET.SubElement(element, 'locations')
        l.text = ''
        i = ET.SubElement(element, 'institutions')
        i.text = ''
        if element.text is not None:
            # word segmentation
            words = ee.segmentor(element.text.encode('utf-8'))
            # POS tagging
            tags = ee.posttagger(words)
            # empty the sets at the start of each news item
            ee.persons = set()
            ee.locations = set()
            ee.institutions = set()
            # named entity recognition
            ee.ner(words, tags)
            # join the recognized names into the new child nodes
            for s in ee.persons:
                p.text = p.text + ' ' + s
            for s in ee.locations:
                l.text = l.text + ' ' + s
            for s in ee.institutions:
                i.text = i.text + ' ' + s

annotate('head')
annotate('content')
# write out the new XML file
tree.write('entity.xml', encoding='utf-8')
# Test driver (commented out): runs each step in order
# print('****************** The tests below run in sequence: **********************')
# ee.sentence_splitter()
# print('############### end of sentence-splitting test ###############')
# words = ee.segmentor('我家在昆明,我现在在北京上学。中秋节你是否会想到李白?还有,微博是MebiuW')
# print('############### end of word-segmentation test ###############')
# tags = ee.posttagger(words)
# print('############### end of POS-tagging test ###############')
# netags = ee.ner(words, tags)
# print('############### end of NER test ###############')
# arcs = ee.parse(words, tags)
# print('############### end of dependency-parsing test ###############')
# roles = ee.role_label(words, tags, netags, arcs)
# print('############### end of SRL test ###############')
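To sanity-check the result, here is a minimal sketch (assuming entity.xml was produced by the script above in the current directory) that reads the annotated file back and prints the entities attached to each <head> node:

# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET

# Read back the annotated file and print the extracted person names.
tree = ET.parse('entity.xml')
for element in tree.getroot().iter('head'):
    persons = element.find('persons')
    if persons is not None and persons.text:
        print(persons.text.strip())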
2. Code reference:
https://blog.csdn.net/MebiuW/article/details/52496920
3. LTP API:
https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id4
4. Reference for CRUD operations on XML document nodes (a minimal sketch follows below):
https://blog.csdn.net/lihao21/article/details/72891932
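For item 4, a minimal self-contained sketch of the four node operations with xml.etree.ElementTree; the element names here are made up for illustration:

# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET

root = ET.fromstring('<news><head>title</head></news>')
# Create: attach a new child node
ET.SubElement(root, 'persons').text = 'example'
# Read: locate a node and read its text
print(root.find('head').text)
# Update: overwrite a node's text
root.find('persons').text = 'updated'
# Delete: remove a child from its parent
root.remove(root.find('persons'))
print(ET.tostring(root))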