Lexicon Loading
For a description of the lexicon format, see the blog post NLTK VADER lexicon Structure for sentiment analysis.
If you also want to load finance-domain lexicons, the code is as follows:
import csv
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# stock market lexicon
stock_lex = pd.read_csv('./stock_lex.csv')
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score']) / 2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
# keep single-word entries only
stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(' ')) == 1}
# rescale the scores to VADER's usual [-4, 4] range
stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max(stock_lex.values()) * 4
    else:
        stock_lex_scaled[k] = v / min(stock_lex.values()) * -4
# print('stock_lex:{}'.format(stock_lex_scaled))

# Loughran and McDonald word lists
positive = []
with open('./lm_positive.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        positive.append(row[0].strip().lower())

negative = []
with open('./lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:  # some rows contain several space-separated words
            negative.extend([aa.lower() for aa in entry])
        else:
            negative.append(entry[0].lower())

# merge everything; later updates take precedence, so VADER's own entries win
final_lex = {}
final_lex.update({word: 2.0 for word in positive})
final_lex.update({word: -2.0 for word in negative})
# print(final_lex)
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex
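As a quick sanity check (a sketch; it assumes the two CSV files above exist and were loaded successfully), the augmented analyzer can be queried directly:

# merged lexicon size: LM word lists + stock lexicon + VADER's own entries
print(len(sia.lexicon))
print(sia.polarity_scores("The company beat earnings estimates and raised guidance"))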
Overall Code Structure
The code consists of:
- two classes: SentiText(object) and SentimentIntensityAnalyzer(object)
- four static functions: negated(), normalize(), allcap_differential(), scalar_inc_dec()
- SentiText(object)
_words_plus_punc() attaches each of the defined punctuation marks to the front and back of every punctuation-stripped token, producing a new lookup mapping PT.
_words_plus_punc() input: text = """i'm your baby!! =:)"""
_words_plus_punc() output: {'!!!baby': 'baby', '!!!im': 'im', '!!!your': 'your', '!!baby': 'baby', '!!im': 'im', '!!your': 'your', '!?!?baby': 'baby', ...}
_words_and_emoticons() iterates over the tokens of the text, strips the leading/trailing punctuation from any token that matches an entry in PT, and keeps contractions and most emoticons.
_words_and_emoticons() input: text = """i'm your baby!! =:)"""
_words_and_emoticons() output: ["i'm", 'your', 'baby', '=:)']
Both rely on itertools.product to enumerate the punctuation/word combinations:
import re
import string
from itertools import product

PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
# strips all punctuation characters
REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))

def _words_plus_punc(self):
    """
    Returns mapping of form:
    {
        'cat,': 'cat',
        ',cat': 'cat',
    }
    """
    # removes punctuation (but loses emoticons and contractions)
    no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
    words_only = no_punc_text.split()
    # remove single-character tokens
    words_only = set(w for w in words_only if len(w) > 1)
    # itertools.product yields pairs such as (',', 'cat') and ('cat', ',')
    punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
    punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
    words_punc_dict = punc_before
    words_punc_dict.update(punc_after)
    return words_punc_dict
def _words_and_emoticons(self):
    """
    Removes leading and trailing punctuation; leaves contractions and most
    emoticons, but does not preserve punc-plus-letter emoticons (e.g. :D).
    """
    wes = self.text.split()
    words_punc_dict = self._words_plus_punc()
    wes = [we for we in wes if len(we) > 1]
    for i, we in enumerate(wes):
        if we in words_punc_dict:
            wes[i] = words_punc_dict[we]
    return wes
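To make the product calls above concrete, here is a tiny standalone run (standard library only):

from itertools import product

words = {'cat'}
punc = [',', '!']
print({''.join(p): p[1] for p in product(punc, words)})
# {',cat': 'cat', '!cat': 'cat'}
print({''.join(p): p[0] for p in product(words, punc)})
# {'cat,': 'cat', 'cat!': 'cat'}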
- SentimentIntensityAnalyzer(object)
1. polarity_scores() returns a float for the sentiment intensity of the input text: positive values indicate positive sentiment, negative values negative sentiment.
def polarity_scores(self, text):
    sentitext = SentiText(text)
    #text, words_and_emoticons, is_cap_diff = self.preprocess(text)
    sentiments = []
    words_and_emoticons = sentitext.words_and_emoticons
    for item in words_and_emoticons:
        valence = 0
        i = words_and_emoticons.index(item)  # position of the token in the text (first occurrence)
        # if the token is not last and starts the bigram "kind of", or the token
        # is in the booster dictionary, keep its score at 0
        if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
                words_and_emoticons[i + 1].lower() == "of") or \
                item.lower() in BOOSTER_DICT:
            sentiments.append(valence)
            continue
        # otherwise apply the rule-based scoring to the word
        sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
    # finally, check for a "but" clause
    sentiments = self._but_check(words_and_emoticons, sentiments)
    return self.score_valence(sentiments, text)
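Since polarity_scores() is the public entry point, it can be exercised directly. The sentence below is the canonical example from the VADER documentation; the exact numbers may vary slightly across lexicon versions:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
print(sia.polarity_scores("VADER is smart, handsome, and funny!"))
# {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8439}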
2. sentiment_valence() scores each word of the sentence according to a set of rules.
def sentiment_valence(self, valence, sentitext, item, i, sentiments):
    is_cap_diff = sentitext.is_cap_diff
    words_and_emoticons = sentitext.words_and_emoticons
    item_lowercase = item.lower()
    if item_lowercase in self.lexicon:
        # get the hand-rated valence of the sentiment word
        valence = self.lexicon[item_lowercase]
        # if the sentiment word is in ALL CAPS (while other words aren't), add
        # C_INCR to its intensity (positive words get +, negative words get -)
        if item.isupper() and is_cap_diff:
            if valence > 0:
                valence += C_INCR
            else:
                valence -= C_INCR
        for start_i in range(0, 3):
            if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately precede the item) based
                # on their distance from the current sentiment word
                s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
                if start_i == 1 and s != 0:
                    s = s * 0.95
                if start_i == 2 and s != 0:
                    s = s * 0.9
                valence = valence + s
                # check for "never" preceding the sentiment word
                valence = self._never_check(valence, words_and_emoticons, start_i, i)
                # check whether the words around the sentiment word form an idiom
                if start_i == 2:
                    valence = self._idioms_check(valence, words_and_emoticons, i)
                    # future work: consider other sentiment-laden idioms
                    # other_idioms =
                    # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                    #  "upper hand": 1, "break a leg": 2,
                    #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                    #  "on the ball": 2, "under the weather": -2}
        # check for "least" preceding the sentiment word
        valence = self._least_check(valence, words_and_emoticons, i)
    sentiments.append(valence)
    return sentiments
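The 0.95/0.9 damping means a booster two or three tokens before the sentiment word contributes slightly less than one immediately before it. A quick check through the public API (reusing the sia instance from the example above; exact scores depend on the lexicon version):

for s in ["The plot was good.",
          "The plot was very good.",
          "The plot was very very good."]:
    print(s, sia.polarity_scores(s)['compound'])
# each added "very" raises the compound score, the farther one damped by 0.95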
def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding word increases, decreases, or negates/nullifies the
    valence.
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        # check if the booster/dampener word is in ALL CAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else:
                scalar -= C_INCR
    return scalar
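For reference, the constants used by these rules are module-level definitions in the VADER source:

B_INCR = 0.293    # booster increment, e.g. "absolutely good"
B_DECR = -0.293   # booster decrement, e.g. "kind of good"
C_INCR = 0.733    # ALL-CAPS emphasis increment
N_SCALAR = -0.74  # negation scalar, e.g. "not good"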
def _never_check(self, valence, words_and_emoticons, start_i, i):
    if start_i == 0:
        if negated([words_and_emoticons[i-1]]):
            valence = valence * N_SCALAR
    if start_i == 1:
        if words_and_emoticons[i-2] == "never" and \
                (words_and_emoticons[i-1] == "so" or
                 words_and_emoticons[i-1] == "this"):
            valence = valence * 1.5
        elif negated([words_and_emoticons[i-(start_i+1)]]):
            valence = valence * N_SCALAR
    if start_i == 2:
        if words_and_emoticons[i-3] == "never" and \
                (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
                (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
            valence = valence * 1.25
        elif negated([words_and_emoticons[i-(start_i+1)]]):
            valence = valence * N_SCALAR
    return valence
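The negated() helper used here is one of the four static functions. Abridged from the VADER source, it flags explicit negation words, "n't" contractions, and a bare "least":

def negated(input_words, include_nt=True):
    """Determine if the input contains negation words."""
    for word in input_words:
        if word.lower() in NEGATE:  # NEGATE is VADER's list of negation terms
            return True
        if include_nt and "n't" in word.lower():
            return True
    if "least" in input_words:
        i = input_words.index("least")
        if i > 0 and input_words[i - 1] != "at":
            return True
    return False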
def _idioms_check(self, valence, words_and_emoticons, i):
    onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])
    twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
                                      words_and_emoticons[i-1], words_and_emoticons[i])
    twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])
    threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
                                       words_and_emoticons[i-2], words_and_emoticons[i-1])
    threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])
    sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
    for seq in sequences:
        if seq in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[seq]
            break
    if len(words_and_emoticons)-1 > i:
        zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
        if zeroone in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[zeroone]
    if len(words_and_emoticons)-1 > i+1:
        zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i],
                                          words_and_emoticons[i+1], words_and_emoticons[i+2])
        if zeroonetwo in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
    # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
    if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
        valence = valence + B_DECR
    return valence
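The idiom table consulted here is small; in the VADER source it is:

SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5,
                       "yeah right": -2, "cut the mustard": 2,
                       "kiss of death": -1.5, "hand to mouth": -2}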
def _but_check(self, words_and_emoticons, sentiments):
    # check for the contrastive conjunction "but": scores before "but" are
    # dampened, scores after "but" are amplified
    if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
        try:
            bi = words_and_emoticons.index('but')
        except ValueError:
            bi = words_and_emoticons.index('BUT')
        for sentiment in sentiments:
            si = sentiments.index(sentiment)
            if si < bi:
                sentiments.pop(si)
                sentiments.insert(si, sentiment * 0.5)
            elif si > bi:
                sentiments.pop(si)
                sentiments.insert(si, sentiment * 1.5)
    return sentiments
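The effect is easy to observe by comparing a sentence with and without a "but" clause (reusing the sia instance from above; exact values depend on the lexicon version):

print(sia.polarity_scores("The food is great")['compound'])
print(sia.polarity_scores("The food is great, but the service is awful")['compound'])
# the clause after "but" is weighted 1.5x, typically pulling the compound negative here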
def _least_check(self, valence, words_and_emoticons, i):
    # check for negation case using "least"
    if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
            and words_and_emoticons[i-1].lower() == "least":
        if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
            valence = valence * N_SCALAR
    elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
            and words_and_emoticons[i-1].lower() == "least":
        valence = valence * N_SCALAR
    return valence
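A quick illustration of the "at least"/"very least" exception (a sketch; scores depend on the lexicon version):

print(sia.polarity_scores("I like it")['compound'])           # positive
print(sia.polarity_scores("I least like it")['compound'])     # flipped by N_SCALAR
print(sia.polarity_scores("I at least like it")['compound'])  # "at least" is exempt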
3. score_valence() computes the overall sentiment intensity of the sentence from the per-word scores.
def score_valence(self, sentiments, text):
    if sentiments:
        sum_s = float(sum(sentiments))
        # amplify the intensity based on the punctuation in the text
        punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
        if sum_s > 0:
            sum_s += punct_emph_amplifier
        elif sum_s < 0:
            sum_s -= punct_emph_amplifier
        compound = normalize(sum_s)
        # discriminate between positive, negative and neutral sentiment scores
        pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
        if pos_sum > math.fabs(neg_sum):
            pos_sum += punct_emph_amplifier
        elif pos_sum < math.fabs(neg_sum):
            neg_sum -= punct_emph_amplifier
        total = pos_sum + math.fabs(neg_sum) + neu_count
        pos = math.fabs(pos_sum / total)
        neg = math.fabs(neg_sum / total)
        neu = math.fabs(neu_count / total)
    else:
        compound = 0.0
        pos = 0.0
        neg = 0.0
        neu = 0.0
    sentiment_dict = \
        {"neg": round(neg, 3),
         "neu": round(neu, 3),
         "pos": round(pos, 3),
         "compound": round(compound, 4)}
    return sentiment_dict
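The compound score comes from normalize(), another of the four static functions; in the VADER source it squashes the raw sum into [-1, 1], with alpha approximating the maximum expected value of sum_s:

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value.
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score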
def _punctuation_emphasis(self, sum_s, text):
    # add emphasis from exclamation points and question marks
    ep_amplifier = self._amplify_ep(text)
    qm_amplifier = self._amplify_qm(text)
    punct_emph_amplifier = ep_amplifier + qm_amplifier
    return punct_emph_amplifier

def _amplify_ep(self, text):
    # check for added emphasis from exclamation points (count at most 4)
    ep_count = text.count("!")
    if ep_count > 4:
        ep_count = 4
    # (empirically derived mean sentiment intensity rating increase for
    # exclamation points)
    ep_amplifier = ep_count * 0.292
    return ep_amplifier

def _amplify_qm(self, text):
    # check for added emphasis from question marks (2 or 3+)
    qm_count = text.count("?")
    qm_amplifier = 0
    if qm_count > 1:
        if qm_count <= 3:
            # (empirically derived mean sentiment intensity rating increase for
            # question marks)
            qm_amplifier = qm_count * 0.18
        else:
            qm_amplifier = 0.96
    return qm_amplifier
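The arithmetic is straightforward: "Good!!!" gives ep_count = 3, so 3 * 0.292 = 0.876 is added to sum_s before normalization. This is observable through the public API (reusing the sia instance from above):

for s in ["Good.", "Good!", "Good!!", "Good!!!"]:
    print(s, sia.polarity_scores(s)['compound'])
# the compound score rises with each exclamation point, capped after four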
def _sift_sentiment_scores(self, sentiments):
    # sum the positive and negative scores separately, and count neutral words
    pos_sum = 0.0
    neg_sum = 0.0
    neu_count = 0
    for sentiment_score in sentiments:
        if sentiment_score > 0:
            pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
        if sentiment_score < 0:
            neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
        if sentiment_score == 0:
            neu_count += 1
    return pos_sum, neg_sum, neu_count
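A worked example of the +1/-1 compensation, reproducing the pos/neg/neu split by hand for a hypothetical score list:

sentiments = [1.9, 0.0, -1.5]                      # hypothetical per-token scores
pos_sum = sum(s + 1 for s in sentiments if s > 0)  # 2.9
neg_sum = sum(s - 1 for s in sentiments if s < 0)  # -2.5
neu_count = sum(1 for s in sentiments if s == 0)   # 1
total = pos_sum + abs(neg_sum) + neu_count         # 6.4
print(round(pos_sum / total, 3),
      round(abs(neg_sum) / total, 3),
      round(neu_count / total, 3))
# 0.453 0.391 0.156 -> the "pos", "neg", "neu" fields (before punctuation emphasis)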