Lexicon Loading
For a description of the lexicon format, see the blog post NLTK VADER lexicon Structure for sentiment analysis.
If you also want to load finance-domain lexicons, the code is as follows:
import csv
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# stock market lexicon
stock_lex = pd.read_csv('./stock_lex.csv')
stock_lex['sentiment'] = (stock_lex['Aff_Score'] + stock_lex['Neg_Score']) / 2
stock_lex = dict(zip(stock_lex.Item, stock_lex.sentiment))
# keep single-word entries only
stock_lex = {k: v for k, v in stock_lex.items() if len(k.split(' ')) == 1}
# rescale the scores to VADER's usual [-4, 4] range
stock_lex_scaled = {}
for k, v in stock_lex.items():
    if v > 0:
        stock_lex_scaled[k] = v / max(stock_lex.values()) * 4
    else:
        stock_lex_scaled[k] = v / min(stock_lex.values()) * -4
# print('stock_lex:{}'.format(stock_lex_scaled))

# Loughran and McDonald word lists
positive = []
with open('./lm_positive.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        positive.append(row[0].strip().lower())

negative = []
with open('./lm_negative.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        entry = row[0].strip().split(" ")
        if len(entry) > 1:  # some rows contain several space-separated words
            negative.extend([aa.lower() for aa in entry])
        else:
            negative.append(entry[0].lower())

# merge everything; later updates take precedence, so VADER's own entries win
final_lex = {}
final_lex.update({word: 2.0 for word in positive})
final_lex.update({word: -2.0 for word in negative})
# print(final_lex)
final_lex.update(stock_lex_scaled)
final_lex.update(sia.lexicon)
sia.lexicon = final_lex
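As a quick sanity check (a sketch; it assumes the two CSV files above exist and were loaded successfully), the augmented analyzer can be queried directly:

# merged lexicon size: LM word lists + stock lexicon + VADER's own entries
print(len(sia.lexicon))
print(sia.polarity_scores("The company beat earnings estimates and raised guidance"))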
Overall Code Structure
The code consists of:
- two classes: SentiText(object) and SentimentIntensityAnalyzer(object)
- four static functions: negated(), normalize(), allcap_differential(), scalar_inc_dec()
- SentiText(object)
_words_plus_punc() attaches each of the defined punctuation marks to the front and back of every punctuation-stripped token, producing a new lookup mapping PT.
_words_plus_punc() input: text = """i'm your baby!! =:)"""
_words_plus_punc() output: {'!!!baby': 'baby', '!!!im': 'im', '!!!your': 'your', '!!baby': 'baby', '!!im': 'im', '!!your': 'your', '!?!?baby': 'baby', ...}
_words_and_emoticons() iterates over the tokens of the text, strips the leading/trailing punctuation from any token that matches an entry in PT, and keeps contractions and most emoticons.
_words_and_emoticons() input: text = """i'm your baby!! =:)"""
_words_and_emoticons() output: ["i'm", 'your', 'baby', '=:)']
Both rely on itertools.product to enumerate the punctuation/word combinations:
import re
import string
from itertools import product

PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]
# strips all punctuation characters
REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))

def _words_plus_punc(self):
    """
    Returns mapping of form:
    {
        'cat,': 'cat',
        ',cat': 'cat',
    }
    """
    # removes punctuation (but loses emoticons and contractions)
    no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
    words_only = no_punc_text.split()
    # remove single-character tokens
    words_only = set(w for w in words_only if len(w) > 1)
    # itertools.product yields pairs such as (',', 'cat') and ('cat', ',')
    punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
    punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
    words_punc_dict = punc_before
    words_punc_dict.update(punc_after)
    return words_punc_dict
def _words_and_emoticons(self):
    """
    Removes leading and trailing punctuation; leaves contractions and most
    emoticons, but does not preserve punc-plus-letter emoticons (e.g. :D).
    """
    wes = self.text.split()
    words_punc_dict = self._words_plus_punc()
    wes = [we for we in wes if len(we) > 1]
    for i, we in enumerate(wes):
        if we in words_punc_dict:
            wes[i] = words_punc_dict[we]
    return wes
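To make the product calls above concrete, here is a tiny standalone run (standard library only):

from itertools import product

words = {'cat'}
punc = [',', '!']
print({''.join(p): p[1] for p in product(punc, words)})
# {',cat': 'cat', '!cat': 'cat'}
print({''.join(p): p[0] for p in product(words, punc)})
# {'cat,': 'cat', 'cat!': 'cat'}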
- SentimentIntensityAnalyzer(object)
1. polarity_scores() returns a float for the sentiment intensity of the input text: positive values indicate positive sentiment, negative values negative sentiment.
def polarity_scores(self, text):
    sentitext = SentiText(text)
    #text, words_and_emoticons, is_cap_diff = self.preprocess(text)
    sentiments = []
    words_and_emoticons = sentitext.words_and_emoticons
    for item in words_and_emoticons:
        valence = 0
        i = words_and_emoticons.index(item)  # position of the token in the text (first occurrence)
        # if the token is not last and starts the bigram "kind of", or the token
        # is in the booster dictionary, keep its score at 0
        if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \
                words_and_emoticons[i + 1].lower() == "of") or \
                item.lower() in BOOSTER_DICT:
            sentiments.append(valence)
            continue
        # otherwise apply the rule-based scoring to the word
        sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
    # finally, check for a "but" clause
    sentiments = self._but_check(words_and_emoticons, sentiments)
    return self.score_valence(sentiments, text)
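Since polarity_scores() is the public entry point, it can be exercised directly. The sentence below is the canonical example from the VADER documentation; the exact numbers may vary slightly across lexicon versions:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
print(sia.polarity_scores("VADER is smart, handsome, and funny!"))
# {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8439}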
2. sentiment_valence() scores each word of the sentence according to a set of rules.
def sentiment_valence(self, valence, sentitext, item, i, sentiments):
    is_cap_diff = sentitext.is_cap_diff
    words_and_emoticons = sentitext.words_and_emoticons
    item_lowercase = item.lower()
    if item_lowercase in self.lexicon:
        # get the hand-rated valence of the sentiment word
        valence = self.lexicon[item_lowercase]
        # if the sentiment word is in ALL CAPS (while other words aren't), add
        # C_INCR to its intensity (positive words get +, negative words get -)
        if item.isupper() and is_cap_diff:
            if valence > 0:
                valence += C_INCR
            else:
                valence -= C_INCR
        for start_i in range(0, 3):
            if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately precede the item) based
                # on their distance from the current sentiment word
                s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
                if start_i == 1 and s != 0:
                    s = s * 0.95
                if start_i == 2 and s != 0:
                    s = s * 0.9
                valence = valence + s
                # check for "never" preceding the sentiment word
                valence = self._never_check(valence, words_and_emoticons, start_i, i)
                # check whether the words around the sentiment word form an idiom
                if start_i == 2:
                    valence = self._idioms_check(valence, words_and_emoticons, i)
                    # future work: consider other sentiment-laden idioms
                    # other_idioms =
                    # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                    #  "upper hand": 1, "break a leg": 2,
                    #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                    #  "on the ball": 2, "under the weather": -2}
        # check for "least" preceding the sentiment word
        valence = self._least_check(valence, words_and_emoticons, i)
    sentiments.append(valence)
    return sentiments
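The 0.95/0.9 damping means a booster two or three tokens before the sentiment word contributes slightly less than one immediately before it. A quick check through the public API (reusing the sia instance from the example above; exact scores depend on the lexicon version):

for s in ["The plot was good.",
          "The plot was very good.",
          "The plot was very very good."]:
    print(s, sia.polarity_scores(s)['compound'])
# each added "very" raises the compound score, the farther one damped by 0.95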
def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding word increases, decreases, or negates/nullifies the
    valence.
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        # check if the booster/dampener word is in ALL CAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else:
                scalar -= C_INCR
    return scalar
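For reference, the constants used by these rules are module-level definitions in the VADER source:

B_INCR = 0.293    # booster increment, e.g. "absolutely good"
B_DECR = -0.293   # booster decrement, e.g. "kind of good"
C_INCR = 0.733    # ALL-CAPS emphasis increment
N_SCALAR = -0.74  # negation scalar, e.g. "not good"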
def _never_check(self, valence, words_and_emoticons, start_i, i):
    if start_i == 0:
        if negated([words_and_emoticons[i-1]]):
            valence = valence * N_SCALAR
    if start_i == 1:
        if words_and_emoticons[i-2] == "never" and \
                (words_and_emoticons[i-1] == "so" or
                 words_and_emoticons[i-1] == "this"):
            valence = valence * 1.5
        elif negated([words_and_emoticons[i-(start_i+1)]]):
            valence = valence * N_SCALAR
    if start_i == 2:
        if words_and_emoticons[i-3] == "never" and \
                (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \
                (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"):
            valence = valence * 1.25
        elif negated([words_and_emoticons[i-(start_i+1)]]):
            valence = valence * N_SCALAR
    return valence
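The negated() helper used here is one of the four static functions. Abridged from the VADER source, it flags explicit negation words, "n't" contractions, and a bare "least":

def negated(input_words, include_nt=True):
    """Determine if the input contains negation words."""
    for word in input_words:
        if word.lower() in NEGATE:  # NEGATE is VADER's list of negation terms
            return True
        if include_nt and "n't" in word.lower():
            return True
    if "least" in input_words:
        i = input_words.index("least")
        if i > 0 and input_words[i - 1] != "at":
            return True
    return False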
def _idioms_check(self, valence, words_and_emoticons, i):
    onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i])
    twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2],
                                      words_and_emoticons[i-1], words_and_emoticons[i])
    twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1])
    threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3],
                                       words_and_emoticons[i-2], words_and_emoticons[i-1])
    threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2])
    sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]
    for seq in sequences:
        if seq in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[seq]
            break
    if len(words_and_emoticons)-1 > i:
        zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1])
        if zeroone in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[zeroone]
    if len(words_and_emoticons)-1 > i+1:
        zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i],
                                          words_and_emoticons[i+1], words_and_emoticons[i+2])
        if zeroonetwo in SPECIAL_CASE_IDIOMS:
            valence = SPECIAL_CASE_IDIOMS[zeroonetwo]
    # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
    if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
        valence = valence + B_DECR
    return valence
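The idiom table consulted here is small; in the VADER source it is:

SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5,
                       "yeah right": -2, "cut the mustard": 2,
                       "kiss of death": -1.5, "hand to mouth": -2}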
def _but_check(self, words_and_emoticons, sentiments):
    # check for the contrastive conjunction "but": scores before "but" are
    # dampened, scores after "but" are amplified
    if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
        try:
            bi = words_and_emoticons.index('but')
        except ValueError:
            bi = words_and_emoticons.index('BUT')
        for sentiment in sentiments:
            si = sentiments.index(sentiment)
            if si < bi:
                sentiments.pop(si)
                sentiments.insert(si, sentiment * 0.5)
            elif si > bi:
                sentiments.pop(si)
                sentiments.insert(si, sentiment * 1.5)
    return sentiments
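The effect is easy to observe by comparing a sentence with and without a "but" clause (reusing the sia instance from above; exact values depend on the lexicon version):

print(sia.polarity_scores("The food is great")['compound'])
print(sia.polarity_scores("The food is great, but the service is awful")['compound'])
# the clause after "but" is weighted 1.5x, typically pulling the compound negative here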
def _least_check(self, valence, words_and_emoticons, i):
    # check for negation case using "least"
    if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \
            and words_and_emoticons[i-1].lower() == "least":
        if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very":
            valence = valence * N_SCALAR
    elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \
            and words_and_emoticons[i-1].lower() == "least":
        valence = valence * N_SCALAR
    return valence
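A quick illustration of the "at least"/"very least" exception (a sketch; scores depend on the lexicon version):

print(sia.polarity_scores("I like it")['compound'])           # positive
print(sia.polarity_scores("I least like it")['compound'])     # flipped by N_SCALAR
print(sia.polarity_scores("I at least like it")['compound'])  # "at least" is exempt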
3. score_valence() computes the overall sentiment intensity of the sentence from the per-word scores.
def score_valence(self, sentiments, text):
    if sentiments:
        sum_s = float(sum(sentiments))
        # amplify the intensity based on the punctuation in the text
        punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
        if sum_s > 0:
            sum_s += punct_emph_amplifier
        elif sum_s < 0:
            sum_s -= punct_emph_amplifier
        compound = normalize(sum_s)
        # discriminate between positive, negative and neutral sentiment scores
        pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
        if pos_sum > math.fabs(neg_sum):
            pos_sum += punct_emph_amplifier
        elif pos_sum < math.fabs(neg_sum):
            neg_sum -= punct_emph_amplifier
        total = pos_sum + math.fabs(neg_sum) + neu_count
        pos = math.fabs(pos_sum / total)
        neg = math.fabs(neg_sum / total)
        neu = math.fabs(neu_count / total)
    else:
        compound = 0.0
        pos = 0.0
        neg = 0.0
        neu = 0.0
    sentiment_dict = \
        {"neg": round(neg, 3),
         "neu": round(neu, 3),
         "pos": round(pos, 3),
         "compound": round(compound, 4)}
    return sentiment_dict
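The compound score comes from normalize(), another of the four static functions; in the VADER source it squashes the raw sum into [-1, 1], with alpha approximating the maximum expected value of sum_s:

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value.
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score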
def _punctuation_emphasis(self, sum_s, text):
    # add emphasis from exclamation points and question marks
    ep_amplifier = self._amplify_ep(text)
    qm_amplifier = self._amplify_qm(text)
    punct_emph_amplifier = ep_amplifier + qm_amplifier
    return punct_emph_amplifier

def _amplify_ep(self, text):
    # check for added emphasis from exclamation points (count at most 4)
    ep_count = text.count("!")
    if ep_count > 4:
        ep_count = 4
    # (empirically derived mean sentiment intensity rating increase for
    # exclamation points)
    ep_amplifier = ep_count * 0.292
    return ep_amplifier

def _amplify_qm(self, text):
    # check for added emphasis from question marks (2 or 3+)
    qm_count = text.count("?")
    qm_amplifier = 0
    if qm_count > 1:
        if qm_count <= 3:
            # (empirically derived mean sentiment intensity rating increase for
            # question marks)
            qm_amplifier = qm_count * 0.18
        else:
            qm_amplifier = 0.96
    return qm_amplifier
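The arithmetic is straightforward: "Good!!!" gives ep_count = 3, so 3 * 0.292 = 0.876 is added to sum_s before normalization. This is observable through the public API (reusing the sia instance from above):

for s in ["Good.", "Good!", "Good!!", "Good!!!"]:
    print(s, sia.polarity_scores(s)['compound'])
# the compound score rises with each exclamation point, capped after four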
def _sift_sentiment_scores(self, sentiments):
    # sum the positive and negative scores separately, and count neutral words
    pos_sum = 0.0
    neg_sum = 0.0
    neu_count = 0
    for sentiment_score in sentiments:
        if sentiment_score > 0:
            pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
        if sentiment_score < 0:
            neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
        if sentiment_score == 0:
            neu_count += 1
    return pos_sum, neg_sum, neu_count
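A worked example of the +1/-1 compensation, reproducing the pos/neg/neu split by hand for a hypothetical score list:

sentiments = [1.9, 0.0, -1.5]                      # hypothetical per-token scores
pos_sum = sum(s + 1 for s in sentiments if s > 0)  # 2.9
neg_sum = sum(s - 1 for s in sentiments if s < 0)  # -2.5
neu_count = sum(1 for s in sentiments if s == 0)   # 1
total = pos_sum + abs(neg_sum) + neu_count         # 6.4
print(round(pos_sum / total, 3),
      round(abs(neg_sum) / total, 3),
      round(neu_count / total, 3))
# 0.453 0.391 0.156 -> the "pos", "neg", "neu" fields (before punctuation emphasis)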