Classification Algorithms for Unstructured Text
Data such as height, weight, or votes on a bill shares a common trait: it can be laid out in a table. We call data with this property "structured data".
Each record in such a dataset (a row of the table) is described by a number of features (the columns of the table).
Unstructured data, by contrast, refers to things like email text, tweets, blog posts, and news articles. At first glance, this kind of data cannot be laid out in a single table.
We can classify unstructured text with the naive Bayes algorithm. In the Bayes formula used to pick a category:
h ∈ H means we compute a probability for every candidate hypothesis (category) h in the set of hypotheses H;
P( D | h ) is the probability of the data D given h (for example, the probability that a particular word occurs in articles of a given category);
P( h ) is the prior probability of hypothesis h.
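Putting these together, naive Bayes chooses the hypothesis with the largest posterior, the standard maximum a posteriori (MAP) rule:

h_MAP = arg max_{h ∈ H} P( D | h ) × P( h )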
Training Phase
First, count how many distinct words appear across all the training texts; call this number | Vocabulary | (the size of the vocabulary).
For each word wk we will compute P( wk | hi ). For each hypothesis hi (here there are two: like and dislike), the steps are:
1. Merge all documents of that category into a single text;
2. Count the total number of word occurrences in that combined text (counting duplicates), denoted n;
3. For each word wk in the vocabulary, count how many times it occurs in that combined text, denoted nk;
4. Finally, apply the formula below.
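The formula is the add-one (Laplace) smoothed estimate, which is exactly what the training code later in the article computes:

P( wk | hi ) = ( nk + 1 ) / ( n + | Vocabulary | )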
Classifying with Naive Bayes
Let us judge whether the following sentence is positive or negative:
I am stunned by the hype over gravity.
We need to compute the two probabilities below and pick the larger one:
P( like ) × P( I | like ) × P( am | like ) × P( stunned | like ) × ...
P( dislike ) × P( I | dislike ) × P( am | dislike ) × P( stunned | dislike ) × ...
The 6.22E-22 in the result is scientific notation, i.e. 6.22 × 10^-22.
A product of that many tiny probabilities quickly underflows floating-point precision, so instead of multiplying the probabilities we add their logarithms.
For example, for a document containing 100 words where each word has probability 0.0001, the log-space computation looks like this:
import math

p = 0
for i in range(100):
    p += math.log(0.0001)    # add the log of each word's probability
# p now holds log(0.0001 ** 100), about -921.03
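To see why the logarithms are needed, compare the direct product with the log-space sum (both use the same illustrative 0.0001 probability):

import math

# The direct product underflows: 0.0001 ** 100 is 10^-400, far below the
# smallest positive float (about 5e-324), so Python returns 0.0.
print(0.0001 ** 100)                               # 0.0

# The sum of logarithms stays perfectly representable.
print(sum(math.log(0.0001) for _ in range(100)))   # about -921.03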
Hints:
- b^n = x can be rewritten as log_b(x) = n
- log10(a × b) = log10(a) + log10(b)
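The second rule is easy to verify in Python (200 and 300 are arbitrary numbers):

import math

# log10 of a product equals the sum of the factors' log10 values
print(math.log10(200 * 300))               # about 4.778
print(math.log10(200) + math.log10(300))   # about 4.778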
Common Words and Stop Words
"These words, which form the grammatical fabric of the text, carry little meaning of their own and mostly add noise." (H. P. Luhn)
In other words, removing these "noise" words should improve classification accuracy. We call them "stop words", and ready-made stop-word lists are available. The reasons for removing them are:
1. It reduces the amount of data that has to be processed;
2. Their presence can hurt classification performance.
Words like "the" and "a" indeed carry little meaning, but common words such as "work", "write", or "school" can still be useful in some settings, and putting them on the stop-word list may cause problems. Building a custom stop-word list therefore requires some extra thought.
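As a minimal sketch of the idea (the toy stop-word set below is made up for illustration; the token cleanup mirrors what the classifier code later in the article does):

stopwords = {"the", "a", "an", "and", "of", "to"}    # toy stop-word set, for illustration only

def tokenize(line, stopwords):
    """Split a line into lowercase tokens, dropping punctuation and stop words."""
    tokens = []
    for token in line.split():
        token = token.strip('\'".,?:-').lower()
        if token != '' and token not in stopwords:
            tokens.append(token)
    return tokens

print(tokenize("I am stunned by the hype over gravity.", stopwords))
# ['i', 'am', 'stunned', 'by', 'hype', 'over', 'gravity']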
The classifier's initialization code has to do the following:
1. Read in the stop-word list;
2. Get the names of the subdirectories (one per category) in the training set;
3. For each category, call the train method to count how often each word occurs;
4. Compute, for every word, the Laplace-smoothed probability P( wk | hi ) = ( nk + 1 ) / ( n + | Vocabulary | ) given above.
Initializing the Classifier
from __future__ import print_function
import os, codecs, math

class BayesText:

    def __init__(self, trainingdir, stopwordlist):
        """Naive Bayes text classifier.
        trainingdir   training-set directory; each subdirectory is a category
                      and contains that category's text files
        stopwordlist  stop-word file (one word per line)
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # keep only the entries that are directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print(' ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # remove words that occur fewer than 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # we cannot delete while iterating, so mark the word first
                toDelete.append(word)
        # now delete the marked words
        for word in toDelete:
            del self.vocabulary[word]
        # compute the probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print(' ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1) / denominator)
        print("DONE TRAINING\n\n")

    def train(self, trainingdir, category):
        """Count how often each word occurs in the documents of one category."""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # strip punctuation and lowercase the token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and token not in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1
                        counts.setdefault(token, 0)
                        counts[token] += 1
                        total += 1
            f.close()
        return (counts, total)
Classifying a Document
    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        # sum log probabilities instead of multiplying
                        results[category] += math.log(self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda pair: pair[1], reverse=True)
        # for debugging you can print the whole sorted list here
        return results[0][0]
Classifying All Documents and Computing the Accuracy
    def testCategory(self, directory, category):
        """Classify every file in directory and count how many match category."""
        files = os.listdir(directory)
        total = 0
        correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            if result == category:
                correct += 1
        return (correct, total)

    def test(self, testdir):
        """The test set has the same directory layout as the training set."""
        categories = os.listdir(testdir)
        # keep only the entries that are directories
        categories = [filename for filename in categories
                      if os.path.isdir(testdir + filename)]
        correct = 0
        total = 0
        for category in categories:
            print(".", end="")
            (catCorrect, catTotal) = self.testCategory(
                testdir + category + '/', category)
            correct += catCorrect
            total += catTotal
        print("\n\nAccuracy is %f%% (%i test instances)" %
              ((float(correct) / total) * 100, total))
Putting the pieces together, the complete program:

# -*- coding: utf-8 -*-
'''
Created on 2018-11-28
@author: KingSley
'''
from __future__ import print_function
import os, codecs, math

class BayesText:

    def __init__(self, trainingdir, stopwordlist):
        """Naive Bayes text classifier.
        trainingdir   training-set directory; each subdirectory is a category
                      and contains that category's text files
        stopwordlist  stop-word file (one word per line)
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        f = open(stopwordlist)
        for line in f:
            self.stopwords[line.strip()] = 1
        f.close()
        categories = os.listdir(trainingdir)
        # keep only the entries that are directories
        self.categories = [filename for filename in categories
                           if os.path.isdir(trainingdir + filename)]
        print("Counting ...")
        for category in self.categories:
            print(' ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)
        # remove words that occur fewer than 3 times
        toDelete = []
        for word in self.vocabulary:
            if self.vocabulary[word] < 3:
                # we cannot delete while iterating, so mark the word first
                toDelete.append(word)
        # now delete the marked words
        for word in toDelete:
            del self.vocabulary[word]
        # compute the probabilities
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print(' ' + category)
            denominator = self.totals[category] + vocabLength
            for word in self.vocabulary:
                if word in self.prob[category]:
                    count = self.prob[category][word]
                else:
                    count = 1
                self.prob[category][word] = (float(count + 1) / denominator)
        print("DONE TRAINING\n\n")

    def train(self, trainingdir, category):
        """Count how often each word occurs in the documents of one category."""
        currentdir = trainingdir + category
        files = os.listdir(currentdir)
        counts = {}
        total = 0
        for file in files:
            #print(currentdir + '/' + file)
            f = codecs.open(currentdir + '/' + file, 'r', 'iso8859-1')
            for line in f:
                tokens = line.split()
                for token in tokens:
                    # strip punctuation and lowercase the token
                    token = token.strip('\'".,?:-')
                    token = token.lower()
                    if token != '' and token not in self.stopwords:
                        self.vocabulary.setdefault(token, 0)
                        self.vocabulary[token] += 1
                        counts.setdefault(token, 0)
                        counts[token] += 1
                        total += 1
            f.close()
        return (counts, total)

    def classify(self, filename):
        results = {}
        for category in self.categories:
            results[category] = 0
        f = codecs.open(filename, 'r', 'iso8859-1')
        for line in f:
            tokens = line.split()
            for token in tokens:
                #print(token)
                token = token.strip('\'".,?:-').lower()
                if token in self.vocabulary:
                    for category in self.categories:
                        if self.prob[category][token] == 0:
                            print("%s %s" % (category, token))
                        # sum log probabilities instead of multiplying
                        results[category] += math.log(
                            self.prob[category][token])
        f.close()
        results = list(results.items())
        results.sort(key=lambda pair: pair[1], reverse=True)
        # for debugging you can print the whole sorted list here
        return results[0][0]

    def testCategory(self, directory, category):
        files = os.listdir(directory)
        total = 0
        correct = 0
        for file in files:
            total += 1
            result = self.classify(directory + file)
            if result == category:
                correct += 1
        return (correct, total)

    def test(self, testdir):
        """The test set has the same directory layout as the training set."""
        categories = os.listdir(testdir)
        # keep only the entries that are directories
        categories = [filename for filename in categories
                      if os.path.isdir(testdir + filename)]
        correct = 0
        total = 0
        for category in categories:
            print(".", end="")
            (catCorrect, catTotal) = self.testCategory(
                testdir + category + '/', category)
            correct += catCorrect
            total += catTotal
        print("\n\nAccuracy is %f%% (%i test instances)" %
              ((float(correct) / total) * 100, total))

# adjust these paths to wherever the 20 newsgroups data lives
baseDirectory = '/20news-bydate/'
trainingDir = baseDirectory + '20news-bydate-train/'
testDir = baseDirectory + '20news-bydate-test/'
stoplistfile = "20news-bydate/stopwords0.txt"

# run the classifier with three stop-word lists of different sizes
print("Reg stoplist 0 ")
bT = BayesText(trainingDir, baseDirectory + "stopwords0.txt")
print("Running Test ...")
bT.test(testDir)

print("\n\nReg stoplist 25 ")
bT = BayesText(trainingDir, baseDirectory + "stopwords25.txt")
print("Running Test ...")
bT.test(testDir)

print("\n\nReg stoplist 174 ")
bT = BayesText(trainingDir, baseDirectory + "stopwords174.txt")
print("Running Test ...")
bT.test(testDir)
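Once trained, the classifier can also be applied to a single document. A minimal sketch, continuing from the listing above and assuming the same directory layout (it simply picks the first file of the first test category):

bT = BayesText(trainingDir, baseDirectory + "stopwords174.txt")
category = bT.categories[0]                            # any category present in the data
somefile = os.listdir(testDir + category + '/')[0]     # any file from that category
print(bT.classify(testDir + category + '/' + somefile))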
Original author: Ron Zacharski, CC BY-NC 3.0. https://github.com/egrcc/guidetodatamining