在学习文本分类的时候发现主要有以下几个步骤,借助代码说明(代码大多参考《机器学习算法原理与编程实战》,不过发现给的语料有些编码问题,并且本人用的是Python3.6+windows,所以进行了部分修改)。
主要步骤
- 将训练集中的所有文本用jieba分词保存到另外一个文件
- 统计分词后文本的TF-IDF,转化为词频向量
- 去掉停用词
- 应用sklearn分类
文档分词
这个是一层层进入文件,然后将结果又一层层保存到文件,原文档如下结构
需要分词的文档是这种三级结构,分词后也是得到对应的三级结构。
统计文本的TF-IDF,这用到sklearn中的函数,直接见代码注释,我发现在pycharm中调试代码,就能很好理解这些sklearn中的函数了,一目了然
测试集
文中的测试集是每个种类抽取十来个文本,当然标签也带上,上图的“环境200”,“计算机200” 就是类别标签(数字指里面的文档数,我直接带上了)。如图,测试集(我手动添加的测试集),然后搞了点小动作,故意将一篇文档放错到别的类别文件夹中。
代码实现
代码能全部执行,环境是Python3,需要安装sklearn(前面介绍了安装),还需要更改文件路径
import jieba
import os
import pickle #持久化
from numpy import *
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer #TF-IDF向量转换类
from sklearn.feature_extraction.text import TfidfVectorizer #TF_IDF向量生成类
from sklearn.datasets.base import Bunch
from sklearn.naive_bayes import MultinomialNB #多项式贝叶斯算法
def readFile(path, encoding=None):
    """Read a text file and return its entire content as one string.

    Args:
        path: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the previous behaviour of using the platform's default
            encoding; pass an explicit value such as ``"utf-8"`` or ``"gbk"``
            to make the result independent of the machine running the script.

    Returns:
        The decoded text of the file.
    """
    # Some corpus files contain badly encoded bytes, so undecodable bytes are
    # silently dropped (errors='ignore') instead of raising UnicodeDecodeError.
    with open(path, 'r', encoding=encoding, errors='ignore') as file:
        return file.read()
def saveFile(path, result, encoding=None):
    """Write a string to a text file, replacing any existing content.

    Args:
        path: Path of the file to write.
        result: Text to store in the file.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the previous behaviour of using the platform's default
            encoding; pass an explicit value such as ``"utf-8"`` to make the
            written bytes independent of the machine running the script.
    """
    # errors='ignore' drops characters the target encoding cannot represent
    # instead of raising UnicodeEncodeError.
    with open(path, 'w', encoding=encoding, errors='ignore') as file:
        file.write(result)
def segText(inputPath, resultPath):
    """Segment every document of a two-level corpus with jieba.

    ``inputPath`` is scanned for sub-directories (one per category) and each
    file inside them is read, segmented and written to the same relative
    location under ``resultPath``. Every output file holds the jieba tokens of
    its source file joined by single spaces. The name of each processed file
    is printed as a progress indicator.

    Args:
        inputPath: Root directory of the raw corpus.
        resultPath: Root directory receiving the segmented files; missing
            directories are created.
    """
    for eachDir in os.listdir(inputPath):
        # os.path.join makes a trailing slash on the arguments optional.
        eachPath = os.path.join(inputPath, eachDir)
        if not os.path.isdir(eachPath):
            # Stray files next to the category folders are not categories.
            continue
        each_resultPath = os.path.join(resultPath, eachDir)
        os.makedirs(each_resultPath, exist_ok=True)
        for eachFile in os.listdir(eachPath):
            eachPathFile = os.path.join(eachPath, eachFile)
            if not os.path.isfile(eachPathFile):
                # Nested directories cannot be read as documents.
                continue
            print(eachFile)
            content = readFile(eachPathFile)
            # Text-mode reading already converts "\r\n" to "\n", so "\n" has
            # to be removed as well for line breaks to actually disappear.
            result = content.replace("\r\n", "").replace("\n", "").strip()
            # Default (accurate) mode; tokens are joined with spaces on save.
            cutResult = jieba.cut(result)
            saveFile(os.path.join(each_resultPath, eachFile), " ".join(cutResult))
def bunchSave(inputFile, outputFile):
    """Collect a segmented corpus into a Bunch and pickle it to a file.

    The pickled Bunch has four parallel-list style fields:
    ``target_name`` (category directory names), and, per document,
    ``label`` (its category), ``filenames`` (its path) and ``contents``
    (its stripped text).

    Args:
        inputFile: Root directory of the segmented corpus, containing one
            sub-directory per category.
        outputFile: Path of the pickle file to write.
    """
    # Accept the root with or without a trailing separator while keeping the
    # stored file names identical for roots that already end in one.
    root = inputFile if inputFile.endswith(("/", "\\")) else inputFile + "/"
    # Only sub-directories are categories; stray files in the root would
    # otherwise become bogus labels and make os.listdir fail below.
    catelist = [name for name in os.listdir(root) if os.path.isdir(root + name)]
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    for eachDir in catelist:
        eachPath = root + eachDir + "/"
        for eachFile in os.listdir(eachPath):
            fullName = eachPath + eachFile
            if not os.path.isfile(fullName):
                # Nested directories cannot be read as documents.
                continue
            bunch.label.append(eachDir)
            bunch.filenames.append(fullName)
            bunch.contents.append(readFile(fullName).strip())
    # Pickle streams are binary, so the file must be opened in 'wb' mode.
    with open(outputFile, 'wb') as file_obj:
        pickle.dump(bunch, file_obj)
def readBunch(path):
    """Load and return the object pickled in the file at ``path``.

    NOTE: unpickling executes code embedded in the file, so only load files
    produced by this script.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)
def writeBunch(path, bunchFile):
    """Pickle ``bunchFile`` into the file at ``path``, overwriting it."""
    with open(path, 'wb') as handle:
        pickle.dump(bunchFile, handle)
def getStopWord(inputFile):
    """Return the lines of the stop-word file ``inputFile`` as a list."""
    return readFile(inputFile).splitlines()
def getTFIDFMat(inputPath, stopWordList, outputPath):
    """Build the TF-IDF feature space of the training set and pickle it.

    Args:
        inputPath: Pickle file holding a Bunch with ``target_name``,
            ``label``, ``filenames`` and ``contents`` fields.
        stopWordList: Stop words handed to ``TfidfVectorizer``.
        outputPath: Pickle file receiving the resulting Bunch, which carries
            the input's ``target_name``, ``label`` and ``filenames`` plus
            ``tdm`` (the TF-IDF matrix) and ``vocabulary`` (the fitted
            term-to-column mapping).
    """
    bunch = readBunch(inputPath)
    tfidfspace = Bunch(
        target_name=bunch.target_name,
        label=bunch.label,
        filenames=bunch.filenames,
        tdm=[],
        vocabulary={},
    )
    # The vectorizer both counts terms and applies the TF-IDF weighting, so no
    # separate TfidfTransformer is needed.
    vectorizer = TfidfVectorizer(stop_words=stopWordList, sublinear_tf=True, max_df=0.5)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    # The vocabulary is stored so the test set can be mapped to the same columns.
    tfidfspace.vocabulary = vectorizer.vocabulary_
    writeBunch(outputPath, tfidfspace)
def getTestSpace(testSetPath, trainSpacePath, stopWordList, testSpacePath):
    """Project the test set onto the training vocabulary and pickle the result.

    Args:
        testSetPath: Pickle file holding the test Bunch (``target_name``,
            ``label``, ``filenames`` and ``contents`` fields).
        trainSpacePath: Pickle file holding the training feature-space Bunch;
            only its ``vocabulary`` field is used.
        stopWordList: Stop words handed to ``TfidfVectorizer``.
        testSpacePath: Pickle file receiving the test feature-space Bunch
            (``target_name``, ``label``, ``filenames``, ``tdm``,
            ``vocabulary``).
    """
    bunch = readBunch(testSetPath)
    testSpace = Bunch(
        target_name=bunch.target_name,
        label=bunch.label,
        filenames=bunch.filenames,
        tdm=[],
        vocabulary={},
    )
    trainbunch = readBunch(trainSpacePath)
    # Reusing the training vocabulary keeps the test matrix columns aligned
    # with the training matrix columns.
    vectorizer = TfidfVectorizer(
        stop_words=stopWordList,
        sublinear_tf=True,
        max_df=0.5,
        vocabulary=trainbunch.vocabulary,
    )
    # NOTE(review): fit_transform learns the IDF weights from the test
    # documents rather than reusing the ones learned on the training set;
    # persisting the fitted training vectorizer would avoid that mismatch.
    testSpace.tdm = vectorizer.fit_transform(bunch.contents)
    testSpace.vocabulary = trainbunch.vocabulary
    writeBunch(testSpacePath, testSpace)
def bayesAlgorithm(trainPath, testPath):
    """Train a multinomial naive Bayes model and report its test error rate.

    Prints the shapes of the training and test matrices, one line for every
    misclassified test document, and finally the error rate in percent.

    Args:
        trainPath: Pickle file holding the training feature-space Bunch
            (its ``tdm`` and ``label`` fields are used).
        testPath: Pickle file holding the test feature-space Bunch
            (its ``tdm``, ``label`` and ``filenames`` fields are used).
    """
    trainSet = readBunch(trainPath)
    testSet = readBunch(testPath)

    classifier = MultinomialNB(alpha=0.001)
    classifier.fit(trainSet.tdm, trainSet.label)
    print(shape(trainSet.tdm))
    print(shape(testSet.tdm))

    predicted = classifier.predict(testSet.tdm)
    total = len(predicted)
    misses = 0
    for actual, fileName, guess in zip(testSet.label, testSet.filenames, predicted):
        if actual == guess:
            continue
        misses += 1
        print(fileName, ":实际类别:", actual, "-->预测类别:", guess)
    print("erroe rate:", float(misses) * 100 / float(total), "%")
# Training set. Segment the raw corpus: the first argument is the input directory, the second is where the segmented files are saved
segText("E:/Train_Data/文本分类语料库/","E:/Train_Data/segResult/")
bunchSave("E:/Train_Data/segResult/","E:/Train_Data/train_set.dat")# in: segmented files, out: pickled Bunch of the training documents
stopWordList = getStopWord("E:/Train_Data/各种停用词表/哈工大停用词表.txt")# load the stop-word list
getTFIDFMat("E:/Train_Data/train_set.dat",stopWordList,"E:/Train_Data/tfidfspace.dat")# in: training Bunch, out: TF-IDF feature space
# Test set
segText("E:/Train_Data/test_data/","E:/Train_Data/test_segResult/")# segment the test documents
bunchSave("E:/Train_Data/test_segResult/","E:/Train_Data/test_set.dat")
getTestSpace("E:/Train_Data/test_set.dat","E:/Train_Data/tfidfspace.dat",stopWordList,"E:/Train_Data/testspace.dat")
bayesAlgorithm("E:/Train_Data/tfidfspace.dat","E:/Train_Data/testspace.dat")