[Reposted from] https://blog.csdn.net/weixin_43930694/article/details/98334465
Thanks to the original author — please support the original post. If this repost infringes on any rights, contact me and I will take it down. Thank you!
Requirements:
- Loop through and scrape all audience short comments on The Untamed (《陈情令》) from Douban, save them to a text file, and analyze them with the word-cloud visualization library
- Target site: https://movie.douban.com/subject/27195020/comments?start= (the start= offset pages through the comments; see the short sketch right after this list)
- Python libraries involved: requests, lxml, wordcloud, numpy, PIL, and jieba; matplotlib may also need to be installed
- Python 3.7
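
Before the full script, here is a minimal sketch of how the start= offset in the target URL pages through the comments. It assumes (as the script below does) that Douban serves 20 short comments per page, so offsets 0, 20, 40, ..., 180 cover the first 10 pages.

# Minimal sketch: building the 10 page URLs from the start= offset
# (assumption: 20 comments per page, offsets 0, 20, ..., 180)
base_url = "https://movie.douban.com/subject/27195020/comments?start="
page_urls = [base_url + str(offset) for offset in range(0, 200, 20)]
print(page_urls[0])   # ...?start=0   -> page 1
print(page_urls[-1])  # ...?start=180 -> page 10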
The full code is as follows:
#-*- coding:utf-8 -*-
import requests
from lxml import etree
from wordcloud import WordCloud
import PIL.Image as image
import numpy as np
import jieba
# Fetch the HTML source of a page
def getPage(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    response = requests.get(url, headers=headers).text
    return response
# Build the list of all page URLs
def all_page():
    base_url = "https://movie.douban.com/subject/27195020/comments?start="
    # List holding all the page URLs, 10 pages in total
    urllist = []
    for page in range(0, 200, 20):
        allurl = base_url + str(page)
        urllist.append(allurl)
    return urllist
# Parse each page
def parse():
    # List holding the short comments of every page
    all_comment = []
    number = 1
    for url in all_page():
        # Initialize the parser
        html = etree.HTML(getPage(url))
        # Extract the short comments
        comment = html.xpath('//div[@class="comment"]//p/span/text()')
        all_comment.append(comment)
        print('Page ' + str(number) + ' parsed and saved')
        number += 1
    return all_comment
# Save the comments to a txt file
def save_to_txt():
    result = parse()
    for i in range(len(result)):
        with open('陈情令评论集.txt', 'a+', encoding='utf-8') as f:
            # Write each page's comments as one line
            f.write(str(result[i]) + '\n')
            # No explicit f.close() needed: the with statement closes the file
# Segment the scraped text into words
def trans_CN(text):
    word_list = jieba.cut(text)
    # Join the segmented words with spaces so WordCloud can tell them apart
    result = ' '.join(word_list)
    return result
# Build the word cloud
def getWordCloud():
    path_text = "陈情令评论集.txt"
    path_jpg = "timg.jpg"  # put an image with this name in the current directory
    path_font = "/Users/XXXX/project/simsun.ttf"  # a Chinese font file; download one if you don't have it
    text = open(path_text, encoding='utf-8').read()
    # Strip filler words (a stopwords-based alternative is sketched after the main block below)
    text = text.replace("真的", " ")
    text = text.replace("什么", " ")
    text = text.replace("但是", " ")
    text = text.replace("而且", " ")
    text = text.replace("那么", " ")
    text = text.replace("就是", " ")
    text = text.replace("可以", " ")
    text = text.replace("不是", " ")
    text = trans_CN(text)
    mask = np.array(image.open(path_jpg))  # word-cloud mask: prepare this image in advance
    wordcloud = WordCloud(
        background_color='white',
        mask=mask,
        scale=15,
        max_font_size=80,
        font_path=path_font
    ).generate(text)
    wordcloud.to_file('陈情令评论集.jpg')
# Main entry point
if __name__ == '__main__':
    save_to_txt()
    print('All pages saved')
    getWordCloud()
    print('Word cloud generated')
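
As a side note, the chain of replace() calls in getWordCloud() can also be expressed through WordCloud's stopwords parameter. The sketch below is an alternative, not the original author's code; it assumes text has already been segmented by jieba (i.e. trans_CN has been applied), so the filler words stand alone as space-separated tokens that WordCloud can filter out whole.

# Alternative sketch (not the original code): drop the filler words via
# WordCloud's stopwords parameter instead of manual replace() calls.
filler_words = {"真的", "什么", "但是", "而且", "那么", "就是", "可以", "不是"}
wordcloud = WordCloud(
    background_color='white',
    mask=mask,                 # same background-image array as above
    scale=15,
    max_font_size=80,
    font_path=path_font,
    stopwords=filler_words
).generate(text)               # text must already be the jieba-segmented, space-joined string

WordCloud only drops whole tokens listed in stopwords, which is why the text has to be split by jieba first; the original replace() approach works directly on the raw text instead.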
Done. Running the script produces a word-cloud image in which the more frequently a word appears in the comments, the larger it is drawn. The result looks like this:
[Result image: 陈情令评论集.jpg]
Finally, here is the simsun.ttf font file:
Link: https://pan.baidu.com/s/1nTtwgyf7dCuJXUrWcfxs9A  Password: r025