import re
from collections import Counter

import pandas as pd
import requests
from lxml import html
from matplotlib import pyplot as plt
# Global matplotlib settings for the charts below: select the SimHei font for
# sans-serif text (the chart labels are Chinese movie titles and countries)
# and turn off the Unicode minus glyph on axes.
plt.rcParams.update({
    "font.sans-serif": ['SimHei'],
    'axes.unicode_minus': False,
})
_UPCOMING_URL = 'https://movie.douban.com/cinema/later/chongqing/?qq-pf-to=pcqq.group'
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
_PIE_COLORS = ['red', 'purple', 'blue', 'yellow']


def _first(node, path, default=''):
    """Return the first XPath match of *path* under *node*, or *default* if none."""
    matches = node.xpath(path)
    return matches[0] if matches else default


def _parse_want_count(text):
    """Extract the integer from a '<N>人想看' label; return 0 when it has no digits."""
    match = re.search(r'\d+', text)
    return int(match.group()) if match else 0


def _parse_movie(div):
    """Build one movie record (title, date, genre, country, want-count) from its <div>."""
    return {
        'dtitle': _first(div, './div/h3/a/text()').strip(),
        'ddata': _first(div, './div/ul/li[1]/text()'),
        'dtype': _first(div, './div/ul/li[2]/text()'),
        'dc': _first(div, './div/ul/li[3]/text()'),
        'dnum': _parse_want_count(_first(div, './div/ul/li[4]/span/text()')),
    }


def _top_by_popularity(movies, limit=5):
    """Return up to *limit* leading entries of *movies*, re-sorted ascending by 'dnum'."""
    # Slicing (instead of indexing range(limit)) tolerates short lists.
    return sorted(movies[:limit], key=lambda m: m['dnum'])


def _count_by_country(movies):
    """Return parallel (labels, counts) lists of the 'dc' values, in first-seen order."""
    tally = Counter(m['dc'] for m in movies)
    return list(tally.keys()), list(tally.values())


def spider_douban(isbn):
    """Scrape Douban's upcoming-movies page, chart the results and save them as CSV.

    Steps, in order:
      1. Download the page and parse every entry under ``div#showing-soon``.
      2. Sort the movies by "want to see" count, highest first.
      3. Show a horizontal bar chart of the (up to) five most wanted movies.
      4. Write all movies to ``douban.csv`` in the current directory.
      5. Show a pie chart of how many movies each country contributes.

    Args:
        isbn: Unused. The target URL has no placeholder, so the value never
            influenced the request; the parameter is kept only so existing
            callers keep working.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # A timeout prevents the script from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors instead of parsing an
    # error page as if it were the listing.
    resp = requests.get(_UPCOMING_URL, headers=_REQUEST_HEADERS, timeout=10)
    resp.raise_for_status()

    selector = html.fromstring(resp.text)
    div_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('您好,共有{}家'.format(len(div_list)))

    dy_list = [_parse_movie(div) for div in div_list]
    if not dy_list:
        # Nothing was scraped, so there is nothing to plot or store.
        return

    # Most wanted first; the CSV is written in this order as well.
    dy_list.sort(key=lambda m: m['dnum'], reverse=True)

    # Bar chart of the most wanted movies. The subset is re-sorted ascending
    # before being handed to barh.
    top_movies = _top_by_popularity(dy_list)
    plt.barh(
        [m['dtitle'] for m in top_movies],
        [m['dnum'] for m in top_movies],
    )
    plt.show()

    # Persist every scraped movie.
    pd.DataFrame(dy_list).to_csv('douban.csv')

    # Pie chart of movies per country. `explode` must have exactly one entry
    # per wedge, so it is sized from the data rather than hard-coded to four.
    labels, counts = _count_by_country(dy_list)
    plt.pie(
        counts,
        explode=[0] * len(counts),
        shadow=True,
        labels=labels,
        autopct='%1.1f%%',
        colors=_PIE_COLORS,
    )
    plt.legend(loc=2)
    plt.axis('equal')
    plt.show()
if __name__ == "__main__":
    # Guarded so that importing this module does not start a network request,
    # open plot windows, or write douban.csv; running it as a script behaves
    # as before.
    spider_douban('9787115428028')