爬虫入门7(爬取豆瓣图书top250)

image.png

'''
import requests
from lxml import etree
import csv
# Request headers sent with every page fetch; the User-Agent identifies the
# client as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
}

# One listing URL per page: start offsets 0, 25, ..., 225 (ten pages).
urls = [
    'https://book.douban.com/top250?start={}'.format(offset)
    for offset in range(0, 250, 25)
]

# Output CSV. newline="" leaves line-ending handling to the csv module.
fp = open(r"E:\python_document\豆瓣图书.csv", 'wt', newline="", encoding="utf-8")
writer = csv.writer(fp)

# Header row: title, link, author, publisher, publication date, price,
# rating, comment.
writer.writerow((
    '书名',
    '网址',
    '作者',
    '出版社',
    '出版日期',
    '价格',
    '评分',
    '评论',
))
def _first(values, default="空"):
    """Return the first item of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _split_book_info(book_infos, default="空"):
    """Split a '/'-separated info string into (author, publisher, date, price).

    The author is the first field; publisher, date and price are the last
    three fields, so any extra fields in the middle (presumably translators)
    are ignored. When fewer than three trailing fields exist, the missing
    leading ones are filled with *default* instead of raising IndexError.
    Surrounding whitespace is stripped from every field.
    """
    parts = [part.strip() for part in book_infos.split('/')]
    author = parts[0] or default
    # Right-align the trailing fields so price stays last, as in the
    # original negative indexing.
    tail = parts[1:][-3:]
    tail = [default] * (3 - len(tail)) + tail
    publisher, date, price = tail
    return author, publisher, date, price


def get_informations(url):
    """Fetch one listing page and append one CSV row per book found on it.

    Uses the module-level ``headers`` for the request and the module-level
    ``writer`` for output. Each row holds: name, link, author, publisher,
    publication date, price, rating and the one-line comment. Fields that
    are absent from the page are written as "空".

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx rather than silently writing no rows.
    res.raise_for_status()
    selector = etree.HTML(res.text)
    for info in selector.xpath('//tr[@class="item"]'):
        name = _first(info.xpath('td/div/a/@title'))
        url_0 = _first(info.xpath('td/div/a/@href'))
        book_infos = _first(info.xpath('td/p/text()'), "")
        author, publisher, date, price = _split_book_info(book_infos)
        rate = _first(info.xpath('td/div/span[2]/text()'))
        coment = _first(info.xpath('td/p/span/text()'))
        writer.writerow((name, url_0, author, publisher, date, price, rate, coment))
# Scrape every listing page in order; the finally clause guarantees the CSV
# file is flushed and closed even if a page fails to download or parse.
try:
    for url in urls:
        get_informations(url)
finally:
    fp.close()
'''

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容