# My first simple crawler: scraping a few chapters of 《盗墓笔记》 from 全书网 (quanshuwang.com)
The code still needs polish: repeatedly appending content to the same txt file will eventually hit the file-size limit and kill the script (one way around this is shown in the sketch after the code listing).
A lot of unnecessary code could also be removed. Static pages are fairly easy to crawl, so keep at it!
Next time, try a dynamic site!
Keep learning!!
The code:
import requests
from bs4 import BeautifulSoup


class Download(object):
    def __init__(self):
        self.target = 'http://www.quanshuwang.com/book/9/9055'
        self.href_list = []
        self.chapter_name = []
        self.num = 0
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate'
        }

    # Fetch the novel's table of contents and collect chapter names and URLs
    def get_url(self):
        self.req = requests.get(url=self.target, headers=self.head)
        self.req.encoding = 'gbk'
        self.html = self.req.text
        self.bf_url = BeautifulSoup(self.html, 'html.parser')
        self.div = self.bf_url.find_all('div', class_='clearfix')
        self.div_a = BeautifulSoup(str(self.div[1]), 'html.parser')
        self.a = self.div_a.find_all('a')
        # Keep only the first five chapters and drop the rest
        self.num = len(self.a[:5])
        for each in self.a[:5]:
            self.chapter_name.append(each.string)
            self.href_list.append(each.get('href'))

    # The URL list is ready; this method downloads and extracts one chapter's text
    def down_novel(self, href):
        self.url = requests.get(url=href, headers=self.head)
        self.url.encoding = 'gbk'
        self.url_text = self.url.text
        self.url_bf = BeautifulSoup(self.url_text, 'html.parser')
        self.url_bf_div = self.url_bf.find_all('div', class_='mainContenr')
        # Replace the runs of non-breaking spaces used as paragraph padding with blank lines
        self.url_bf_div_text = self.url_bf_div[0].text.replace('\xa0' * 8, '\n\n')

    # Append the most recently downloaded chapter to novel.txt
    def write(self):
        with open('novel.txt', 'a', encoding='utf-8') as f:
            f.write('\n')
            f.write(self.url_bf_div_text)
            f.write('\n\n')


if __name__ == '__main__':
    dl = Download()
    dl.get_url()
    for i in dl.href_list:
        dl.down_novel(i)
        dl.write()
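On the appending problem mentioned at the top: since write() always appends to one novel.txt, re-running the script keeps growing the file and duplicates chapters. Below is a minimal sketch of one alternative, writing each chapter to its own file and skipping files that already exist. The save_chapter helper and the chapters/ output directory are my own assumptions, not part of the original script.

import os

# Hypothetical helper (not in the original script): save one chapter to its own file
# under an assumed 'chapters' directory, skipping it if it was already written.
def save_chapter(index, name, text, out_dir='chapters'):
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, '%03d_%s.txt' % (index, name))
    if os.path.exists(path):
        # Already downloaded on a previous run, so don't duplicate it
        return path
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
    return path

With the class above, it could be called inside the download loop, e.g. save_chapter(idx, dl.chapter_name[idx], dl.url_bf_div_text), assuming chapter names are safe to use as file names.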
Author: Kang