# 注意:本教程基于 Python 3.9,不适用于 2.x 版本(代码使用了 f-string 和变量类型注解)。
# 需要用到 requests、BeautifulSoup(bs4)、HTMLSession(requests_html)
# pip3 install requests beautifulsoup4 requests-html
import sys
import requests
from collections import deque
from bs4 import BeautifulSoup
from requests_html import HTMLSession
# Keyword submitted to the site's search form.
searchkey = '唐家三少'
bookList = []
mainUrl = 'http://www.xbiquge.la/'
# One shared requests_html session, used for the book and chapter page fetches.
session = HTMLSession()
# getBook and readBookWeb call each other, so allow deep call chains.
sys.setrecursionlimit(100000)

# Submit the search form and parse the result page.
bookListReq = requests.post(f'{mainUrl}modules/article/waps.php', data={'searchkey': searchkey})
bookListReq.encoding = 'utf-8'
bookListHt = BeautifulSoup(bookListReq.text, 'html.parser')

# Collect every anchor whose markup contains both 'target' and '//'.
# The previous test used `&`, which binds tighter than `>`, so it was evaluated
# as `find('target') > find('//') > -1` and silently depended on the order in
# which the two substrings appear inside the tag.
for aTag in bookListHt.find_all('a'):
    tagHtml = f'{aTag}'
    if 'target' in tagHtml and '//' in tagHtml:
        href = aTag.get('href')
        if href is None:
            # An anchor without href cannot be fetched later; skip it.
            continue
        bookList.append({'href': href, 'text': aTag.text})

# readBookWeb consumes the results from the left, one book per call.
bookList = deque(bookList)
# Total number of books found and index of the book currently being
# downloaded; both appear in the progress line printed by getBook.
length: int = len(bookList)
idx: int = 0
# Fetch the text of one chapter page.
class GetTxt:
    """One chapter page fetched through the module-level ``session``.

    Instance attributes set by ``__init__``:
        url: the address that was requested.
        response: the object returned by ``session.get(url)``.
        title: text of the first ``h1`` matched under ``div.bookname``.
        text: text of the first ``div#content`` element matched.
    """

    # Class-level defaults; each instance overwrites them in __init__.
    response = None
    text = None
    title = None
    url = None

    def __init__(self, url):
        """Download ``url`` and extract the chapter title and body text.

        Indexing ``[0]`` raises ``IndexError`` if a selector matches nothing.
        """
        # Previously the declared ``url`` attribute was never populated.
        self.url = url
        self.response = session.get(url)
        page = self.response.html
        self.title = page.find('body > #wrapper > div.content_read > div > div.bookname > h1')[0].text
        self.text = page.find('body > #wrapper > div.content_read > div > div#content')[0].text

    def getNextUrl(self):
        """Return the ``links`` of the navigation anchor below the chapter title.

        The anchor is the ``a`` element that is the fourth child of
        ``div.bottem1``; callers iterate over the returned links.
        Raises ``IndexError`` if the selector matches nothing.
        """
        nav_anchors = self.response.html.find(
            'body > #wrapper > div.content_read > div > div.bookname > div.bottem1 > a:nth-child(4)')
        return nav_anchors[0].links
def getBook(url, book_name):
    """Append every chapter of one book to ``<book_name>.txt``, then move on.

    Starting at ``url``, each chapter page is fetched with ``GetTxt``, a
    progress line is printed, and the chapter title and text are appended to
    the output file.  The page's next-link is then followed.  Chapters are
    walked in a loop rather than by one recursive call per chapter, so the
    call depth no longer grows with the number of chapters.

    Args:
        url: Address of the first chapter page to fetch.
        book_name: Book title; used for the output file name and the
            progress line.
    """
    next_url = url
    book_finished = False
    while next_url is not None:
        chapter = GetTxt(next_url)
        print(f'{idx}/{length} {book_name} 正在写入章节 -----> ' + chapter.title)
        # Re-open in append mode per chapter so finished chapters are on disk
        # even if a later fetch fails.
        with open(f'{book_name}.txt', "a", encoding='utf-8') as f:
            f.write('\r' + chapter.title + '\r' + chapter.text)

        next_url = None
        for link in chapter.getNextUrl():
            if len(link.split('/')) >= 3:
                # Three or more '/'-separated parts: treated as another chapter page.
                next_url = f'{mainUrl}{link}'
            else:
                # Anything shorter is treated as the end of this book.
                book_finished = True

    if book_finished:
        # Continue with the next book from the search results.
        readBookWeb()
# Read a book's detail page and start downloading it.
def readBookWeb():
    """Take the next book from ``bookList`` and download it from its first chapter.

    Increments the global ``idx``, fetches the book's detail page, reads the
    ``href`` of the first ``a`` inside the first ``dd`` element, truncates
    ``<book name>.txt`` and hands over to ``getBook``.  Returns without doing
    anything when ``bookList`` is empty.
    """
    global idx
    if not bookList:
        # All search results are processed (or the search found nothing);
        # popleft() on an empty deque would raise IndexError.
        return
    idx = idx + 1
    book = bookList.popleft()
    temp_book_name = book['text']
    temp_book_href = book['href']

    response = session.get(temp_book_href)
    response.encoding = 'utf-8'
    book_detail_html = BeautifulSoup(response.text, 'html.parser')
    first_chapter_href = book_detail_html.find('dd').find('a')['href']

    # Create/empty the output file; getBook only ever appends to it.
    with open(f'{temp_book_name}.txt', 'w'):
        pass
    getBook(f'{mainUrl}{first_chapter_href}', temp_book_name)
# Entry point: start with the first book in the search results; getBook calls
# readBookWeb again when it reaches the end of a book.
readBookWeb()