Ran into an encoding error that was a real pain; it took me two hours to fix!
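The post never shows the original traceback, so this is an assumption: the error was most likely a UnicodeEncodeError raised when writing characters that GBK cannot represent (for example the U+FFFD replacement character that requests substitutes for undecodable bytes) to a file opened with the platform's default encoding. A hypothetical minimal reproduction:

# Hypothetical reproduction; the sample string and the explicit 'gbk' file
# encoding are assumptions standing in for the platform default on Chinese Windows.
text = '正文\ufffd正文'  # U+FFFD shows up when requests replaces bytes it cannot decode

try:
    with open('repro.txt', 'w', encoding='gbk') as f:
        f.write(text)
except UnicodeEncodeError as e:
    print(e)  # 'gbk' codec can't encode character '\ufffd' ...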
Example site 1:
# -*- coding: utf-8 -*-
import requests, re
from bs4 import BeautifulSoup

content = 'http://www.8shuw.com/BookReader/24-24559.html'  # table-of-contents page
resp = requests.get(content)
resp.encoding = 'gbk'
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('table', {'border': '0', 'class': 'acss'}).find('tbody')
trs = tbody.find_all('a', {'itemprop': 'url', 'href': True})
trs = [tr for tr in trs if re.match(u'^第.*$', tr.text) is not None]  # drop links that are not real chapters
#print('Count:', len(trs))
#print(trs[-1].text, 'href =', trs[-1].get('href'))

with open('novel.txt', 'w') as f:
    for chapter in reversed(trs):
        f.write(chapter.text + '\n')
        resp = requests.get(chapter.get('href'))
        resp.encoding = 'gbk'
        soup = BeautifulSoup(resp.text, 'lxml')
        texts = soup.find('div', {'id': 'readtext', 'class': 'fontm'}).find_all('p')
        print(trs.index(chapter), chapter.text)
        for line in texts:
            # Key fix: encode then decode, passing 'ignore' so undecodable bytes are skipped
            f.write(re.sub(r'CNZZ_SLOT_RENDER\("\d{3,8}"\);', '',
                           line.text.encode('gb18030').decode('gbk', 'ignore')) + '\n')
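To isolate the key line above: GB18030 is a superset of GBK that can encode every Unicode character, so encoding to GB18030 never fails, and decoding those bytes back as GBK with errors='ignore' skips whatever the GBK decoder rejects. The result contains only GBK-representable characters, so it can be written out without an exception. A minimal sketch (the emoji is just a hypothetical stand-in for any character outside GBK):

s = '第一章 正文\U0001F600'  # the emoji has no GBK mapping

try:
    s.encode('gbk')  # raises UnicodeEncodeError
except UnicodeEncodeError as e:
    print('direct GBK encode fails:', e)

# Round trip: GB18030 can encode anything, and errors='ignore' drops the byte
# sequences the GBK decoder rejects (a stray byte may survive as an ASCII
# character, but no exception is raised).
cleaned = s.encode('gb18030').decode('gbk', 'ignore')
cleaned.encode('gbk')  # now succeeds
print(cleaned)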
Example site 2:
import requests, re
from bs4 import BeautifulSoup

content = 'http://www.piaotian.com/html/5/5896/'
resp = requests.get(content)
resp.encoding = 'gbk'
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('div', {'class': 'centent'})
trs = tbody.find_all('a', {'href': True})
trs = [tr for tr in trs if re.match(u'^第.*$', tr.text) is not None]  # keep only real chapter links
print('Count:', len(trs))
print(trs[1980].text, 'href =', trs[1980].get('href'))
#print(re.sub(r'CNZZ_SLOT_RENDER\("\d{3,8}"\);', '', texts[1].text))

with open('novel.txt', 'w') as f:
    for chapter in trs[1980:]:  # resume from index 1980; use reversed(trs) if the contents page lists chapters in reverse order
        print(trs.index(chapter), chapter.text)
        f.write(chapter.text + '\n')  # chapter title
        resp = requests.get(content + chapter.get('href'))
        #resp.encoding = 'gb18030'
        soup = BeautifulSoup(resp.text, 'html.parser')  # note: a different parser from the previous example
        texts = soup.find_all('br')
        #print(soup.get_text())
        for line in texts:
            if len(line.text) > 0:
                f.write(line.text.encode('utf-8').decode('gbk', 'ignore') + '\n')
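For completeness, an alternative that neither example above uses: let requests detect the page encoding via apparent_encoding and give the output file an explicit encoding, which avoids the manual encode/decode round trip altogether. A sketch against the same contents page as example two:

import requests

resp = requests.get('http://www.piaotian.com/html/5/5896/')
# requests guesses the charset from the HTTP headers; apparent_encoding is
# detected from the body and is usually more reliable for these GBK sites.
resp.encoding = resp.apparent_encoding

# An explicit file encoding removes the dependence on the platform default
# (often GBK on Chinese Windows), so no round-trip cleanup is needed.
with open('novel.txt', 'w', encoding='utf-8') as f:
    f.write(resp.text)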