# Regular expressions: extract the parts of a string that match a given pattern
import re
str1 = 'ghmjfdhngbrghmgjhngbfrthnhfgbfv'
list1 = re.findall('gj(.*?)fr', str1)  # the return value is a list
print(list1)
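# prints ['hngb'], the text captured between the first 'gj' and the following 'fr'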
str2 = '''hellobtevrqtyjumuynthbg
mybrtymuntbrgev
juntybrwtvewbynutybworld'''
list1 = re.findall('hello(.*?)world', str2, re.S)  # re.S lets '.' match newlines, so the match can span lines
print(list1)
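# For comparison (not in the original notes): without re.S, '.' does not match newlines,
# so the same pattern cannot bridge the line breaks and finds nothing.
print(re.findall('hello(.*?)world', str2))  # prints []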
# Scrape a novel from quannovel.com (全书网)
import os
import re
import requests
url = 'http://www.quannovel.com/read/620/'  # the novel's table-of-contents page to scrape
req = requests.get(url)  # fetch the page content (a plain page read needs GET, not POST)
# The HTML in these patterns was garbled in the notes; the tags below are a guess at the page markup.
book_name = re.findall('<h1>(.*?)</h1>', req.text)[0]  # book title, used as the folder name (pattern is a guess)
title_list = re.findall(r'<a href=".*?\.html">(.*?)</a>', req.text)  # chapter titles (pattern is a guess)
url_list = re.findall(r'<a href="(.*?)\.html', req.text)  # chapter URL fragments, without the .html suffix
dict1 = {}
for i in range(len(title_list)):
    dict1[title_list[i]] = f'{url}{url_list[i]}.html'  # map each chapter title to its full URL
if not os.path.exists(f'D:/{book_name}'):  # create a folder named after the book if it does not exist
    os.mkdir(f'D:/{book_name}')
count = 1
for k, v in dict1.items():
    if count > 5:  # only scrape the first five chapters as a demo
        break
    else:
        req = requests.get(v)  # fetch the chapter page
        text = re.findall('class="page-content ">(.*?)<div class', req.text, re.S)[0]  # extract the chapter body
        # the two strings stripped here were lost from the notes; '<br/>' and '&nbsp;' are a guess at the markup being removed
        text = text.replace('<br/>', '\n').replace('&nbsp;', '')
        with open(f'D:/{book_name}/{k}.txt', 'w', encoding='utf-8') as file1:
            file1.write(text)
        print(f'Chapter {count} scraped')
        count += 1
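# Note (not part of the original notes): if the saved chapters come out garbled, the site
# is probably not serving UTF-8; requests can be told to re-detect the charset before
# reading .text, for example:
#     req = requests.get(v)
#     req.encoding = req.apparent_encoding  # let requests re-guess the page encoding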