爬虫之简单爬取糗事百科

技术路线：
python+requests+BeautifulSoup+re

代码如下：

import requests
from bs4 import BeautifulSoup
import re

def getHTMLcode(url,data,timeout=10):
    """Download a page and return its decoded text and the encoding used.

    Args:
        url: Address of the page to fetch.
        data: Mapping of HTTP request headers handed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl forever.

    Returns:
        A ``(text, encoding)`` tuple, where ``encoding`` is the encoding
        guessed from the response body. If the request fails, ``(None, None)``
        is returned so that callers can always unpack the result.
    """
    try:
        r = requests.get(url, headers=data, timeout=timeout)
        print(r.status_code)
        # Turn 4xx/5xx answers into an exception handled below.
        r.raise_for_status()
        print(r.apparent_encoding)
        # Decode with the encoding detected from the content rather than the
        # one announced in the HTTP headers.
        r.encoding = r.apparent_encoding
        print(r.encoding)
        return r.text, r.encoding
    except requests.RequestException:
        # Only network/HTTP failures are treated as a failed crawl; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        print('爬取失败')
        return None, None

def parsePage(contain,html):
    """Extract the article records from one page and append them to *contain*.

    Every ``<div class="article">`` element of the page is printed (followed
    by a separator line) and then searched with a regular expression.

    Args:
        contain: List that receives one record per matched article. A record
            is a flat list of the captured strings: author, content and vote
            count of each match found inside the article element.
        html: HTML source of the page. ``None`` or an empty string is ignored.

    Returns:
        None. Results are delivered through *contain*.
    """
    if not html:
        # Nothing to parse, e.g. because the download failed.
        return

    # Capture groups, in order: author name, post content, vote count.
    # Compiled once per page instead of once per article.
    pattern = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)

    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all(name='div', class_='article'):
        print(item)
        print('-------------------------------')
        # findall returns a list of 3-tuples, one tuple per match.
        matches = pattern.findall(str(item))
        if not matches:
            # An element without the expected markup used to produce an empty
            # record, which later crashed the writer; skip it instead.
            continue
        contain.append([field for match in matches for field in match])


def saveArticle(contain,filPath):
    """Append the collected articles to a UTF-8 text file.

    Args:
        contain: Iterable of records; the first three items of a record are
            the author, the content and the vote count (all strings).
            Records with fewer than three items are skipped.
        filPath: Path of the output file. It is opened in append mode, so
            existing content is kept. Nothing is opened when there is nothing
            to write.

    Returns:
        None.
    """
    chunks = []
    for article in contain:
        if len(article) < 3:
            # Incomplete record: indexing it would raise IndexError.
            continue
        author, body, votes = article[:3]
        chunks.append(
            '作者:' + author.strip('\n') + '\n'
            + '内容:\n' + body.strip('\n').replace('<br/>', '') + '\n'
            + '点赞数:' + votes.strip('\n') + '\n\n\n\n'
        )

    if not chunks:
        return

    # Open the file once for the whole batch rather than once per article.
    with open(filPath, 'a', encoding='utf-8') as f:
        f.writelines(chunks)

def spyder(url,data,depth,filPath):
    """Crawl *depth* consecutive pages and save the articles found on them.

    Args:
        url: Base address to which the page number is appended.
        data: Mapping of HTTP request headers used for every request.
        depth: Number of pages to crawl; pages ``1`` to ``depth`` are fetched.
        filPath: Path of the file the articles are appended to.

    Returns:
        None.
    """
    for i in range(depth):
        # Build each address from the unchanged base URL and the loop index.
        # Previously the number was appended to ``url`` itself on every pass
        # (".../21", ".../2121", ...) and was derived from ``depth`` instead
        # of the loop index, so no two valid pages were ever requested.
        page_url = url + str(i + 1)

        result = getHTMLcode(page_url, data)
        if not result:
            # The fetch helper reported a failure; go on with the next page.
            continue
        html, encoding = result
        if html is None:
            continue
        # NOTE(review): pages detected as ISO-8859-2 are skipped, presumably
        # because that detection indicates an unusable response - confirm.
        if encoding == 'ISO-8859-2':
            continue

        contain = []
        parsePage(contain, html)
        saveArticle(contain, filPath)

if __name__=="__main__":
    # Number of pages to crawl.
    depth =10
    # Base address; the crawler appends the page number to it.
    url = 'https://www.qiushibaike.com/hot/page/'
    # Minimal browser-like User-Agent header sent with every request.
    data = {'User-Agent': 'Mozilla/5.0'}
    # The output is plain UTF-8 text, so it gets a .txt extension; a file
    # with plain text content is not a valid .docx document.
    filPath='newarticle.txt'
    spyder(url,data,depth,filPath)

关键点：
正则表达式的编写：
pattern = re.compile(
r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<i class="number">(.*?)</i>', re.S)
说明：
1：(.*?)代表分组。用正则方法 re.findall(pattern, item) 匹配时，每个匹配到的字符串中我们想要的部分会以元组的形式返回；如果一篇文章里有多组匹配的字符串，最终结果是一个以元组为元素的列表。
2：re.S 表示 . 也可以匹配换行符（默认情况下 . 不匹配换行符）。

最后编辑于：2019.07.25 12:59:02