selenium和Python3.6实现招聘狗网站自动识别验证码登录、列表页、详情页爬取

  之所以选择selenium实现登录，主要是为了处理验证码。招聘狗网站的验证码图片是拼接出来的，所以我的方法是先通过webdriver对验证码区域截图，再通过打码兔平台获取验证码的点击坐标，从而实现自动登录。列表页和详情页用requests库实现。具体实现过程如下：

招聘狗的验证码如下：

深度截图_选择区域_20180517111246.png

   首先你得注册一个账号，可以跳过企业验证，招聘狗网站是给企业HR使用的，所以一般要求企业验证，这里我们直接跳过企业验证，下面是实现过程，有详细注释：

'''
import json
import os
import random
import re
import sys
import traceback
import time

from PIL import Image
from lxml import html as lxml_html
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains

import requests
import base64
from requests.exceptions import ConnectionError
import http.cookiejar
import logging
from dama2_API import Dama2API

# 随机获取 User-Agent 的第三方库（fake_useragent），可直接用 pip 安装

from fake_useragent import UserAgent
# Shared fake_useragent generator; the crawler reads `ua.firefox` from it to
# swap the session's User-Agent header when the site throttles requests.
ua = UserAgent()

class RTC_zhaopingou(object):
    """Crawler for the qiye.zhaopingou.com recruiting site.

    Login is driven through a Firefox webdriver because the site uses a
    click-the-picture CAPTCHA; the CAPTCHA screenshot is sent to the Dama2
    decoding service, which answers with the coordinates to click.  After a
    successful login the browser cookies are copied into a ``requests``
    session that is then used for the list (search) and detail (resume)
    requests.
    """

    _log = logging.getLogger(__name__)

    HOME_URL = 'http://qiye.zhaopingou.com/'
    SEARCH_API = ('http://qiye.zhaopingou.com/zhaopingou_interface/'
                  'find_warehouse_by_position_new?timestamp=')
    RESUME_API = ('http://qiye.zhaopingou.com/zhaopingou_interface/'
                  'zpg_find_resume_html_details?timestamp=')
    # Number of POST attempts before search()/open_resume() give up.
    MAX_REQUEST_TRIES = 10
    # Number of CAPTCHA decode attempts before login gives up.
    MAX_CAPTCHA_TRIES = 20
    # Responses shorter than this are treated as error/throttling payloads.
    MIN_VALID_BODY_LEN = 2000

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        """Validate the account and prepare the HTTP session and CAPTCHA client.

        Args:
            account: dict with non-empty ``user_id``, ``username`` and
                ``password`` entries.
            debug: stored on the instance; not used by the code in this class.
            visible: stored on the instance; not used by the code in this class.
            last_try: stored on the instance; not used by the code in this class.

        Raises:
            ValueError: if one of the required account entries is missing or
                empty.
        """
        # Explicit validation instead of ``assert`` (asserts vanish under -O).
        for key in ('user_id', 'username', 'password'):
            if not account.get(key):
                raise ValueError(f'account[{key!r}] is required')

        self.account = account
        self.debug = debug
        self.visible = visible
        self.last_try = last_try
        self.driver = None
        # Filled from the ``hrkeepToken`` cookie after a successful login.
        self.token = ''
        self.keyword = ''
        self.current_url = ''

        logging.info('Change webdriver to FireFox')
        # Session used for the list and detail requests.
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin": "http://qiye.zhaopingou.com",
            "Referer": "http://qiye.zhaopingou.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }
        # Requires a Dama2 account; the client code is downloaded from Dama2.
        self.dama2 = Dama2API()

    def login(self):
        """Log in through Firefox and copy the cookies into ``self.session``.

        Returns:
            True when the CAPTCHA was solved and the browser ended up on the
            expected home URL; False otherwise.  The webdriver is always shut
            down before returning.
        """
        log = self._log
        log.info("Processing Login...")

        driver = webdriver.Firefox()
        self.driver = driver
        try:
            driver.set_window_size(1920, 1080)
            driver.implicitly_wait(10)
            driver.get(self.HOME_URL)

            # A city chooser pops up right after the page opens.
            driver.find_element_by_xpath('//div[@class="city-now citys"]').click()

            # Type the credentials one character at a time with random pauses
            # to imitate a human typing.
            self._type_slowly('//input[@placeholder="请输入手机号/邮箱/狗狗号"]',
                              self.account['username'])
            self._type_slowly('//input[@placeholder="请输入密码"]',
                              self.account['password'])

            # The button that opens the CAPTCHA lives inside the first iframe,
            # so the driver has to switch into it before the xpath can match.
            driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
            driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()

            # Give the CAPTCHA time to load, then go back to the main document.
            time.sleep(5)
            driver.switch_to.default_content()

            # Clicking the button adds a second, sibling iframe that holds the
            # CAPTCHA panel itself.
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            captcha_xpath = '//div[@class="lc-panel"]'
            if not self._login_process_captcha(captcha_xpath):
                log.info('login failed due to CAPTCHA, submit_count')
                return False

            driver.switch_to.default_content()
            driver.find_element_by_id('form_login').click()
            time.sleep(3)

            # A successful login is expected to land on the home URL.
            current_url = driver.current_url
            if current_url != self.HOME_URL:
                log.warning('login failed, unexpected url after submit: %s', current_url)
                return False

            log.info('login sucess!!!')
            cookie = {item['name']: item['value'] for item in driver.get_cookies()}
            token = cookie.get('hrkeepToken')
            if token:
                self.token = token
            else:
                log.warning('hrkeepToken cookie not found after login')
            # Hand the browser cookies over to the requests session.
            self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
            # Only cookie names are logged; the values are credentials.
            log.info("get cookie names: {}".format(sorted(cookie)))
            return True
        finally:
            # The browser is not needed after login; quitting here also
            # prevents leaked Firefox processes on the failure paths.
            driver.quit()
            self.driver = None

    def _type_slowly(self, xpath, text):
        """Send ``text`` to the element at ``xpath`` one character at a time."""
        for char in text:
            self.driver.find_element_by_xpath(xpath).send_keys(char)
            time.sleep(random.uniform(0.2, 0.8))

    def _login_process_captcha(self, captcha_xpath):
        """Solve the click CAPTCHA shown in the currently selected iframe.

        The CAPTCHA panel is screenshotted, sent to Dama2, and the returned
        coordinates are clicked.  Success is detected by reading the text of
        the CAPTCHA button inside the first iframe.

        Args:
            captcha_xpath: xpath of the CAPTCHA panel inside the second iframe.

        Returns:
            True if the widget reports success (the driver is then left inside
            the first iframe); False after ``MAX_CAPTCHA_TRIES`` failures.
        """
        log = self._log
        driver = self.driver

        # Directory and file the CAPTCHA screenshot is written to.
        shm_dir = r'/tmp/zhaopingou/'
        os.makedirs(shm_dir, exist_ok=True)
        captcha_img_path = os.path.join(
            shm_dir,
            'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))

        maximum = self.MAX_CAPTCHA_TRIES
        for attempt in range(1, maximum + 1):
            log.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')

            # Re-locate the panel each round: the CAPTCHA is refreshed after a
            # failed attempt.
            captcha_element = driver.find_element_by_xpath(captcha_xpath)
            captcha_element.screenshot(captcha_img_path)

            try:
                # Dama2 takes a CAPTCHA type and an image file and returns the
                # click coordinates.
                _captcha_id, coordinate_list = self.dama2.decode_captcha(
                    captcha_type=6137, file_path=captcha_img_path)
            except Exception as err:
                # Best effort: any decoding failure just costs one attempt.
                err_str = str(err)
                tb = traceback.format_exc()
                log.warning(f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}')
                continue
            log.info(f'coordinate_list:{coordinate_list}')

            # Move the mouse to each returned coordinate and click.
            # NOTE(review): offsets are assumed to be relative to the element's
            # top-left corner (Selenium 3 behaviour) - confirm for newer versions.
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5, 2))

            # Back to the main document, then into the first iframe to read
            # the status text of the CAPTCHA button.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
            text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text
            if '验证成功' in text:
                log.info('验证码验证成功！')
                time.sleep(random.uniform(1, 2))
                return True

            # Failed: return to the second iframe and fetch a new CAPTCHA.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
            log.info('fail,and try it again')
            time.sleep(2)
        return False

    def search(self, keyword, page_to_go):
        """Search resumes by keyword and return one list page as a JSON string.

        Args:
            keyword: non-empty search keyword.
            page_to_go: page selector; it is sent as ``pageSize`` and echoed
                back in the result under ``current_page``.

        Returns:
            The response JSON (with ``current_page`` added) re-serialised with
            ``ensure_ascii=False``, or ``''`` after ``MAX_REQUEST_TRIES``
            unsuccessful attempts.

        Raises:
            ValueError: if ``keyword`` is empty.
            SystemExit: if the session was redirected and re-login fails.
        """
        log = self._log

        if not keyword:
            raise ValueError('keyword must be non-empty')
        self.keyword = keyword
        # POST parameters captured from the browser.
        # NOTE(review): the page index goes into "pageSize" and "pageNo" is a
        # constant "25"; this mirrors the captured request - confirm against
        # the live API before "fixing" what looks like swapped names.
        params = {
            "pageSize": page_to_go,
            "pageNo": "25",
            "keyStr": keyword,
            "companyName": "",
            "schoolName": "",
            "keyStrPostion": "",
            "postionStr": "",
            "startDegrees": "-1",
            "endDegress": "-1",
            "startAge": "0",
            "endAge": "0",
            "gender": "-1",
            "region": "",
            "timeType": "-1",
            "startWorkYear": "-1",
            "endWorkYear": "-1",
            "beginTime": "",
            "endTime": "",
            "isMember": "-1",
            "hopeAdressStr": "",
            "cityId": "-1",
            "updateTime": "",
            "tradeId": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }

        for _ in range(self.MAX_REQUEST_TRIES):
            # A re-login inside this loop replaces the token, so refresh it.
            params["userToken"] = self.token
            # Real endpoint found by packet capture; the suffix is a
            # millisecond timestamp.
            search_url = self.SEARCH_API + str(int(time.time() * 1000))
            log.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            log.debug(f'Open search page. url,keyword: {search_url},{keyword}')

            try:
                res = self.session.post(search_url, data=params)
            except requests.exceptions.ConnectionError:
                log.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            log.info('current url is:{}'.format(res.url))
            if res.url != search_url:
                # Redirected away from the API: the session is no longer valid.
                if self.login():
                    continue
                log.warning("Login failed!")
                sys.exit('login failed')

            body = res.text
            if not body:
                log.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                log.info('Continue Searching...')
                continue

            if len(body) < self.MIN_VALID_BODY_LEN:
                # Abnormally short payload: either "please log in" or throttling.
                if '请您登录后查看简历' in body:
                    self.login()
                    continue
                # Rotate the User-Agent before backing off.
                self.session.headers['User-Agent'] = ua.firefox
                log.info(f'errorcode msg:{body}')
                log.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue

            try:
                resume_list = json.loads(body)
                # Record which page this result belongs to.
                resume_list["current_page"] = page_to_go
            except (ValueError, TypeError):
                log.warning(body)
                log.warning("something wrong!sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            result = json.dumps(resume_list, ensure_ascii=False)
            log.info(f'search_resume_list_info:{result}')
            return result
        return ''

    def open_resume(self, url):
        """Fetch one resume detail page and return it as a JSON string.

        Args:
            url: resume URL; the text between the first and second ``=`` is
                sent as ``resumeHtmlId``.

        Returns:
            The response JSON re-serialised with ``ensure_ascii=False``, or
            ``''`` after ``MAX_REQUEST_TRIES`` unsuccessful attempts.

        Raises:
            IndexError: if ``url`` contains no ``=``.
            SystemExit: if the session was redirected and re-login fails.
        """
        log = self._log

        log.debug(f'Open a resume: request_url: {url}')
        resume_html_id = url.split("=")[1]
        # POST parameters of the detail request, captured from the browser.
        open_resume_data = {
            "resumeHtmlId": resume_html_id,
            "keyStr": "",
            "keyPositionName": "",
            "tradeId": "",
            "postionStr": "",
            "jobId": "0",
            "companyName": "",
            "schoolName": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }

        for _ in range(self.MAX_REQUEST_TRIES):
            # A re-login inside this loop replaces the token, so refresh it.
            open_resume_data["userToken"] = self.token
            # Real detail endpoint found by packet capture.
            openresumeurl = self.RESUME_API + str(int(time.time() * 1000))
            log.info('resume_url:{}'.format(openresumeurl))

            try:
                res = self.session.post(url=openresumeurl, data=open_resume_data)
            except requests.exceptions.ConnectionError:
                log.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            log.info('current url is:{}'.format(res.url))
            if res.url != openresumeurl:
                log.info("cookie is invalid. Login with webdriver")
                if self.login():
                    continue
                log.warning("Login failed!")
                sys.exit('login failed')

            body = res.text
            if not body:
                log.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                continue

            if len(body) < self.MIN_VALID_BODY_LEN:
                log.info(f'errorcode msg:{body}')
                log.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue

            try:
                resp_json = json.loads(body)
            except ValueError:
                log.warning(body)
                log.warning("something wrong! sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            self.current_url = openresumeurl
            log.info(f'Downloaded a resume, len: {len(body):,d}, current_url: {url}')
            return json.dumps(resp_json, ensure_ascii=False)
        return ''

if __name__ == '__main__':
    # Make the crawler's INFO-level progress messages visible on the console.
    logging.basicConfig(level=logging.INFO)

    # The credentials below are placeholders; fill in your own account.
    rtc_zhaopingou = RTC_zhaopingou(
        account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
        debug=False,
        visible=1,
        last_try=False,
    )
    # Searching needs the token and cookies obtained at login, so stop early
    # when the login did not succeed.
    if not rtc_zhaopingou.login():
        sys.exit('login failed')

    keyword_list = ['python', '大数据', '人工智能', 'java']
    for kw in keyword_list:
        for i in range(1, 200):
            search_result = rtc_zhaopingou.search(kw, i)
            print('****************************************************************')

    res = rtc_zhaopingou.open_resume('http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
    print(res)

'''
打码兔平台的代码需要自己下载，放在同级目录后可以跑一下

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 215,012评论 6赞 497
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 91,628评论 3赞 389
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 160,653评论 0赞 350
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 57,485评论 1赞 288
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 66,574评论 6赞 386
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 50,590评论 1赞 293
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 39,596评论 3赞 414
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,340评论 0赞 270
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 44,794评论 1赞 307
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,102评论 2赞 330
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,276评论 1赞 344
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 34,940评论 5赞 339
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,583评论 3赞 322
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,201评论 0赞 21
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,441评论 1赞 268
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,173评论 2赞 366
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,136评论 2赞 352

selenium和Python3.6实现招聘狗网站自动识别验证码登录、列表页、详情页爬取

随机获取useragent的第三方库，直接用pip安装

推荐阅读更多精彩内容