1. Define the target:
This time we're scraping job listings from lagou (www.lagou.com).
2. Find the API endpoint:
Open the page and inspect it with the browser's developer tools to see which request actually loads the listings.
3. Analyze the parameters:
Looking at the request parameters, we find: pn is the page number, kd is the search keyword, and first flags whether this is the first search.
So we end up with an API address: https://www.lagou.com/jobs/positionAjax.json?city=深圳&needAddtionalResult=false
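For orientation, the JSON this endpoint returns looks roughly like the sketch below. This is reconstructed purely from the fields we end up using later in this post; the real response carries many more keys:

{
    "content": {
        "pageSize": 15,
        "positionResult": {
            "totalCount": 280,
            "result": [
                {"positionName": "...", "companyFullName": "...", "salary": "...", "...": "..."}
            ]
        }
    }
}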
4. Write the code:
Time to start coding:
import requests

def get_data(url, data):
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    try:
        # the endpoint expects a POST carrying the form fields from step 3
        r = requests.post(url, data=data, headers=headers)
        r.raise_for_status()
        print(r.url)
        return r.text
    except requests.HTTPError as e:
        print('HTTPError ==>', e)
    except requests.RequestException as e:
        print('RequestException ==>', e)
    except Exception:
        print('Unknown error!')

if __name__ == "__main__":
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    data = {
        'first': 'true',
        'pn': '1',
        'kd': 'python',
    }
    html = get_data(url, data)
    print(html)
Fire it off... and it falls flat! What does it tell us?
{'status': False, 'msg': '您操作太频繁,请稍后再访问', 'clientIp': '218.17.*.*', 'state': 2402}
What?! The msg translates to "you are operating too frequently, please visit again later". We haven't even started crawling and we're already too frequent? That's a bit rich!
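One detail worth noticing before we go further: this block arrives as a normal HTTP 200 response with an error body, so r.raise_for_status() will never trip on it. A minimal sketch of a check keyed to the payload shown above (the helper name is my own, not part of any API):

import json

def is_soft_blocked(response_text):
    # lagou answers HTTP 200 with {'status': False, 'msg': ...} when it rate-limits
    body = json.loads(response_text)
    return body.get('status') is False

If this returns True, the right move is to back off and retry later rather than hammering the endpoint.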
Let's tweak the original code. Could it be a missing Referer? Let's try:
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Referer': 'https://www.lagou.com/jobs/list_ios?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=?&labelWords=hot',
}
Run it again:
Same error message. Is our first crawl really going to end this abruptly?!
Ha... fine, copy every request header from the browser into headers and give it another shot...
Still blocked... Come on, lagou, we're just scraping a little data here...
Out of ideas, time to ask Baidu!
Everything found is from scrapes done half a year ago; nothing recent...
A closer look at the API requests points to the cookies. Some posters say it's a cookie problem: the cookies change on every refresh. Is that really the cause? Let's rework the code and find out:
import requests

def get_data(url, data, cookies):
    # headers is defined at module level in the __main__ block below
    try:
        r = requests.post(url, data=data, headers=headers, cookies=cookies, timeout=3)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.HTTPError as e:
        print('HTTPError ==>', e)
    except requests.RequestException as e:
        print('RequestException ==>', e)
    except Exception:
        print('Unknown error!')

# fetch fresh cookies from the listing page
def get_cookies(url, headers):
    s = requests.Session()
    s.get(url, headers=headers, timeout=3)  # the GET makes lagou set cookies on the session
    return s.cookies

if __name__ == "__main__":
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    url2 = 'https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='
    headers = {
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'https://www.lagou.com/jobs/list_ios?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=?&labelWords=hot',
    }
    data = {
        'first': 'true',
        'pn': '1',
        'kd': 'python',
    }
    cookie = get_cookies(url2, headers)
    html = get_data(url, data, cookie)
    print(html)
Haha... and this time it works! Cue the maniacal laughter (use your imagination).
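A side note on the design: pulling s.cookies out of one session and handing it back in works, but requests.Session already persists cookies across requests on its own. A sketch of the simpler equivalent, assuming the same url, url2, headers, and data as above:

import requests

s = requests.Session()
# the GET to the listing page stores lagou's cookies on the session...
s.get(url2, headers=headers, timeout=3)
# ...and the POST sends them back automatically
r = s.post(url, data=data, headers=headers, timeout=3)
print(r.text)

The final code below keeps the explicit cookie hand-off; it's redundant on top of a Session, but harmless.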
5. Process and save the data:
One last pass: tidy up the data handling, then write the results to a file.
Below is the final code; there's still room to optimize it...
And with that, this first Python web crawler is complete.
import requests
import time
import json
import pandas as pd

def main():
    get_data()

def get_data():
    url_getCookie = "https://www.lagou.com/jobs/list_iOS?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput="
    url_parse = "https://www.lagou.com/jobs/positionAjax.json?city=深圳&needAddtionalResult=false"
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_iOS?labelWords=&fromSearch=true&suginput=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    comments = []
    # CSV header row
    comments.append(['Position', 'Company', 'Salary', 'Company size', 'Industry',
                     'Education', 'Work experience', 'Perks', 'Created at',
                     'Address', 'Longitude', 'Latitude'])
    # text['content']['positionResult']['totalCount'] reports 280 results in total,
    # text['content']['pageSize'] is 15 per page, so 20 pages covers them all
    for x in range(1, 21):
        data = {
            'first': 'true',
            'pn': str(x),
            'kd': 'iOS'
        }
        s = requests.Session()
        # hit the listing page first so lagou issues fresh cookies
        s.get(url_getCookie, headers=headers, timeout=3)
        cookie = s.cookies
        response = s.post(url_parse, data=data, headers=headers, cookies=cookie, timeout=3)
        # sleep 5 seconds between pages to stay under the rate limiter
        time.sleep(5)
        response.encoding = response.apparent_encoding
        text = json.loads(response.text)
        info = text['content']['positionResult']['result']
        for i in info:
            positionName = i['positionName']
            companyFullName = i['companyFullName']
            salary = i['salary']
            companySize = i['companySize']
            industryField = i['industryField']
            education = i['education']
            workYear = i['workYear']
            advantage = i['positionAdvantage']
            createTime = i['createTime']
            print(positionName, companyFullName, createTime)
            # address = city + district + station name, skipping whatever is missing
            if i['stationname'] is None:
                if i['district'] is None:
                    address = i['city']
                else:
                    address = i['city'] + i['district']
            else:
                address = i['city'] + i['district'] + i['stationname']
            longitude = i['longitude']
            latitude = i['latitude']
            comments.append([positionName, companyFullName, salary, companySize,
                             industryField, education, workYear, advantage,
                             createTime, address, longitude, latitude])
        print('-----' * 15)
    writePage(comments)

def writePage(contents):
    dataframe = pd.DataFrame(contents)
    dataframe.to_csv('lagou_comment.csv', encoding='utf_8_sig', mode='a', index=False, sep=',', header=False)

if __name__ == '__main__':
    main()
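Since the code above admits there is still room to optimize, one obvious candidate: a page that comes back soft-blocked mid-run currently raises a KeyError on text['content']. A hedged sketch of a retry wrapper (fetch_page and the retry/backoff numbers are my own invention, not anything lagou defines):

import time
import requests

def fetch_page(session, url, data, headers, retries=3, backoff=10):
    # retry a page a few times, sleeping longer after each soft block
    for attempt in range(retries):
        response = session.post(url, data=data, headers=headers, timeout=3)
        body = response.json()
        if body.get('status') is not False:  # not the soft-block payload
            return body
        time.sleep(backoff * (attempt + 1))
    return None

Another easy win would be writing the CSV once at the end with a real header row instead of appending with header=False.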