搞死搞残大众点评,废话不多说,源码参上
import json
import re
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import requests.models
import pandas as pd
from urllib.parse import urlencode
from threading import Thread
# The search keyword and the output location are asked for interactively.
keyword = input('your keyword')
output_filename = input('output csv path') + '.csv'

# JSON endpoint that is POSTed to for the shop listing data.
post_url = 'https://m.dianping.com/isoapi/module'

# Search-result page that is opened in the browser to obtain cookies.
# The keyword is percent-encoded so that spaces, '&', '#' or non-ASCII text
# cannot corrupt the query string.
true_url = 'https://m.dianping.com/shoplist/4/search?' + urlencode(
    {'from': 'm_search', 'keyword': keyword}
)

# Headers sent with every POST to ``post_url``.
# Content-Length is deliberately not listed: a fixed value cannot match
# request bodies whose size varies from page to page.
headers = {
    'Connection': 'keep-alive',
    'Origin': 'https://m.dianping.com',
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
    'Content-Type': 'application/json',
    'Accept': '*/*',
    # Refer back to the search page for the keyword actually being queried.
    'Referer': true_url,
}
def get_cookies():
    """
    Open ``true_url`` in Chrome via Selenium and collect the page's cookies.

    The browser is routed through the proxy at 127.0.0.1:8888 and scrolls
    the page 15 times. If the captcha prompt text is present afterwards,
    the function blocks on ``input`` until the operator presses Enter.

    :return: dict mapping each cookie name to its value
    """
    chrome_options = webdriver.ChromeOptions()
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    chrome_options.add_argument('user-agent="%s"' % ua)
    chrome_options.add_argument('--proxy-server=http://127.0.0.1:8888')
    # ``options=`` is the supported keyword; ``chrome_options=`` is deprecated.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(true_url)
        for _ in range(15):
            driver.execute_script('window.scrollBy(0, 500)')
            time.sleep(0.5)
        if '请输入图片中的内容' in driver.page_source:
            # Captcha page: wait for the operator before reading cookies.
            input('12345')
        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        # Always close the browser, even when navigation or scripting fails.
        driver.quit()
def get_data(dict_cookies):
    """
    Page through the search API and append each shop record to the CSV.

    Up to 51 pages of 20 records are requested from ``post_url`` for the
    module-level ``keyword``. For every record, values that are lists or
    dicts are dropped and the remaining fields are printed and appended as
    one row to ``output_filename``. Only the very first row of the first
    page carries a header line. Paging stops early when a page is empty or
    the response does not have the expected structure.

    :param dict_cookies: cookie name -> value mapping sent with each request
    :return: None
    """
    page_size = 20
    for start in range(0, page_size * 51, page_size):
        payload = {
            "pageEnName": "shopList",
            "moduleInfoList": [
                {
                    "moduleName": "mapiSearch",
                    "query": {
                        "search": {
                            "start": start,
                            "categoryId": 0,
                            "parentCategoryId": 0,
                            "locateCityid": 0,
                            "limit": page_size,
                            "sortId": 0,
                            "cityId": 4,
                            # Query the keyword the user entered, not a fixed one.
                            "keyword": keyword,
                            "regionId": 0,
                            "maptype": 0,
                        }
                    }
                }
            ]
        }
        # NOTE(review): TLS certificate verification is disabled here;
        # confirm that this is still required.
        r = requests.post(
            post_url,
            headers=headers,
            cookies=dict_cookies,
            json=payload,
            verify=False,
            timeout=30,  # never hang forever on a stalled connection
        )
        r.raise_for_status()
        try:
            shops = r.json()['data']['moduleInfoList'][0]['moduleData']['data']['listData']['list']
        except (KeyError, IndexError, TypeError, ValueError):
            print('Unexpected response at start={}; stopping.'.format(start))
            break
        if not shops:
            # No more results: do not keep requesting empty pages.
            break
        for index, shop in enumerate(shops):
            # Keep scalar fields only; nested lists/dicts do not fit a flat row.
            item = {k: v for k, v in shop.items() if not isinstance(v, (list, dict))}
            header = start == 0 and index == 0
            print(item)
            df = pd.DataFrame(data=item, index=['0'])
            df.to_csv(output_filename, mode='a', index=False, header=header, encoding='utf_8_sig')
def read_csv():
    """
    Print the ``shopId`` of every row in the CSV at ``output_filename``.

    Malformed lines in the file are skipped instead of aborting the read.

    :return: None
    :raises KeyError: if the file has no ``shopId`` column
    """
    try:
        df = pd.read_csv(output_filename, on_bad_lines='skip')
    except TypeError:
        # Older pandas releases only accept the previous spelling of the option.
        df = pd.read_csv(output_filename, error_bad_lines=False)
    # Iterate the column directly so IDs keep the column's own dtype.
    for shop_id in df['shopId']:
        print('ShopId: ' + str(shop_id))
if __name__ == '__main__':
    # Obtain browser cookies first; the scraper thread needs them.
    session_cookies = get_cookies()

    writer = Thread(target=get_data, args=(session_cookies,))
    reader = Thread(target=read_csv)

    # Start the scraper, then wait 15 seconds before starting the reader.
    writer.start()
    time.sleep(15)
    reader.start()

    # Wait for both threads to finish.
    for worker in (writer, reader):
        worker.join()
解释下,先使用selenium打开m.dianping.com网站,在关键词已输入的情况下进行翻页,然后获取cookies,再用cookies请求接口数据,保存到output_filename(csv文件)中。需要提供的是关键词与保存路径,输出默认是csv文件。
如果使用正常的selenium手法去访问,必然会出现验证码跟异常操作的问题。参考://www.greatytc.com/p/304f4dfae0bb
使用mitmproxy作为中间代理,selenium通过代理进行访问,代理中对请求进行过滤,过滤掉某些参数就可以防止这样的反爬手段。filter_js.py 屏蔽代码参上
import re
from mitmproxy import ctx
def response(flow):
    """
    Modify the response data.

    For responses whose request URL contains ``/js/yoda.``, every quoted
    occurrence of a known Selenium/WebDriver marker is replaced by
    ``"NO-SUCH-ATTR"``, ``t.webdriver`` is replaced by ``false`` and
    ``ChromeDriver`` is removed. Other responses are left untouched.

    :param flow: object exposing ``request.url`` and a writable
        ``response.text``
    :return: None
    """
    if '/js/yoda.' not in flow.request.url:
        return

    # Mask Selenium detection. Work on a local copy and assign once, rather
    # than re-assigning the response body for every key.
    text = flow.response.text
    for webdriver_key in (
        'webdriver', '__driver_evaluate', '__webdriver_evaluate', '__selenium_evaluate',
        '__fxdriver_evaluate', '__driver_unwrapped', '__webdriver_unwrapped',
        '__selenium_unwrapped', '__fxdriver_unwrapped', '_Selenium_IDE_Recorder', '_selenium',
        'calledSelenium', '_WEBDRIVER_ELEM_CACHE', 'ChromeDriverw', 'driver-evaluate',
        'webdriver-evaluate', 'selenium-evaluate', 'webdriverCommand',
        'webdriver-evaluate-response', '__webdriverFunc', '__webdriver_script_fn',
        '__$webdriverAsyncExecutor', '__lastWatirAlert', '__lastWatirConfirm',
        '__lastWatirPrompt', '$chrome_asyncScriptInfo', '$cdc_asdjflasutopfhvcZLmcfl_',
    ):
        ctx.log.info('Remove "{}" from {}.'.format(webdriver_key, flow.request.url))
        text = text.replace('"{}"'.format(webdriver_key), '"NO-SUCH-ATTR"')
    # These two run after the quoted-key pass so that '"ChromeDriverw"' is
    # matched as a whole before the bare 'ChromeDriver' substring is removed.
    text = text.replace('t.webdriver', 'false')
    text = text.replace('ChromeDriver', '')
    flow.response.text = text
代理命令截图参上
监听8888端口,并通过自定义脚本filter_js屏蔽selenium的检测。在csv文件中拿到了shopId,通过它构建的url可以抓取详情页面数据,这个就比较简单了。