环境:Python2.7,requests,bs4,lxml,re
获取数据的网址:西刺代理
得到的结果,后续可自行构建代理池或者保存文件:
代码段:
#coding=utf8
import requests
from bs4 import BeautifulSoup
import re
import os.path
# Browser-like User-Agent string sent with the scraping request.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'

# HTTP headers passed to requests when fetching the proxy listing page.
headers = {
    'User-Agent': user_agent,
}
def getListProxies(max_count=10, timeout=10):
    """Scrape proxy addresses from the xicidaili "nn" listing page.

    Args:
        max_count: Maximum number of proxies to return. Defaults to 10,
            the limit that used to be hard-coded.
        timeout: Seconds to wait for the listing page before requests
            gives up. Defaults to 10.

    Returns:
        A list of dicts shaped like
        ``{'http': 'ip:port', 'https': 'ip:port'}``, in page order.
        The proxies are NOT checked for liveness. The list is empty when
        the page contains no usable rows or ``max_count`` is not positive.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched (connection error, timeout, ...).
    """
    # The context manager closes the session's connections when done.
    with requests.session() as session:
        page = session.get("http://www.xicidaili.com/nn", headers=headers,
                           timeout=timeout)
    soup = BeautifulSoup(page.text, 'lxml')

    # The empty second alternative makes this pattern match any class value
    # (including ""), so in effect every <tr> carrying a class attribute is
    # selected. Presumably the header row has no class attribute and is
    # therefore skipped -- TODO confirm against the live page.
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})

    proxyList = []
    for trtag in taglist:
        # Cap on the number of proxies collected.
        if len(proxyList) >= max_count:
            break
        tdlist = trtag.find_all('td')
        # The address is read from the 2nd cell and the port from the 3rd;
        # skip rows that are too short to contain both.
        if len(tdlist) < 3:
            continue
        ip, port = tdlist[1].string, tdlist[2].string
        # .string is None when a cell has no single text child.
        if ip is None or port is None:
            continue
        ip, port = ip.strip(), port.strip()
        if not ip or not port:
            continue
        address = ip + ':' + port
        # NOTE: proxies are not verified here. An earlier version tried each
        # one against http://ip.chinaz.com/getip.aspx, but that check was
        # disabled because the test URL no longer seemed to work.
        proxyList.append({'http': address, 'https': address})
    return proxyList
# Fetch the proxy list once, then show how many entries were collected
# followed by the entries themselves.
res = getListProxies()
# A parenthesised single-argument print gives the same output on Python 2
# and is also valid on Python 3, matching the style of the line below.
print(len(res))
print(res)