# 有什么奇怪的需求可以在下面留言啊,如果正好我也感兴趣这个奇怪的需求,那就,嘿嘿嘿……
#encoding=utf-8
import urllib
import urllib.request
import http.cookiejar
import re
from collections import deque
import os
import gzip
import threading
from time import sleep
'''
page_num=0;#页数 类的static
myQueue=deque();#展示页队列 类的static
先在外面定义一个登录类,用于登录并创建cookie,这样就不用每次创建线程都走init函数去登录和创建cookie;这个类返回一个opener
再用一个单独的线程类负责下载:每次下载请求前替换登录类传出的opener的header参数
'''
class Login:
    """Log in to the forum once and hand out a cookie-carrying opener.

    The opener returned by :meth:`go` keeps the session cookies in a
    ``CookieJar``, so download workers can reuse it instead of logging in
    again for every thread.
    """

    def __init__(self, username='wyude', password='3692580000000000'):
        """Store the login endpoint, the credentials and the POST headers.

        Args:
            username: Forum account name. Defaults to the account that used
                to be hard-coded in the form data.
            password: Password for ``username``.
        """
        # NOTE(review): the default credentials live in source control;
        # prefer passing them in from configuration or the environment.
        self.__username = username
        self.__password = password
        # Endpoint that receives the login form.
        self.__url_login = 'http://hkbbcc.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1'
        # No Content-Length entry here: urllib computes it from the POST
        # body, while a fixed value in the opener-wide header list could be
        # sent on later body-less requests made through the same opener.
        self.__header_post = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'max-age=0',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'hkbbcc.com',
            'Origin': 'http://hkbbcc.com',
            'Proxy-Connection': 'keep-alive',
            'Referer': 'http://hkbbcc.com/forum.php',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
        }

    def __createPostData(self):
        """Return the URL-encoded login form as bytes."""
        post_data = {
            'fastloginfield': 'username',
            'username': self.__username,
            'password': self.__password,
            'quickforward': 'yes',
            'handlekey': 'ls'
        }
        return urllib.parse.urlencode(post_data).encode()

    def __createOpener(self):
        """Build an opener with cookie handling and the login headers."""
        cookie_jar = http.cookiejar.CookieJar()
        cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
        self.__opener = urllib.request.build_opener(cookie_handler)
        self.__opener.addheaders = list(self.__header_post.items())

    @staticmethod
    def _decode_body(raw, content_encoding=None):
        """Decompress (if needed) and decode a response body to text.

        Args:
            raw: Response body bytes.
            content_encoding: Value of the ``Content-Encoding`` response
                header, or ``None`` when the header is absent.

        Returns:
            The body as ``str``. Bytes that are not valid UTF-8 are replaced
            rather than raising, because the text is only printed.
        """
        # Local import: zlib is not part of this module's import block.
        import zlib

        encoding = (content_encoding or '').strip().lower()
        # The request advertises gzip/deflate, so the server may compress.
        # Also sniff the gzip magic number in case the header is missing.
        if encoding == 'gzip' or raw[:2] == b'\x1f\x8b':
            raw = gzip.decompress(raw)
        elif encoding == 'deflate':
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib wrapper.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
        return raw.decode('utf-8', errors='replace')

    def go(self):
        """Submit the login form and return the logged-in opener.

        The server's reply is printed so the caller can inspect it.

        Returns:
            The opener created for this login, with its cookie jar attached.
        """
        self.__createOpener()
        with self.__opener.open(self.__url_login, self.__createPostData()) as back:
            raw = back.read()
            content_encoding = back.headers.get('Content-Encoding')
        print(self._decode_body(raw, content_encoding))
        # TODO: inspect the response and only continue when login succeeded.
        return self.__opener
class browserTest(threading.Thread):
    """Worker thread that crawls forum list pages and saves post images.

    Every worker shares one logged-in opener. Because request headers are
    set by mutating ``opener.addheaders``, each "set headers, then open"
    pair runs under the class-wide ``myLock``.
    """

    # Class-wide lock guarding the page queue and the shared opener's headers.
    myLock = threading.RLock()
    # Class-level queue; not used by the methods below, which consume the
    # queue passed to the constructor.
    myQueue = deque()

    def __init__(self, opener, q, someone=False, name=None, saveRoot='e:/loadbs/'):
        """Set up URLs, header sets and the regexes used for scraping.

        Args:
            opener: Opener returned by the login step; it carries the cookies.
            q: Deque of page numbers still to be crawled, shared by workers.
            someone: When true, only posts whose author is in ``name`` are
                downloaded; when false, every post is downloaded in order.
            name: Collection of author names to keep when ``someone`` is true.
            saveRoot: Directory prefix under which ``<page>/<title>`` folders
                are created. Defaults to the previously hard-coded location.
        """
        threading.Thread.__init__(self)
        self.__queue = q
        self.__opener = opener
        # Forum home page; handy for checking what a logged-in page looks like.
        self.__url_host = 'http://hkbbcc.com/forum.php'
        self.__header_after = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
        }
        # First list page of the picture board, ordered by posting time.
        self.__urlStart = 'http://hkbbcc.com/forum.php?mod=forumdisplay&fid=18&orderby=dateline&orderby=dateline&filter=author&page=1'
        # An empty tuple keeps the membership test valid when no names are given.
        self.__name = name if name is not None else ()
        self.__someone = someone
        self.__saveRoot = saveRoot
        # Captures (relative post URL, post title) from a list page.
        self.__pageLike = re.compile('<a href=\\"(.+?)\\" onclick=\\"atarget\\(this\\)\\" title=\\"(.+?)\\" class=\\"z\\">')
        # Captures the author name of each post on a list page.
        self.__authLike = re.compile('<em class=\\"sum y xs0 xi1 xw1\\" title=\\".+?\\">.+?</em><a href=\\".+?\\.html\\">(.+?)<')
        # Captures the full-size image URL inside a post.
        self.__picLike = re.compile('<img .+? zoomfile="(.+?)" file=')
        self.__header_pic = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Host': 'img.bipics.net',
            'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
        }

    def createOpener(self, header):
        """Replace the shared opener's default headers with ``header``.

        Args:
            header: Mapping of header name to value.
        """
        self.__opener.addheaders = list(header.items())

    def __postHeader(self, page):
        """Return the header set used when opening a post found on ``page``."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Host': 'hkbbcc.com',
            'Proxy-Connection': 'keep-alive',
            # NOTE(review): this is the page number, not a URL; the list-page
            # URL was probably intended. Kept as-is to avoid changing requests.
            'Referer': page,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36'
        }

    def __openWith(self, header, url):
        """Open ``url`` through the shared opener using ``header``.

        Setting the headers and opening happen under the lock so another
        worker cannot swap the headers in between. The ``with`` statement
        releases the lock even when ``open`` raises.

        Returns:
            The response object; the caller reads and closes it.
        """
        with self.__class__.myLock:
            self.createOpener(header)
            return self.__opener.open(url)

    def run(self):
        """Take page numbers from the queue until it is empty, crawling each."""
        while True:
            # ``with`` releases the lock on ``break`` as well, so a worker
            # that finds the queue empty cannot exit still holding it.
            with self.__class__.myLock:
                if not self.__queue:
                    print("页码访问完毕!")
                    break
                page = self.__queue.popleft()
            self.go(page)

    def go(self, pageNum):
        """Crawl one list page and download images from its matching posts.

        Args:
            pageNum: Page number, substituted for the trailing ``1`` of the
                start URL.
        """
        print("正在爬取 "+str(pageNum)+" 页---------->")
        pageUrl = self.__urlStart[:-1] + str(pageNum)
        try:
            with self.__openWith(self.__header_after, pageUrl) as resp:
                pageData = resp.read().decode()
        except Exception:
            # Best effort: a failed list page is reported and skipped.
            print("打开展示页失败"+str(pageNum))
            return
        # Turn the HTML entity "&amp;" back into "&" so scraped URLs are usable.
        b = pageData.replace('amp;', '')
        getUrl = self.__pageLike.findall(b)    # (url, title) of every post
        getAuth = self.__authLike.findall(b)   # author of every post
        header_bs = self.__postHeader(pageNum)
        for idx, (mypage, dirname) in enumerate(getUrl):
            if self.__someone:
                # The two regexes may match different counts; a post whose
                # author cannot be determined is skipped rather than
                # aborting the rest of the page.
                author = getAuth[idx] if idx < len(getAuth) else None
                if author is None or author not in self.__name:
                    continue
            self.__downloadPost(pageNum, idx + 1, 'http://hkbbcc.com/' + mypage, dirname, header_bs)

    def __downloadPost(self, pageNum, postNo, mypage, dirname, header):
        """Open one post and save every image it references.

        Args:
            pageNum: List page the post was found on (part of the folder path).
            postNo: 1-based position of the post on that page, for messages.
            mypage: Absolute URL of the post.
            dirname: Post title, used as the folder name.
            header: Header set to use for the post request.
        """
        print('+++++'+str(pageNum)+" 页第 "+str(postNo)+" 帖 当前页网址--->"+mypage)
        print('+++++'+"当前帖子主题--->"+dirname)
        folder = self.__saveRoot + str(pageNum) + '/' + dirname
        try:
            os.makedirs(folder, exist_ok=True)
        except (OSError, ValueError):
            # Titles can contain characters that are not valid in a path.
            print("创建文件夹失败!"+dirname)
            return
        try:
            with self.__openWith(header, mypage) as resp:
                cur_page = resp.read()
        except Exception:
            print('打开链接失败'+str(postNo))
            return
        try:
            # The post request asks for gzip, so the body is expected compressed.
            print("正在解压当前网页...............")
            cur_page = gzip.decompress(cur_page).decode()
            print("当前网页解压完毕...............")
        except Exception:
            print("网页解压失败")
            return
        for picUrl in self.__picLike.findall(cur_page):
            self.__savePicture(folder, picUrl)

    def __savePicture(self, folder, picUrl):
        """Download ``picUrl`` and write it into ``folder``.

        The file is only created once the bytes have been received, so a
        failed download does not leave an empty file behind.
        """
        try:
            picName = picUrl.split('/')[-1]
            print("正在保存---》"+picName)
            with self.__openWith(self.__header_pic, picUrl) as picRes:
                picR = picRes.read()
            with open(folder+'/'+picName, 'wb') as f:
                f.write(picR)
        except Exception:
            # Best effort: one bad image must not stop the remaining ones.
            print('保存图片失败')
if __name__ == '__main__':
    # Log in once; the returned opener carries the session cookies and is
    # shared by every download thread, which only swaps its headers.
    appLog = Login()
    app_opener = appLog.go()
    # Page numbers to crawl: 1 through 99 (100 is excluded).
    myQueue = deque(range(1, 100))
    # Only posts by these authors are downloaded.
    name = ['魏晴', '魔幻王']
    threads = []
    # If sharing the queue through the constructor does not work, the
    # fallback is to assign it to browserTest.myQueue instead.
    for j in range(3):
        app = browserTest(app_opener, myQueue, True, name)
        # The daemon attribute replaces the deprecated Thread.setDaemon().
        app.daemon = True
        app.start()
        threads.append(app)
    for t in threads:
        t.join()
    print('Done !')