# The code follows below.
#!/usr/bin/python
# -*- coding: utf-8 -*-
#@Author : BigBro
#@DateTime : 2015-09-25 16:14:29
#@Filename : crawler_tieba.py
#@Description : Crawler for Baidu Tieba (One Piece forum images)
import urllib.request
import os,re,socket
from bs4 import BeautifulSoup
def main():
    """Download every .jpg image from a Baidu Tieba thread page.

    Prompts for a thread URL, scrapes ``<img class="BDE_Image">`` tags,
    and saves each image as ``<n>.jpg`` inside a folder named after the
    page title, created under the current working directory.

    Raises:
        urllib.error.URLError: if the thread page itself cannot be fetched.
    """
    r_url = input('输入网址:')  # paste the Tieba thread URL here
    # Close the page response as soon as the HTML is parsed.
    with urllib.request.urlopen(r_url) as page:
        # Explicit parser: relying on bs4's auto-detection is a warning
        # and makes results depend on what happens to be installed.
        soup = BeautifulSoup(page, 'html.parser')

    # Collect image URLs from Baidu's content-image tags.
    # Original pattern lacked the dot before the extension.
    pattern = re.compile(r'http.*?\.jpg')
    jpg_url_list = []
    for tag in soup.select('img[class="BDE_Image"]'):
        match = pattern.search(str(tag))
        if match:
            jpg_url_list.append(match.group())

    # Folder named after the page title; replace characters that are
    # illegal in Windows file names so mkdir cannot fail on them.
    # os.path.join keeps the script portable (original hard-coded '\\').
    pic = re.sub(r'[\\/:*?"<>|]', '_', str(soup.title.string))
    path = os.path.join(os.getcwd(), pic)
    if not os.path.exists(path):
        os.mkdir(path)

    # NOTE(review): image order is assumed to match page order — the
    # original carried the same caveat.
    for counter, url_item in enumerate(jpg_url_list):
        filename = os.path.join(path, '%d.jpg' % counter)
        if os.path.exists(filename):
            print('%d exists,then continue' % counter)
            continue
        print('%d.jpg' % counter)
        try:
            # Download fully BEFORE opening the output file, so a timeout
            # cannot leave an empty file that a re-run would skip as done.
            with urllib.request.urlopen(url_item, timeout=5) as resp:
                img = resp.read()
        except socket.timeout:
            continue  # skip this image; no partial file is left behind
        with open(filename, mode='wb') as out:
            out.write(img)
    print('Downloading is done')


if __name__ == '__main__':
    main()