写在前面
这么久了,代码还是那么烂。。。。
环境:Windows10
编辑器:Pycharm
用到的库:os requests bs4
URL = www.dbmeinv.com
开始爬取
第一步:获取单页图片地址
网页结构
图片地址保存在 class 为 thumbnails 的 ul 标签内各个 img 标签的 src 属性中,所以我们可以这样写代码:
def get_Imgs(url):
    """Collect the thumbnail image URLs found on one listing page.

    Downloads ``url``, looks up the ``<ul class="thumbnails">`` element and
    appends the ``src`` of every ``<img>`` inside it to the shared
    ``all_url`` list.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        The shared ``all_url`` list on success, or the string ``"error"``
        when the request fails or the page has no thumbnail list.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.113 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Raises requests.HTTPError for 4xx/5xx status codes.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are turned into "error"; programming
        # errors and Ctrl-C are no longer swallowed.
        return "error"

    # Decode with the encoding detected from the body rather than the
    # one declared in the response headers.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    thumbnails = soup.find('ul', {'class': 'thumbnails'})
    if thumbnails is None:
        # Page layout differs from what we expect: nothing to collect.
        return "error"

    # Use a distinct loop name so the ``url`` parameter is not shadowed.
    for img in thumbnails.find_all('img'):
        src = img.get('src')
        if src:
            all_url.append(src)
    return all_url
第二步:抓取多页
URL结构
要抓取多页,只需改变 URL 末尾 pager_offset 参数后面的数字,代码可以这样写:
url = 'https://www.dbmeinv.com/?pager_offset='
# Walk the first ten listing pages by appending the page number to the URL.
# No try/except wrapper is used here: a `return` statement is not valid
# outside a function, and get_Imgs already reports failures through its
# return value.
for i in range(10):
    get_Imgs(url + str(i))
第三步:保存图片
def save_imgs():
    """Download every URL in ``all_url`` into the ``pic`` directory.

    Each image is written to ``pic/<index>.jpg``, where ``<index>`` is the
    position of its URL in ``all_url``. A failed download is skipped so the
    remaining images are still saved.

    Returns:
        ``None`` when every image was saved, or the string ``"error"`` if
        at least one download or file write failed.
    """
    dir_name = 'pic'
    # Build full paths instead of calling os.chdir, which would change the
    # working directory of the whole process (and nest pic/pic on a second
    # call). exist_ok also removes the check-then-create race.
    os.makedirs(dir_name, exist_ok=True)

    failed = False
    for i, url in enumerate(all_url):
        try:
            response = requests.get(url, timeout=10)
            # Do not save an HTML error page under a .jpg name.
            response.raise_for_status()
            # Open the file only after the download succeeded so a failed
            # request does not leave an empty file behind.
            with open(os.path.join(dir_name, str(i) + '.jpg'), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            failed = True
    return "error" if failed else None
放上完整的代码
import os
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: get_Imgs appends image URLs to this list and
# save_imgs iterates over it to download them.
all_url = []
def get_Imgs(url):
    """Collect the thumbnail image URLs found on one listing page.

    Downloads ``url``, looks up the ``<ul class="thumbnails">`` element and
    appends the ``src`` of every ``<img>`` inside it to the module-level
    ``all_url`` list.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        The shared ``all_url`` list on success, or the string ``"error"``
        when the request fails or the page has no thumbnail list.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/60.0.3112.113 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        # Raises requests.HTTPError for 4xx/5xx status codes.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are turned into "error"; programming
        # errors and Ctrl-C are no longer swallowed.
        return "error"

    # Decode with the encoding detected from the body rather than the
    # one declared in the response headers.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')

    thumbnails = soup.find('ul', {'class': 'thumbnails'})
    if thumbnails is None:
        # Page layout differs from what we expect: nothing to collect.
        return "error"

    # Use a distinct loop name so the ``url`` parameter is not shadowed.
    for img in thumbnails.find_all('img'):
        src = img.get('src')
        if src:
            all_url.append(src)
    return all_url
def save_imgs():
    """Download every URL in ``all_url`` into the ``pic`` directory.

    Each image is written to ``pic/<index>.jpg``, where ``<index>`` is the
    position of its URL in ``all_url``. A failed download is skipped so the
    remaining images are still saved.

    Returns:
        ``None`` when every image was saved, or the string ``"error"`` if
        at least one download or file write failed.
    """
    dir_name = 'pic'
    # Build full paths instead of calling os.chdir, which would change the
    # working directory of the whole process (and nest pic/pic on a second
    # call). exist_ok also removes the check-then-create race.
    os.makedirs(dir_name, exist_ok=True)

    failed = False
    for i, url in enumerate(all_url):
        try:
            response = requests.get(url, timeout=10)
            # Do not save an HTML error page under a .jpg name.
            response.raise_for_status()
            # Open the file only after the download succeeded so a failed
            # request does not leave an empty file behind.
            with open(os.path.join(dir_name, str(i) + '.jpg'), 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError):
            failed = True
    return "error" if failed else None
def main(pages=200):
    """Scrape the listing pages, then download every collected image.

    Args:
        pages: Number of listing pages to request; page numbers
            ``0 .. pages - 1`` are appended to the ``pager_offset`` query
            parameter. Defaults to 200.

    Returns:
        ``None`` after the images have been saved, or the string
        ``"error"`` if the crawl loop raised an unexpected exception (in
        which case nothing is downloaded).
    """
    url = 'https://www.dbmeinv.com/?pager_offset='
    try:
        for i in range(pages):
            get_Imgs(url + str(i))
    except Exception as exc:
        # Catch Exception rather than using a bare except so Ctrl-C still
        # stops the program, and say what went wrong instead of failing
        # silently.
        print('crawl aborted: {}'.format(exc))
        return "error"
    save_imgs()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
嘿嘿嘿