# bmw/spiders/bmw5.py
# Third-party imports.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Project imports.
from bmw.items import BmwItem
class Bmw5Spider(CrawlSpider):
    """Crawl BMW 5-series photo galleries on car.autohome.com.cn.

    Follows every gallery link under series 159 and yields one
    :class:`BmwItem` per page with the gallery title and the
    full-resolution image URLs.
    """

    name = 'bmw5'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/159.html']

    # Example URLs matched by the rule below:
    # https://car.autohome.com.cn/pic/series/159-10.html#pvareaid=2042222
    # https://car.autohome.com.cn/pic/series/159-51-p2.html
    rules = (
        Rule(
            LinkExtractor(allow="https://car.autohome.com.cn/pic/series/159.+"),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Extract the gallery title and full-size image URLs from one page.

        The thumbnail path segment ``240x180_0_q95_c42`` is swapped for
        ``1024x0_1_q95`` to request the high-resolution rendition of each
        image before the URL is absolutized with ``response.urljoin``.
        """
        title = response.xpath("//div[@class='uibox']/div/text()").get()
        srcs = response.xpath(
            "//div[contains(@class,'uibox-con')]/ul/li//img/@src").getall()
        # NOTE(review): src values may be protocol-relative (//...);
        # urljoin resolves them against the page URL either way.
        urls = [
            response.urljoin(src.replace('240x180_0_q95_c42', "1024x0_1_q95"))
            for src in srcs
        ]
        yield BmwItem(title=title, image_urls=urls)
# bmw/items.py
import scrapy
class BmwItem(scrapy.Item):
    """One photo gallery: its title plus the images to download."""

    # Gallery heading scraped from the page.
    title = scrapy.Field()
    # Full-size image URLs consumed by the images pipeline.
    image_urls = scrapy.Field()
    # Download results populated by ImagesPipeline after fetching.
    images = scrapy.Field()
# bmw/pipelines.py
# Standard library.
import os
from urllib import request

# Third-party imports.
from scrapy.pipelines.images import ImagesPipeline

# Project imports.
from bmw import settings
class BmwPipelines(ImagesPipeline):
    """ImagesPipeline that files each download under a per-title folder.

    ``get_media_requests`` tags every download request with its source
    item so that ``file_path`` can read the gallery title and build a
    ``<IMAGES_STORE>/<title>/<name>`` path.
    """

    def get_media_requests(self, item, info):
        """Attach the originating item to each image download request.

        ``file_path`` later reads ``request.item`` to choose the folder.
        """
        request_objs = super(BmwPipelines, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None):
        """Return the storage path ``<IMAGES_STORE>/<title>/<image name>``.

        The default path from the parent class (``full/<hash>.jpg``) is
        stripped of its ``full/`` prefix and placed in a directory named
        after the gallery title.
        """
        path = super(BmwPipelines, self).file_path(request, response, info)
        title = request.item.get('title')
        images_store = settings.IMAGES_STORE
        title_path = os.path.join(images_store, title)
        # makedirs(exist_ok=True) also creates IMAGES_STORE itself when it
        # does not yet exist, and avoids the exists()/mkdir() race when
        # several concurrent downloads share a brand-new title folder
        # (plain os.mkdir failed in both situations).
        os.makedirs(title_path, exist_ok=True)
        image_name = path.replace("full/", "")
        image_path = os.path.join(title_path, image_name)
        return image_path