Target site: http://www.daomubiji.com/
Output: the scraped data is stored in MongoDB.

First, configure settings.py:
COOKIES_ENABLED = True

ITEM_PIPELINES = {
    'novel.pipelines.NovelPipeline': 300,  # path must match the project package name (here: novel)
}

# MongoDB connection settings read by the pipeline
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'novel'
MONGODB_DOCNAME = 'Book'
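
Before running the crawl, it can be worth a quick sanity check that a MongoDB server is actually reachable at these host/port values; a minimal sketch using pymongo:

import pymongo

# Fails fast if no MongoDB server is listening at the configured address
client = pymongo.MongoClient(host='127.0.0.1', port=27017, serverSelectionTimeoutMS=2000)
print(client.server_info()['version'])  # raises ServerSelectionTimeoutError if unreachable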
Item definition (items.py):

import scrapy

class NovelItem(scrapy.Item):
    book_name = scrapy.Field()     # book name, from the book page header
    book_title = scrapy.Field()    # volume title, first token of the chapter link text
    book_desc = scrapy.Field()     # book description
    chapter_num = scrapy.Field()   # chapter number token, e.g. "第一章"
    chapter_name = scrapy.Field()  # chapter title
    chapter_url = scrapy.Field()   # link to the chapter page
Writing the spider:

import scrapy
from scrapy import Request
from novel.items import NovelItem  # assuming the project package is named novel

class NovelSpider(scrapy.Spider):
    name = 'novel'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        # The index page lists every book; follow each link to its book page
        book_urls = response.css('.article-content a::attr(href)').extract()
        for book_url in book_urls:
            print(book_url)
            yield Request(book_url, callback=self.parse_book)

    def parse_book(self, response):
        book_name = response.css('.focusbox-title::text').extract()[0]
        book_desc = response.css('.focusbox-text::text').extract()[0]
        articles = response.css('.excerpt-c3')
        for article in articles:
            item = NovelItem()
            # Link text is space-separated: volume / chapter number / chapter name
            content = article.css('a::text').extract()[0].split(' ')
            chapter_url = article.css('a::attr(href)').extract()[0]
            if len(content) == 4:
                del content[0]  # some links carry an extra leading token; drop it so the three fields line up
            book_title = content[0]
            chapter_num = content[1]
            try:
                chapter_name = content[2]
            except IndexError:
                # Some links have no separate chapter name; fall back to the tail of the number token
                chapter_name = content[1][-3:]
            item['book_name'] = book_name
            item['book_title'] = book_title
            item['book_desc'] = book_desc
            item['chapter_num'] = chapter_num
            item['chapter_name'] = chapter_name
            item['chapter_url'] = chapter_url
            yield item
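
To see what the splitting logic does in isolation, here is a standalone sketch; the sample strings are made up, while the real values come from the '.excerpt-c3 a' link text:

# Hypothetical link texts; the second one exercises the IndexError fallback
for text in ['七星鲁王 第一章 血尸', '沙海 第一章']:
    content = text.split(' ')
    book_title, chapter_num = content[0], content[1]
    try:
        chapter_name = content[2]
    except IndexError:
        chapter_name = content[1][-3:]
    print(book_title, chapter_num, chapter_name)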
NovelPipeline, which writes items to MongoDB (pipelines.py):

import pymongo
from novel.settings import MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME, MONGODB_DOCNAME

class NovelPipeline(object):
    def __init__(self):
        # Open one client per spider run and keep a handle to the target collection
        client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        tdb = client[MONGODB_DBNAME]
        self.post = tdb[MONGODB_DOCNAME]

    def process_item(self, item, spider):
        book_info = dict(item)
        self.post.insert_one(book_info)  # insert() is deprecated in pymongo 3+
        return item
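
Once the crawl finishes, the stored documents can be inspected with a few lines of pymongo; the database and collection names match the settings above:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['novel']['Book']
print(coll.count_documents({}))  # number of chapters stored
print(coll.find_one())           # one sample document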
Run main.py and check the results.
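
The post does not show main.py itself; a minimal version, assuming it sits in the project root next to scrapy.cfg, simply drives the scrapy crawl command:

# main.py - run the spider from an IDE instead of the command line
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'novel'])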