Disclaimer:
1. The spider in this post, and the data it collects, are for crawler-technique research only; do not use them for commercial purposes.
2. Key fields in every URL in this post have been masked.
This post builds on the previous one, switching to a different approach.
(Previous post: //www.greatytc.com/p/1279a13bd1a3)
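The spider imports xxxx_Zhuanli_Item from the project's items.py, which the post doesn't show. A minimal sketch of what that class presumably looks like, with the field names taken from the spider code:

items.py
# -*- coding: utf-8 -*-
import scrapy


class xxxx_Zhuanli_Item(scrapy.Item):
    name = scrapy.Field()             # patent title
    type = scrapy.Field()             # patent category
    number = scrapy.Field()           # application number
    apply_date = scrapy.Field()       # application date
    sq_date = scrapy.Field()          # grant date
    main_inventor = scrapy.Field()    # first inventor
    other_inventors = scrapy.Field()  # remaining inventors

The spider itself: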
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector

from testscrapy01.items import xxxx_Zhuanli_Item


class QibebtZhuanliSpider(scrapy.Spider):
    name = "xxxx_zhuanli"
    allowed_domains = ["xxxx.cas.cn"]
    start_urls = ['http://www.xxxx.cas.cn/kycg/zl/']
    # The paginated URLs could also be listed up front instead:
    # for i in range(1, 15):
    #     start_urls.append("http://www.xxxx.cas.cn/kycg/zl/index_" + str(i) + ".html")

    def parse(self, response):
        # Re-request the first page; dont_filter=True is required because this
        # URL was already fetched as a start URL and the duplicate filter
        # would otherwise drop the request.
        yield scrapy.Request(response.url, callback=self.parse_zhuanli, dont_filter=True)
        # Pages 2-15 follow the index_<n>.html naming scheme.
        for i in range(1, 15):
            url = "http://www.xxxx.cas.cn/kycg/zl/index_" + str(i) + ".html"
            # If the links were scraped from the page as relative hrefs,
            # response.urljoin(href) would turn them into absolute URLs.
            yield scrapy.Request(url, callback=self.parse_zhuanli)
    def parse_zhuanli(self, response):
        hxs = Selector(response)
        # Grab every data row of the patent table.
        zls = hxs.xpath("//tr[@bgcolor='#f2f7f1']")
        for zl in zls:
            # Build a fresh item per row; reusing a single instance across
            # iterations would make every yielded item share the last row's data.
            qitem = xxxx_Zhuanli_Item()
            # Patent title
            name = zl.xpath("td[@height='26']/a[@target='_blank']/text()").extract()[0]
            # Patent category
            type = zl.xpath("td[@align='center']/text()").extract()[0]
            # Application number
            number = zl.xpath("td[@align='center']/text()").extract()[1]
            # Application date
            apply_date = zl.xpath("td[@align='center']/text()").extract()[2]
            # Grant date: some rows don't have one yet, so fall back to "无"
            try:
                sq_date = zl.xpath("td[@align='center']/text()").extract()[3]
            except IndexError:
                sq_date = "无"
            # Inventors: the first inventor is separated from the others by a
            # non-breaking space (\xa0). partition() splits on it and is safe
            # even when the separator is missing.
            inventor = zl.xpath("td[@align='left']/text()").extract()[0]
            main_inventor, _, other_inventors = inventor.partition('\xa0')
qitem["name"] = name
qitem["type"] = type
qitem["number"] = number
qitem["apply_date"] = apply_date
qitem["sq_date"] = sq_date
qitem["main_inventor"] = main_inventor
qitem["other_inventors"] = other_inventors
# print("专利名称 = ", name)
# print("专利类别 = ", type)
# print("申请号 = ", number)
# print("申请日期 = ", apply_date)
# print("授权日期 = ", sq_date)
# print("第一发明人 = ", main_inventor)
# print("其他发明人 = ", other_inventors)
# print("")
yield qitem
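The XPath expressions above encode assumptions about the page markup. As a self-contained check, here is the extraction logic run against a reconstructed row (the markup is inferred from the selectors, not copied from the actual site, and all values are made up):

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

# Hypothetical row markup, reconstructed from the XPath expressions above.
SAMPLE = (
    "<table><tr bgcolor='#f2f7f1'>"
    "<td height='26'><a target='_blank' href='#'>一种催化剂的制备方法</a></td>"
    "<td align='center'>发明专利</td>"
    "<td align='center'>CN2015XXXXXXX.X</td>"
    "<td align='center'>2015-01-01</td>"
    "<td align='center'>2016-06-01</td>"
    "<td align='left'>张三\xa0李四 王五</td>"
    "</tr></table>"
)

row = Selector(text=SAMPLE).xpath("//tr[@bgcolor='#f2f7f1']")[0]
print(row.xpath("td[@height='26']/a[@target='_blank']/text()").extract()[0])  # patent title
print(row.xpath("td[@align='center']/text()").extract())  # [category, number, apply date, grant date]
inventor = row.xpath("td[@align='left']/text()").extract()[0]
print(inventor.partition('\xa0'))  # (first inventor, separator, the others)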
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class Testscrapy01Pipeline(object):
    def process_item(self, item, spider):
        # Pass-through: the spider already fills in every field, so the item
        # continues on to the feed exporter unchanged. (The original version
        # re-assigned each field to itself, which is a no-op, and imported
        # pymongo and the deprecated scrapy.conf without using them.)
        return item
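For the pipeline to run at all, it has to be registered in the project's settings.py:

ITEM_PIPELINES = {
    'testscrapy01.pipelines.Testscrapy01Pipeline': 300,
}

If the pymongo import in the original hinted at MongoDB storage, a minimal sketch could look like the following (the database and collection names are made up here, and a local mongod on the default port is assumed):

import pymongo


class MongoZhuanliPipeline(object):
    def open_spider(self, spider):
        # Connect once per crawl; host and port are assumptions.
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['zhuanli_db']['patents']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Scrapy items convert cleanly to dicts for insertion.
        self.collection.insert_one(dict(item))
        return item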
Export to CSV (the crawl name must match the spider's name attribute, "xxxx_zhuanli"):
scrapy crawl xxxx_zhuanli -o xxxx_zhuanli.csv
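One practical note: with Chinese field values, the exported CSV may look garbled when opened in Excel. Setting the feed export encoding in settings.py fixes this (utf-8-sig writes a BOM that Excel recognizes):

FEED_EXPORT_ENCODING = 'utf-8-sig'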