1、items.py
# -*- coding: utf-8 -*-
import scrapy


class LearnscrapyItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
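A scrapy.Item behaves like a dict whose keys are the declared fields, which is how the spider and pipeline below read and write it. A minimal sketch (the values here are made up):

from learnscrapy.items import LearnscrapyItem

# Sketch: an item is filled in like a dict; assigning an undeclared key raises KeyError.
item = LearnscrapyItem()
item['name'] = ['Sohu News']               # hypothetical title list
item['link'] = ['http://news.sohu.com/']   # hypothetical link list
print(dict(item))                          # {'name': [...], 'link': [...]}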
2、settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'learnscrapy'
SPIDER_MODULES = ['learnscrapy.spiders']
NEWSPIDER_MODULE = 'learnscrapy.spiders'
ROBOTSTXT_OBEY = True
COOKIES_ENABLED = False
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'learnscrapy.middlewares.USERAGENT': 1,
}
ITEM_PIPELINES = {
    'learnscrapy.pipelines.LearnscrapyPipeline': 300,
}
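Both middleware entries run: the lower order (1) sits closer to the engine, so the custom USERAGENT middleware sets the header first and the built-in UserAgentMiddleware's setdefault then leaves it alone. An alternative sketch (not part of the original settings) is to disable the built-in middleware outright by mapping it to None:

# Alternative sketch: turn off Scrapy's built-in UserAgentMiddleware entirely,
# leaving only the custom one from middlewares.py (the order value is arbitrary).
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'learnscrapy.middlewares.USERAGENT': 543,
}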
3、middlewares.py
# -*- coding: utf-8 -*-
# Random choice is used to rotate proxies and user agents
import random
# Built-in middleware for HTTP proxy support
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# Built-in middleware for the User-Agent header
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# Proxy IP pool middleware
class HTTPPROXY(HttpProxyMiddleware):
    # Initialize; note the default must be ip=''
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # Pick a random proxy from the pool and attach it to the request
        item = random.choice(IPPOOL)
        try:
            print("Current IP: " + item["ipaddr"])
            request.meta["proxy"] = "http://" + item["ipaddr"]
        except Exception as e:
            print(e)
            pass


# Proxy IP pool
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "1222.94.128.49:8118"}
]


# User-Agent middleware
class USERAGENT(UserAgentMiddleware):
    # Initialize; note the default must be user_agent=''
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random user agent from the pool and set it on the request
        item = random.choice(UPPOOL)
        try:
            print("Current User-Agent: " + item)
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            print(e)
            pass


# User-Agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]
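The settings in step 2 only register the USERAGENT middleware. To actually route requests through IPPOOL, the HTTPPROXY class would also need an entry in DOWNLOADER_MIDDLEWARES; a sketch along the same lines as the existing config:

# Sketch (not in the settings shown above): register the proxy middleware as well.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 2,
    'learnscrapy.middlewares.HTTPPROXY': 1,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 4,
    'learnscrapy.middlewares.USERAGENT': 3,
}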
4、pipelines.py
# -*- coding: utf-8 -*-
import pymysql
import json


class LearnscrapyPipeline(object):
    def __init__(self):
        # Database connection
        self.conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx', database='test',
                                    charset='utf8')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # name and link are parallel lists; insert one row per index
        for j in range(0, len(item["name"])):
            nam = item["name"][j]
            lin = item["link"][j]
            sql = "insert into site(name,link) values(%s,%s)"
            self.cur.execute(sql, (nam, lin))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
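The pipeline assumes the test database already contains a site table with name and link columns. A minimal sketch for creating it with pymysql (column types and sizes are assumptions, adjust to the real schema):

import pymysql

# Sketch: create the `site` table the pipeline inserts into.
conn = pymysql.connect(host='192.168.126.181', user='wx', password='wx',
                       database='test', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS site (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            link VARCHAR(500)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()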
5、spiders/test.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from learnscrapy.items import LearnscrapyItem


class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/']

    rules = (
        Rule(LinkExtractor(allow=('http://news.sohu.com',), allow_domains=('sohu.com',)), callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        i = LearnscrapyItem()
        i['name'] = response.xpath('//div[@class="news"]/p/a/text()').extract()
        i['link'] = response.xpath('//div[@class="news"]/p/a/@href').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
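parse_item returns one item holding two parallel lists, and the pipeline pairs them by index. A variant sketch (not the original) that yields one item per anchor node, so titles and links cannot drift out of alignment:

    # Variant sketch: yield an item per <a> node instead of parallel lists.
    def parse_item(self, response):
        for a in response.xpath('//div[@class="news"]/p/a'):
            i = LearnscrapyItem()
            i['name'] = a.xpath('./text()').extract_first()
            i['link'] = a.xpath('./@href').extract_first()
            yield i

With this variant, process_item in the pipeline would insert a single row per item instead of looping over lists.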
6、main.py
from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'test'])
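main.py only exists so the crawl can be started from an IDE; from a shell in the project root the equivalent command is:

scrapy crawl test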