spider (crawler)_fetching web pages
I. requests
- pip install requests
- import requests
1. get
res = requests.get('http://www.baidu.com')
print(res)                        # Response object
print(res.text)                   # body decoded to str using the detected encoding
print(res.content)                # raw bytes
print(res.content.decode('GBK'))  # bytes decoded as GBK
2. post
import requests

url = 'http://www.heibanke.com/lesson/crawler_ex01/'
formdata = {                      # form fields (sent as form data, not JSON)
    'username': 'test',
    'password': ''
}
for i in range(31):
    formdata['password'] = i
    req2 = requests.post(url, data=formdata)
    # the page shows "您输入的密码错误" ("the password you entered is wrong") on failure
    if '您输入的密码错误' not in req2.text:
        print('the password is:', i)
        break
3. headers
def get_html(url):
    headers = {
        'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    return res.text
4. Using a proxy
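The original notes leave this item empty; below is a minimal sketch of sending a requests call through a proxy. The proxy address is a placeholder taken from the urllib example further down and is unlikely to still be alive.
import requests

# placeholder proxy address; substitute one from your own proxy pool
proxies = {
    'http': 'http://220.168.52.245:55255',
    'https': 'http://220.168.52.245:55255',
}
res = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
print(res.status_code)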
5. Additional notes
In many cases that require login authentication, requests must carry cookies and any tracking fields the site expects.
Some sites also call for a proxy pool and a randomized User-Agent; a sketch of both points follows.
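A minimal sketch, assuming the Cookie value is copied from a logged-in browser session and the User-Agent list is one you maintain yourself; both values below are placeholders.
import random
import requests

USER_AGENTS = [
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
]

session = requests.Session()
session.headers['User-Agent'] = random.choice(USER_AGENTS)
# placeholder cookie string copied from the browser's developer tools
session.headers['Cookie'] = 'sessionid=xxxx'
res = session.get('http://www.example.com/protected')
print(res.status_code)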
II. urllib (ships with Python)
import urllib.request
import urllib.error
import urllib.response
import urllib.parse
1. get
response = urllib.request.urlopen('http://www.baidu.com')
# read() returns the response bytes; decode() converts them to str
print(response.read().decode())
2. post
# urlopen() has a data parameter; passing data turns the request into a POST
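A minimal sketch of a POST with urlopen, reusing the login form from the requests example above; the form must be url-encoded and converted to bytes first.
import urllib.parse
import urllib.request

url = 'http://www.heibanke.com/lesson/crawler_ex01/'
formdata = {'username': 'test', 'password': '1'}
# urlencode() builds "username=test&password=1"; urlopen() expects bytes
data = urllib.parse.urlencode(formdata).encode('utf-8')
response = urllib.request.urlopen(url, data=data)
print(response.read().decode())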
3. headers
headers = {
    'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
req = urllib.request.Request('http://www.baidu.com', headers=headers)
# the Request object is then passed to urlopen()
response = urllib.request.urlopen(req)
print(response.read().decode())
4. Using a proxy
proxyHandler = urllib.request.ProxyHandler({
    'https': 'https://220.168.52.245:55255'
})
opener = urllib.request.build_opener(proxyHandler)
response = opener.open(fullurl='http://www.baidu.com/s?wd=Python')
print(response.read().decode('utf-8'))
spider (crawler)_extracting data
I. re (regular expressions)
1. Regex symbols
. - matches any character (a newline only when re.S is set)
\w - matches a letter, digit, or underscore; in Python 3 it also matches non-ASCII word characters
\s - matches a whitespace character
\d - matches a digit
\b - matches a word boundary (checks whether the position of \b is a word boundary)
^ - matches at the start of the string
$ - matches at the end of the string
\W - matches a character that is not a letter, digit, or underscore
\S - matches a non-whitespace character
\D - matches a non-digit character
\B - matches a position that is not a word boundary
[] - matches any single character listed inside the brackets
*, +, ?, {N}, {M,N}, {M,}, {,N} -- greedy quantifiers (compare the demo after this list)
*?, +?, ??, {N}?, {M,N}?, {M,}?, {,N}? -- non-greedy quantifiers
regex1|regex2 - alternation: matches either alternative
() - grouping
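A small demo of the greedy vs. non-greedy difference on a throwaway string:
import re

s = '<b>first</b><b>second</b>'
print(re.findall(r'<b>(.*)</b>', s))   # greedy: ['first</b><b>second']
print(re.findall(r'<b>(.*?)</b>', s))  # non-greedy: ['first', 'second']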
2. The re module (examples follow this list)
- full-string match: re.fullmatch()
- match at the start of the string: re.match()
- find the first occurrence: re.search()
- find all occurrences: re.findall(), re.finditer()
- split: re.split()
- substitute: re.sub()
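A quick demo of these functions on a made-up string:
import re

s = 'tel: 123-4567, backup tel: 890-1234'
print(re.fullmatch(r'\d{3}-\d{4}', '123-4567'))  # the whole string must match
print(re.match(r'tel', s))                       # only matches at the start of s
print(re.search(r'\d{3}-\d{4}', s))              # first match anywhere in s
print(re.findall(r'\d{3}-\d{4}', s))             # ['123-4567', '890-1234']
print(re.finditer(r'\d{3}-\d{4}', s))            # iterator of match objects
print(re.split(r',\s*', s))                      # ['tel: 123-4567', 'backup tel: 890-1234']
print(re.sub(r'\d', '*', s))                     # every digit replaced with *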
3. Application
# regex for the fields; the order matches names = ("排名", "影片", "演员", "上映时间", "评分"), i.e. rank, title, actors, release date, score
# build the pattern
patterns = re.compile(r'class="board-index board-index-(.*?)">.*?movie-item-info">.*?title="(.*?)" data-act.*?主演:(.*?)\n.*?上映时间:(.*?)</p>.*?integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S)
# run the match
result = patterns.findall(req.text)
- re.S lets . match newline characters
- each (.*?) is a captured group whose content is returned
- ? makes the quantifier non-greedy
II. BeautifulSoup
- pip install beautifulsoup4
- from bs4 import BeautifulSoup
1. Tag selectors
from bs4 import BeautifulSoup
html1 = """
<html><head><title>学习爬虫好开心</title></head>
<body>
<p class="title" name="dromouse"><b>( ̄TT ̄)笔芯</b></p>
<p class="story">喵了个猫
<a href="http://example.com/elsie" class="sister" id="link1">汪汪汪,汪星人</a> and
<a href="http://example.com/lacie" class="sister" id="link2">喵喵喵,喵星人</a>
最后变成一锅高汤</p>
<p class="story">...</p>
"""
# tag-based access
# soup.<tag> returns the first matching tag together with its markup
soup = BeautifulSoup(html1, 'lxml')
print(soup.title)
print(soup.title.string)
# when there are several p tags, only the first one is returned
print(soup.p)
print(soup.p.string)
# soup.<tag>[attr] or soup.<tag>.attrs[attr] returns the attribute value
# both forms give the same result
# print(soup.p['name'])
# print(soup.p['class'])
# print(soup.p.attrs['class'])
# print(soup.p.attrs['name'])
# sibling nodes
# soup.p.next_siblings is a generator over the following siblings
print(soup.p.next_sibling)
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# soup.p.previous_siblings iterates over the preceding siblings
print(soup.p.previous_siblings)
print(list(soup.p.previous_siblings))
# child nodes
print(soup.body)
print(list(soup.body.children))
# get the second p tag (index 1 of the first p's next_siblings)
print(list((list(soup.p.next_siblings)[1].children)))
print(list((list(soup.p.next_siblings)[1].a)))
# parent nodes
print(soup.a.parent)
print(list(soup.a.parents))
# children vs. all descendants
print(list(soup.body.children))
print(list(soup.body.descendants))
# prettify() fills in missing tags and pretty-prints the document
print(soup.prettify())
# get the tag name
print(soup.p.name)
print(soup.a.name)
2. Standard selectors
# standard selectors
'''
find()
find_all()
'''
html = '''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo<>
<li class="element">Bar<>
<li class="element">Jay<>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
# #####################
soup = BeautifulSoup(html, 'lxml')
# look up by tag name
print(soup.find_all('ul'))
print(soup.find_all('li'))
# look up by attribute
print('^-^ '*20)
print(soup.find_all(attrs={'id': 'list-2'}))
print(soup.find_all(attrs={'class': 'list-small'}))
print('^-^ '*20)
print(soup.find_all(id='list-2'))
print(soup.find_all(class_='list-small'))
# look up by text content
print('^-^ by text '*5)
print(soup.find_all(text='Foo'))
3. CSS selectors
# #################################################
# CSS selectors
print(soup.select('ul li'))
print(soup.select('#list-2 li'))
print(soup.select('.list-small li'))
for item in soup.select('.list-small li'):
    print(item.string)
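Tags returned by select() behave like the ones from the tag selectors above, so attributes and text are available too; a short sketch using the same html snippet:
for ul in soup.select('ul'):
    # tag['attr'] / tag.attrs both work on selected tags
    print(ul['id'], ul.attrs['class'])
    print(ul.get_text(strip=True))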
spider (crawler)_storing data
1. text
# assemble str1
str1 = ''
for item in infolist:
    tmp = '排名:{} 影片:{} 演员:{} 上映时间:{} 评分:{} \n'.format(
        item[0], item[1], item[2], item[3], item[4] + item[5])
    print(tmp)
    str1 += tmp
# write to file (utf-8 so the Chinese labels are stored safely)
with open('top100.txt', 'w', encoding='utf-8') as file:
    file.write(str1)
2. excel
import xlwt

mybook = xlwt.Workbook()
mySheet = mybook.add_sheet('xxxxxxxxx')
# header row
names = ("排名", "影片", "演员", "上映时间", "评分")
for i in range(5):
    mySheet.write(0, i, names[i])
# data rows; the last column joins the integer and fraction parts of the score
for index, item in enumerate(infolist):
    for i in range(5):
        data = item[i]
        if i == 4:
            data = item[i] + item[i + 1]
        mySheet.write(index + 1, i, data)
mybook.save('text.xls')
3. sql
import pymysql

def get_mysql_connect():
    """
    Create the database connection.
    :return: pymysql connection object
    """
    conn = pymysql.connect(host='127.0.0.1',
                           port=3307,
                           user='root',
                           password='123456',
                           database='1901spider')
    return conn
def get_cursor(conn):
    """
    Get a cursor from the connection.
    :param conn: pymysql connection
    :return: cursor object
    """
    cursor = conn.cursor()
    return cursor
def execute_sql(cursor, data):
    """
    Insert the scraped records.
    :param cursor: cursor object
    :param data: list of dicts produced during parsing
    :return:
    """
    # parameterized query: the driver handles quoting, which also prevents SQL injection
    sql = ('insert into maoyantop100 '
           '(vactor, vindex, releasetime, vname, vscore, vimg) '
           'values (%s, %s, %s, %s, %s, %s)')
    for item in data:
        cursor.execute(sql, (item['actor'], item['index'], item['time'],
                             item['name'], item['score'], item['logo']))
def close_conn(conn):
    """
    Commit the transaction and close the connection.
    :param conn: pymysql connection
    :return:
    """
    conn.commit()
    conn.close()
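A minimal usage sketch tying the helpers together; the single record below is a placeholder standing in for the list of dicts (actor, index, time, name, score, logo) produced during parsing.
if __name__ == '__main__':
    # placeholder record with the keys execute_sql() expects
    data = [{'actor': 'actor names', 'index': '1', 'time': '1993-01-01',
             'name': 'movie title', 'score': '9.5', 'logo': 'http://example.com/1.jpg'}]
    conn = get_mysql_connect()
    cursor = get_cursor(conn)
    execute_sql(cursor, data)
    close_conn(conn)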