python爬虫期末预习

某些需要知道的知识
正则表达式：
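20\d* 匹配以 20 开头、后面跟任意个（含 0 个）数字的内容
http.*?\.html 匹配从 http 开始到最近的 .html 为止的内容（非贪婪），没有分组时输出整条匹配内容
xxx/(.*?)\.xxx 有分组时 findall 只输出括号 (.*?) 内匹配到的内容
\d 数字 \D 非数字 \s 空白（空格、制表符、换页符等） \S 非空白 \w 字母、数字和下划线 \W 非字母数字下划线 . 匹配除换行符外的任意单个字符 * 表示前一项重复任意次（含 0 次） *? 表示非贪婪匹配
pattern1=r'https?://www\.xxx\.com/a/(.*?)\.html' 匹配规则的书写方式（r 前缀表示原始字符串，反斜杠不会被 Python 当作转义）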
a=re.findall(pattern1,s3)
a=re.findall(需要匹配的内容,被匹配的内容)
实例：

import re
# Sample inputs for the findall() demonstrations; s1 is not used by any
# pattern below.
s1 = "This is a work day"
s2 = "Todat is 2019=11=20 to 2009-11-25"
s3 = "https://www.xxx.com/a/rrrrrk1111.html sajhd asfef afwasf w http://www.xxx.com/a/dew23r234k11.html http://www.xxx.com/a/r221.html"

# Raw strings hand the backslashes to the regex engine untouched.
# "s?" makes the "s" optional (http or https) and "\." matches a literal dot.
# With exactly one capture group, findall() returns only the captured text.
pattern1 = r'https?://www\.xxx\.com/a/(.*?)\.html'
# "20" followed by zero or more digits.
pattern2 = r'20\d*'
# No capture group, so findall() returns each whole match; the non-greedy
# ".*?" stops every match at the first ".html" it reaches.
pattern3 = r'http.*?\.html'

a = re.findall(pattern1, s3)
b = re.findall(pattern2, s2)
c = re.findall(pattern3, s3)
print(a)  # ['rrrrrk1111', 'dew23r234k11', 'r221']
print(b)  # ['2019', '20', '2009']
print(c)  # ['https://www.xxx.com/a/rrrrrk1111.html', 'http://www.xxx.com/a/dew23r234k11.html', 'http://www.xxx.com/a/r221.html']

数据库的增删改查 mysql

---------查--------
select * from 表名  #查询表内所有内容
select 列1,列2 from 表名;  #查询指定列的内容
select distinct 列…. From 表名;    #数据去重
select concat(列1,列2) from 表名  #拼接查询结果
select 列… from 表名 where 条件;    条件查询
##条件中比较运算符：( 等于:=  大于:>  大于等于:>=  小于:<  小于等于:<=  不等于:!= 或 <>  )
where 列 like '%0'     模糊查询 以0结尾
where 列 like '%0%'    数据包含 0（% 匹配任意多个字符，_ 匹配任意单个字符）
---------- 插入数据------
insert into 表名(字段1,字段2..) values(值1,值2…);    
insert into 表名 values(值1,值2);   #全表所有字段进行插入
--------改---------
update 表 set 字段=值 where 条件;  #带条件时只修改符合条件的数据，不带条件则修改全表
--------删---------
delete from 表 where 条件;  #带条件时只删除符合条件的数据，不带条件则删除全表数据

实例：

import pymysql
def data():
    """Connect to the local ``library`` MySQL database and return a cursor.

    Returns:
        A cursor created on the freshly opened connection.
    """
    connection = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='123456',
        db='library',
    )
    return connection.cursor()
def chashu():
    """Prompt for a search mode, run the matching book query, print each row.

    Mode "1" searches by title suffix. Mode "2" searches by id substring
    within a price range. Any other input returns without querying.
    User input is passed as bound parameters, never formatted into the SQL.
    """
    num = input("1.书名查找  2.ID查找")
    if num == "1":
        bookname = input("输入书名")
        # NOTE(review): this query reads table `book` while the id query
        # reads `books` — confirm which table actually exists.
        bsql = "select * from book where book_name like %s"
        # The wildcard travels inside the bound value, so the SQL text
        # itself contains no literal "%" for the driver to misread.
        params = ("%" + bookname,)
    elif num == "2":
        bid = input("输入id")
        b1 = input("输入高价")
        b2 = input("输入低价")
        try:
            # BETWEEN needs "low AND high"; the prompts ask for the high
            # price first, so order the two bounds explicitly.
            low, high = sorted((float(b1), float(b2)))
        except ValueError:
            print("价格必须是数字")
            return
        bsql = ("select * from books where book_id like %s "
                "and b_price between %s and %s")
        params = ("%" + bid + "%", low, high)
    else:
        # Unknown menu choice: there is no query to run.
        return

    c1 = data()
    # Keep a handle on the connection: data() only returns the cursor, and
    # the cursor drops its connection reference once it is closed.
    conn = c1.connection
    try:
        c1.execute(bsql, params)
        for i in c1.fetchall():
            print(i)
    finally:
        c1.close()
        conn.close()
if __name__ == '__main__':
    # Menu loop: keep prompting until the user enters "3"; "1" runs a
    # search and every other answer simply prompts again.
    for aa in iter(lambda: input("1.查找 3.退出"), "3"):
        if aa == "1":
            chashu()

循环实例：

x = ["Hello", "world", "aaa"]

# Print the length of each string.
abc = lambda x: len(x)
for length in map(abc, x):
    print(length)

# Print the second character of each string.
abb1 = lambda x: x[1]
for second in map(abb1, x):
    print(second)

# Print the last character of each string.
abb = lambda x: x[-1]
for last in map(abb, x):
    print(last)

# Print how many times "l" occurs in each string.
acb = lambda x: x.count("l")
for occurrences in map(acb, x):
    print(occurrences)



# A tuple cannot be modified in place: copy it into a list, extend the
# list, then build a new tuple from the result.
tuple1 = (1, 2, 3, "a", "b", "c")
a1 = [*tuple1]
a1 += ["abc"]
print(a1)  # [1, 2, 3, 'a', 'b', 'c', 'abc']
print((*a1,))  # (1, 2, 3, 'a', 'b', 'c', 'abc')

多进程实例（multiprocessing.Pool 进程池）：

from multiprocessing import Pool
import  time
import datetime
import  requests
import re

# Image URLs to download, one string per picture.
mis = [
    "http://imga5.5054399.com/upload_pic/2019/10/25/4399_11081341008.jpg",
    "http://imga5.5054399.com/upload_pic/2019/7/1/4399_16544599503.jpg",
    "http://imga2.5054399.com/upload_pic/2019/10/24/4399_17424840649.jpg",
    "http://imga4.5054399.com/upload_pic/2019/11/7/4399_10042503678.jpg",
]
def tupian(mis, save_dir="D:/pytupian"):
    """Download one image URL and save it as ``<save_dir>/<id>.jpg``.

    The id is the text between ``4399_`` and ``.jpg`` in the URL. A URL
    without that pattern is reported and skipped before any request is
    sent, instead of failing after the download.

    Args:
        mis: URL of the image to download.
        save_dir: Directory that receives the file; created when missing.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    import os  # local import: the surrounding file does not import os

    found = re.search(r"4399_(.*)\.jpg", mis)
    if found is None:
        print("skipped, no 4399_<id>.jpg in URL: {0}".format(mis))
        return
    heads = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36"}
    # A timeout keeps a stalled server from blocking the worker forever.
    res = requests.get(mis, headers=heads, timeout=10)
    # Do not store an HTML error page under a .jpg name.
    res.raise_for_status()
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, "{0}.jpg".format(found.group(1)))
    with open(target, "wb") as f:
        f.write(res.content)

# Numbers handed to a(), one call per element.
m = [2, 4, 6]


def a(m):
    """Simulate a slow task: wait two seconds, then print ``m`` squared."""
    time.sleep(2)
    squared = m * m
    print(squared)

if __name__ == '__main__':
    # The guard keeps child processes from re-running this section when
    # they import the module.
    start = datetime.datetime.now()
    # The context manager shuts the two workers down even when a task
    # raises; on the normal path close() + join() still wait for them.
    with Pool(2) as p:
        p.map(a, m)         # results are discarded: a() only prints
        p.map(tupian, mis)
        p.close()
        p.join()
    e = datetime.datetime.now()
    # Elapsed wall-clock time for both batches.
    print(e - start)

1.保存网页到本地
"w"写入 “wb”二进制写入
decode() 方法按 encoding 指定的编码把二进制数据（bytes）解码成字符串（str），默认编码为 utf-8。
伪装成浏览器访问网页：在请求头中设置 User-Agent。

import requests   
# Browser-like User-Agent header sent with both requests.
aa = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/63.0.3239.26 Safari/537.36 "
                    "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
baiduLink = "https://www.baidu.com"  # page to save
pic = "https://www.baidu.com/img/bd_logo1.png"  # image to save

# Text: res.content is raw bytes, so decode it before writing in "w" mode.
res = requests.get(baiduLink, headers=aa, timeout=10)
res.raise_for_status()  # do not save an error page as if it were the site
with open("D:/python/q.html", "w", encoding="utf-8") as f:
    f.write(res.content.decode("utf8"))

# Binary: write the bytes unchanged in "wb" mode. Forward slashes avoid the
# invalid "\p" / "\q" escape sequences of a backslash path.
res = requests.get(pic, headers=aa, timeout=10)
res.raise_for_status()
with open("D:/python/q.png", "wb") as f:
    f.write(res.content)

2，输入贴吧名，获取内容
find 只返回搜索的第一个对象
find_all 返回所有的匹配对象
soup.select 按 CSS 选择器搜索，返回所有匹配内容（列表）
其中搜索格式

搜索标签
print(soup.select('title'))  # select by tag name
#[<title>The Dormouse's story</title>]
搜索类名
print(soup.select('.sister'))  # select by class name
#[<a class="sister" href="http://example.com/elsie" id="link1">
通过ID查找
print(soup.select('#link1'))  # select by id
#[<a class="sister" href="http://example.com/elsie" id="link1">
组合标签查询
print(soup.select("head > title"))  # <title> that is a direct child of <head>
#[<title>The Dormouse's story</title>]
标签内属性查找
print(soup.select('a[href="http://example.com/elsie"]'))  # select by attribute value
#[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]

查找实例

from bs4 import BeautifulSoup
import requests
# Browser-like User-Agent header sent with the request.
aa = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/63.0.3239.26 Safari/537.36 "
                    "Core/1.63.6788.400 QQBrowser/10.3.2816.400"}
tiebaName = input("请输入贴吧名\n")  # forum name typed by the user

# Fetch the page once: the request does not depend on a page index, so
# repeating it would only overwrite the same file with the same page.
url1 = "https://tieba.baidu.com/f"
# Passing the query through params lets requests percent-encode the forum
# name, so characters such as "&", "#" or spaces cannot break the URL.
res = requests.get(url1,
                   params={"ie": "utf-8", "kw": tiebaName, "fr": "search"},
                   headers=aa,
                   timeout=10)
res.raise_for_status()  # do not store an error page as the forum page
fliePath = "D:/python/贴吧.txt"
with open(fliePath, "w", encoding="utf-8") as f:
    f.write(res.content.decode("utf-8"))
# The saved page is parsed from the open file handle; once the soup has
# been built the handle is no longer needed.
with open("D:/python/贴吧.txt", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, 'html.parser')  # 'html.parser' names the parser to use

# find() returns None when nothing matches and Tag.get() returns None for
# a missing attribute, so an unexpected page prints None instead of raising.
a = soup.find('meta', {'name': 'description'})
print(a.get('content') if a is not None else None)  # attribute value
b = soup.find('title')
print(b.string if b is not None else None)  # text inside the tag
c = soup.find_all('meta')  # list-like result, so items can be indexed
print(c[1].get('content') if len(c) > 1 else None)
d = soup.select('meta[name="description"]')
print(d)

# CSS selector: <a> tags that are direct children of the <div> whose id
# (not class) is "nav_menu".
lista = soup.select('div#nav_menu > a')
for item in lista:
    # Link target and link text; .get() yields None for an anchor without
    # an href instead of raising KeyError.
    print(item.get('href'), item.string)

# First <ul> whose class is "post_nav_block"; None when the page has none.
listb = soup.find('ul', {'class': 'post_nav_block'})
print(listb)
if listb is not None:
    # Walk only the direct child tags. Iterating the <ul> itself also
    # yields the text nodes between them, which cannot be searched for tags.
    for item in listb.find_all(True, recursive=False):
        rul = item.find('a')  # first link inside this child, if any
        if rul is None:
            continue
        # href value, title value and link text; a missing attribute
        # prints as None.
        print(rul.get('href'), rul.get('title'), rul.string)

最后编辑于：2019.12.29 18:01:45

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 224,861评论 6赞 522
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 96,263评论 3赞 402
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 172,033评论 0赞 366
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 60,999评论 1赞 300
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 70,000评论 6赞 400
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 53,483评论 1赞 314
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 41,850评论 3赞 428
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 40,827评论 0赞 279
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 47,366评论 1赞 324
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 39,404评论 3赞 346
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 41,525评论 1赞 355
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 37,130评论 5赞 351
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 42,853评论 3赞 338
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 33,293评论 0赞 25
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 34,426评论 1赞 276
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 50,082评论 3赞 381
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 46,590评论 2赞 366

python爬虫期末预习

推荐阅读更多精彩内容