1、获取指定标签内容
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch and parse the page. The context manager closes the HTTP response as
# soon as BeautifulSoup has consumed it, instead of leaving the socket open.
with urlopen("http://www.pythonscraping.com/pages/warandpeace.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Collect every <span class="green"> tag on the page.
namelist = bsObj.find_all('span', {'class': 'green'})
for name in namelist:
    # Print only the text inside the tag, without the surrounding markup.
    print(name.get_text())
2、处理子标签
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch and parse the page; the response is closed once parsing is done.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Walk the direct children of the table whose id is "giftList" and print
# each one as-is.
for child in bsObj.find('table', {'id': 'giftList'}).children:
    print(child)
3、处理兄弟标签
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch and parse the page; the response is closed once parsing is done.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Start from the first <tr> of the "giftList" table and print every node
# that follows it at the same level (the first row itself is not included).
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)
# previous_siblings: all siblings before the current node
# next_siblings:     all siblings after the current node
# previous_sibling:  the single sibling immediately before the current node
# next_sibling:      the single sibling immediately after the current node
4、父标签处理
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch and parse the page; the response is closed once parsing is done.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# 1. Find the <img> tag with this exact src value.
gift_image = bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
# 2. Step up to the tag that contains the image, 3. move to the sibling just
#    before that container, and 4. print that sibling's text.
print(gift_image.parent.previous_sibling.get_text())
---------------------------------------------------------------------------
#打印输出
$15.00
---------------------------------------------------------------------------
(1) 选择图片标签src="../img/gifts/img1.jpg";
(2) 选择图片标签的父标签(在示例中是<td> 标签);
(3) 选择<td> 标签的前一个兄弟标签 previous_sibling(在示例中是包含美元价格的<td> 标签);
(4) 选择标签中的文字,“$15.00”。
5、正则表达式
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Fetch and parse the page; the response is closed once parsing is done.
with urlopen("http://www.pythonscraping.com/pages/page3.html") as html:
    bsObj = BeautifulSoup(html, 'lxml')

# Match product image paths such as "../img/gifts/img1.jpg". The dots in
# "../" and ".jpg" are escaped so they match literal dots only; an unescaped
# "." would match any character and accept unintended paths.
gift_image_src = re.compile(r"\.\./img/gifts/img.*\.jpg")
images = bsObj.find_all("img", {"src": gift_image_src})
for image in images:
    print(image["src"])