Week 1
Lesson 1 practice project: build your own web page
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Homework</title>
    <link rel="stylesheet" type="text/css" href="homework.css">
</head>
<body>
    <div class="header">
        <img src="images/blah.png">
        <ul class="nav">
            <li><a href="#">Home</a></li>
            <li><a href="#">Site</a></li>
            <li><a href="#">Other</a></li>
        </ul>
    </div>
    <div class="main-content">
        <h2>The blah</h2>
        <hr>
        <ul class="photos">
            <li><img src="images/0001.jpg" width="150" height="150"></li>
            <li><img src="images/0003.jpg" width="150" height="150"></li>
            <li><img src="images/0004.jpg" width="150" height="150"></li>
        </ul>
        <p>
            Wa! Wa! Wa! Wa! Wa!
        </p>
        <p>Wow!</p>
    </div>
    <div class="footer">
        <p>©MUGGLECODING</p>
    </div>
</body>
</html>
Notes: well-written CSS simply saves effort, and HTML and CSS reinforce each other's structure... I already had a rough grasp of both, so I won't dig deeper here.
Lesson 2 practice project: scraping product information
from bs4 import BeautifulSoup

with open('/Users/arischow/Downloads/Plan-for-combating-master/week1/1_2/1_2answer_of_homework/1_2_homework_required/index.html', 'r', encoding='utf-8') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

titles = soup.select('div.caption > h4 > a')
prices = soup.select('div.caption > h4.pull-right')
descs = soup.select('div.caption > p')
reviews = soup.select('div.ratings > p.pull-right')
rates = soup.select('div.ratings > p:nth-of-type(2)')

for title, price, desc, review, rate in zip(titles, prices, descs, reviews, rates):
    data = {
        'title': title.get_text(),
        'price': price.get_text(),
        'desc': desc.get_text(),
        'review': review.get_text(),
        # each star is a <span class="glyphicon glyphicon-star">, so counting matches gives the rating
        'rate': len(rate.find_all('span', 'glyphicon glyphicon-star'))
    }
    print(data)
Notes: checking the docs, find_all returns a list, so its len() tells you how many stars there are...
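To make that concrete, here is a minimal, self-contained sketch of the idea; the HTML snippet is invented, only mirroring the Bootstrap-style rating markup the exercise page uses:

from bs4 import BeautifulSoup

# invented snippet mimicking the exercise page's rating block
html = '''
<p class="ratings">
    <span class="glyphicon glyphicon-star"></span>
    <span class="glyphicon glyphicon-star"></span>
    <span class="glyphicon glyphicon-star"></span>
</p>
'''
soup = BeautifulSoup(html, 'lxml')
# find_all returns a list of matching tags, so len() is the star count
print(len(soup.find_all('span', 'glyphicon glyphicon-star')))  # 3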
Lesson 3 practice project: scraping rental listings
from bs4 import BeautifulSoup
import requests
import re

# 13 search-result pages, roughly 300 listings in total
searchPages = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 14)]
homePages = []

def get300():
    # collect the detail-page URL of every listing on the search pages
    for url in searchPages:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('ul > li')
        for link in links:
            anchors = link.find_all(href=re.compile('fangzi'))
            for anchor in anchors:
                homePages.append(anchor.get('href'))
    return homePages

def get300info():
    # visit each detail page and pull out the listing info
    for url in homePages:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
        addrs = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
        firstImgs = soup.select('#curBigImage')
        prices = soup.select('#pricePart > div.day_l')
        ownerImgs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
        ownerNames = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
        genders = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic')
        for title, addr, firstImg, price, ownerImg, ownerName, gender in zip(titles, addrs, firstImgs, prices, ownerImgs, ownerNames, genders):
            data = {
                'title': title.get_text(),
                'addr': addr.get_text().strip(),
                'firstImg': firstImg.get('src'),
                'price': price.get_text(),
                'ownerImg': ownerImg.get('src'),
                'ownerName': ownerName.get_text(),
            }
            # female hosts get class member_ico1, male hosts member_ico
            if gender.find_all(class_='member_ico1'):
                data['gender'] = 'female'
            else:
                data['gender'] = 'male'
            print(data)

get300()
get300info()
Notes: this one was also written a long time ago. Looking back, it could just run top to bottom; there was no need to wrap things in def at all. The class carried by div.member_pic differs with the host's gender, so a simple if check settles it.
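A small sketch of that if check, run against a made-up host block (only the class names member_ico1 / member_ico mirror the real page; the surrounding markup is invented):

from bs4 import BeautifulSoup

# invented host block; member_ico1 marks a female host on the real page
html = '<div class="member_pic"><span class="member_ico1"></span></div>'
member_pic = BeautifulSoup(html, 'lxml').select_one('div.member_pic')
# a non-empty find_all result is truthy, so comparing against [] is unnecessary
gender = 'female' if member_pic.find_all(class_='member_ico1') else 'male'
print(gender)  # female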
Week 1 hands-on assignment: scraping one page of product data
from bs4 import BeautifulSoup
import requests

url = 'http://bj.58.com/pbdn/0/'
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
}
wb_data = requests.get(url, headers=header)
soup = BeautifulSoup(wb_data.text, 'lxml')
titles = soup.select('tr > td.t > a.t')  # promoted / Zhuanzhuan items not filtered out yet
filters = []
for title in titles:
    if title.get('data-addtype') is None and title.get('onclick') is None:
        filters.append(title)
titles = filters[:]  # filtering done; copy the list

itemlinks = []
counter_base = 'http://jst1.58.com/counter?infoid='
for title in titles:
    # cut the tracking query string off the listing URL
    itemlinks.append(title.get('href')[:title.get('href').find('?psid')])

for link in itemlinks:
    item_url = link
    item_url_digits_only = item_url[item_url.rfind('/') + 1 : -7]  # infoid, needed by the counter
    counter_url = counter_base + item_url_digits_only
    counter_header = {'Referer': item_url}
    item_data = requests.get(item_url, headers=header)
    item_soup = BeautifulSoup(item_data.text, 'lxml')
    # scrape the fields
    item_categories = item_soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    item_titles = item_soup.select('div.per_ad_left > div.col_sub.mainTitle > h1')
    item_published_dates = item_soup.select('ul.mtit_con_left.fl > li.time')
    item_prices = item_soup.select('div.col_sub.sumary > ul > li > div.su_con > span.price.c_f50')
    item_purities = item_soup.select('div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
    item_zones = item_soup.select('div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')
    item_views = str(requests.get(counter_url, headers=counter_header).content)  # cast bytes to str for string handling
    item_views = item_views[item_views.find('total=') + 6 : -1]  # slice out the view count
    for c, t, pd, pr, pu, z in zip(item_categories, item_titles, item_published_dates, item_prices, item_purities, item_zones):
        zones = list(z.stripped_strings)  # a quick print shows the '-' separator lands in this list too
        zones = [x for x in zones if x != '-']  # keep everything except the '-' entries
        data = {
            'Category': c.get_text(),
            'Title': t.get_text(),
            'Published Date': pd.get_text(),
            'Price': pr.get_text(),
            'Purity': pu.get_text().strip(),
            'Zone': zones,
            'Views': item_views,
            #'URL': item_url
        }
        print(data)
Notes: promoted items carry a 'data-addtype=level2' attribute on their <a> tag, and Zhuanzhuan items carry an 'onclick=xxxxxxxxx' attribute, while normal listings have neither, so a simple check filters both out. The string that comes back for item_views has to be processed by hand, and find / rfind make that very convenient. Also, sending a header with the Referer when requesting the counter is mandatory, otherwise the JS endpoint always returns 0. For listings spanning multiple zones, refer back to the 1-2 code and use stripped_strings; that solves it, though I feel the code could still be leaner. A sketch of both tricks follows.