frombs4importBeautifulSoup
importrequests
importtime
# Sample URL of one 58.com tablet listing; used only for manual testing below.
url = 'http://bj.58.com/pingbandiannao/25936435448255x.shtml?psid=110197818191709710732024550&entinfo=25936435448255_0&iuType=p_0'
def get_58_goods_page(url):
    """Fetch one 58.com listing page and print its parsed details.

    Scrapes category, title, posting time, price and location from the
    listing at *url* and prints one dict per scraped row.

    :param url: absolute URL of a single listing detail page.
    :returns: None (results are printed).
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    cates = soup.find_all('a', href='http://bj.58.com/pbdn/')
    titles = soup.select('div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    times = soup.select('ul.mtit_con_left.fl > li.time')
    prices = soup.select('div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
    locations = soup.select(' div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span > a')
    # Join however many location parts exist ('' for none, 'a' for one,
    # 'a-b' for two, ...).  The original if/elif only bound location1 for
    # exactly 1 or 2 parts and raised NameError for any other count.
    location1 = '-'.join(loc.get_text() for loc in locations)
    # NOTE: the loop variable was previously named `time`, shadowing the
    # imported time module inside this function; renamed to post_time.
    # `locations` is deliberately not zipped in: the joined location1
    # string applies to every row, and zipping it would silently drop
    # rows when the location-link count differs from the other lists.
    for cate, title, post_time, price in zip(cates, titles, times, prices):
        data = {
            'cate': cate.get_text(),
            'title': title.get_text(),
            'time': post_time.get_text(),
            'price': price.get_text() + '元',
            'location': location1,
            'url': url,
        }
        print(data)
# get_58_goods_page(url)  # manual smoke test of the detail-page scraper
# Base URL of the tablet-category index pages; the page number is appended.
url2 = 'http://bj.58.com/pbdn/0/pn'
def get_58_index_page(url):
    """Fetch one 58.com index page and return the listing URLs it links to.

    Keeps only links carrying an 'entinfo' query parameter while skipping
    'zhineng' and 'jing' entries (promoted/recommended results).

    :param url: absolute URL of one paginated category index page.
    :returns: list of listing-detail URLs (possibly empty).
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    listing_urls = []
    for anchor in soup.select('a.t'):
        href = anchor.get('href')
        # Tag.get returns None when the <a> has no href attribute; guard
        # so the substring tests below cannot raise TypeError.
        if href and 'entinfo' in href and 'zhineng' not in href and 'jing' not in href:
            listing_urls.append(href)
    return listing_urls
def number_of_url(start, end, url):
    """Crawl index pages *start*..*end* (inclusive) built from *url*.

    For every index page, fetch each listing it links to, pausing two
    seconds before each detail request to stay polite to the server.
    """
    for page_no in range(start, end + 1):
        index_url = url + str(page_no)
        for goods_url in get_58_index_page(index_url):
            time.sleep(2)
            get_58_goods_page(goods_url)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    number_of_url(1, 2, url2)