Week1_Practice3

get_Href

from getPageInformation import *
from getMainPageInformation import *
from bs4 import  BeautifulSoup
import requests
import time

# Crawl entry points: the Beijing landing page plus search-result pages 2-19.
url1 = 'http://bj.xiaozhu.com/'
urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
    for page_no in range(2, 20)
]

# Step 1: gather listing links from the landing page and every result page.
hreflist = MainPageInformation(url1)
for page_url in urls:
    hreflist.extend(MainPageInformation(page_url))
    time.sleep(1)  # one-second pause after each page request

# Step 2: fetch the detail record behind every collected link.
sourceData = []
for listing_url in hreflist:
    sourceData.append(getPageInformation(listing_url))
    time.sleep(1)  # one-second pause after each detail request

# Step 3: print each record, followed by blank lines as a separator.
for record in sourceData:
    print(record)
    print('\n')

getMainPageInformation

from bs4 import  BeautifulSoup
import requests

def MainPageInformation(url, timeout=10):
    """Collect the listing links found on one result page.

    Downloads ``url``, parses the response text with BeautifulSoup's
    ``lxml`` parser and extracts the ``href`` attribute of every anchor
    matched by the ``ul > li > a[class="resule_img_a"]`` selector.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before raising
            a timeout error. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        A list of href strings, in the order the selector returned the
        anchors. Anchors with a missing or empty ``href`` are skipped, so
        the list never contains ``None``. The list is empty when nothing
        matches.
    """
    pageData = requests.get(url, timeout=timeout)
    data = BeautifulSoup(pageData.text, 'lxml')
    anchors = data.select(' ul > li > a[class="resule_img_a"]')
    hreflist = []
    for anchor in anchors:
        link = anchor.get('href')
        # Skip anchors without a usable href; callers pass every entry
        # straight to requests.get, which rejects None or ''.
        if link:
            hreflist.append(link)
    return hreflist


getPageInformation

from bs4 import  BeautifulSoup
import requests

def getPageInformation(url, timeout=10):
    """Scrape the summary fields of one listing detail page.

    Downloads ``url``, parses the response text with BeautifulSoup's
    ``lxml`` parser and reads the title, room image, price, address,
    host image and host name through CSS selectors.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds ``requests`` waits for the server before raising
            a timeout error. Without it a stalled connection would block
            the caller indefinitely.

    Returns:
        A dict with the keys ``title``, ``roomImage``, ``price``,
        ``address``, ``hosterImage`` and ``hosterName``. An empty dict is
        returned when any one of the selectors matches nothing, because
        the matches are combined with ``zip``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    titles = soup.select('div.pho_info > h4 > em')
    roomImages = soup.select('#curBigImage')
    prices = soup.select("div.day_l > span")
    addresses = soup.select('div.pho_info > p > span.pr5')
    hosterImages = soup.select('div.member_pic > a > img')
    hosterName = soup.select('div.w_240 > h6 > a')

    data = {}
    # zip stops at the shortest result list, so a selector with no match
    # leaves ``data`` empty. If several elements match, each iteration
    # overwrites ``data`` and the last aligned group is the one returned.
    for title, roomImage, price, address, hosterImage, name in zip(
            titles, roomImages, prices, addresses, hosterImages, hosterName):
        data = {
            'title': title.get_text(),
            'roomImage': roomImage.get('src'),
            'price': price.get_text(),
            'address': address.get_text().strip(),
            'hosterImage': hosterImage.get('src'),
            'hosterName': name.get_text(),
        }
    return data




    # print(titles)
    # print("-------------------------------------------")
    # print(roomImages)
    # print("-------------------------------------------")
    # print(price)
    # print("-------------------------------------------")
    # print(address)
    # print("-------------------------------------------")
    # print(hosterImage)



# url='http://bj.xiaozhu.com/fangzi/269024901.html'
# getPageInformation(url)
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容