使用 Python 爬取豆瓣电影 Top 250 的相关信息,然后保存到 MongoDB。
代码如下:
import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Connect to the MongoDB server and grab handles to the `movie` database
# and its `top250` collection, where the scraped records are stored.
conn = MongoClient(host='192.168.129.150', port=27017)
movie = conn.movie
top250 = movie.top250
# Ten listing-page URLs whose `start` offset advances in steps of 25
# (0, 25, ..., 225).
urls = [
    'https://movie.douban.com/top250?start={}&filter='.format(offset)
    for offset in range(0, 250, 25)
]
def get_movieinfo(urls, data=None):
    """Scrape one listing page and store every movie on it in MongoDB.

    Args:
        urls: URL of a single listing page. Despite the plural name it is
            passed straight to ``requests.get``.
        data: Unused; kept only so the signature stays compatible with
            existing callers.

    Raises:
        requests.exceptions.RequestException: If the request times out or
            the server answers with an error status.
        IndexError: If an ``.item`` element lacks one of the expected
            sub-elements or text lines.

    Side effects:
        Prints one line per movie and inserts one document per movie into
        ``movie.top250`` (the module-level database handle).
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    web_data = requests.get(urls, timeout=10)
    # Fail loudly instead of silently parsing an error page into zero rows.
    web_data.raise_for_status()
    soup = BeautifulSoup(web_data.text, 'lxml')

    for item in soup.select('.item'):
        rank = item.select('em')[0].text
        # The name is the second line of the first link's text inside .info.
        name = item.select('.info')[0].select('a')[0].text.split('\n')[1]
        score = item.select('.rating_num')[0].text
        link = item.select('a')[0]['href']

        # Split the .bd text once; lines 2 and 3 are stored as the
        # director/actor and time/country fields respectively.
        bd_lines = item.select('.bd')[0].text.split('\n')
        director_actor = bd_lines[2].lstrip()
        time_country = bd_lines[3].lstrip()

        print(rank, name, score, link, director_actor, time_country)
        movie.top250.insert_one({
            'rank': rank,
            'name': name,
            'score': score,
            'link': link,
            'director_actor': director_actor,
            'time_country': time_country,
        })
# Scrape every listing page in turn.
for page_url in urls:
    get_movieinfo(page_url)
MongoDB 查询的结果如下: