Scraping Renren Blog Posts
Save the URL of your first blog post and your Cookie into the script, then run python renren_spider.py. It downloads that post and keeps following the next-post link until there are no more.
https://github.com/MioYvo/renren_blog_spider
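A note on the Cookie value, which the code below leaves blank on purpose: it is the raw Cookie request header copied from your browser's developer tools (Network tab) after logging in to renren.com. A minimal sketch of the expected shape; the cookie names here are hypothetical placeholders, not Renren's real ones:

# Hypothetical illustration of the cookie dict's shape: the names and
# values are placeholders, not real Renren session cookies.
# Paste the actual "Cookie" header from your browser's dev tools.
cookie = {"Cookie": "anonymid=xxxx; t=xxxx; ..."}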
The full code:
# coding=utf-8
# __author__ = 'Mio'
import requests
from bs4 import BeautifulSoup
from tornado.escape import utf8
download_dir = "/Users/mio/Downloads/renren_blog/"
first_url = "http://blog.renren.com/blog/408842825/938039669"
cookie = {"Cookie": " .... "}


def save_blog(blog_url):
    print blog_url
    # Send the saved Cookie as a raw request header so Renren sees us as
    # logged in (requests' cookies= expects name/value pairs, so the
    # header dict goes in headers= instead).
    html = requests.get(blog_url, headers=cookie).content
    soup = BeautifulSoup(html, "html.parser")
    # Post date
    blog_date = soup.find('span', class_="blogDetail-ownerOther-date")
    blog_date = utf8(blog_date.contents[0])
    # Title; replace "/" so the title works as a filename
    title = soup.find('h2', class_="blogDetail-title")
    title = utf8(title.contents[0])
    title = title.replace("/", "\\")
    print title
    # Post body
    blog_content = soup.find_all("div", class_="blogDetail-content")[0]
    with open("{}{}.md".format(download_dir, title), "wb") as fw:
        fw.write("# {}\n".format(title))
        fw.write("> {}\n\n".format(blog_date))
        for i in blog_content:
            try:
                fw.write(str(i))
            except Exception as e:
                print e
    return get_next(soup)


def get_next(soup):
    # Next post: the prev/next navigation block, whose first link
    # points at the next post to fetch
    pre_and_next = soup.find_all(class_="blogDetail-pre")
    if pre_and_next:
        next_blog_url = pre_and_next[0].findAll('a', href=True)
        if next_blog_url:
            return next_blog_url[0]['href']
    return False


if __name__ == '__main__':
    # Walk the chain of next-post links, starting from first_url
    url = first_url
    while True:
        next_url = save_blog(url)
        if next_url:
            url = next_url
        else:
            print 'done'
            break
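One caveat: the post body is written out as the raw HTML fragments BeautifulSoup found, even though the files get a .md extension. If you want actual Markdown, a possible post-processing step (my own suggestion, not part of the original script) is the html2text package:

# Optional post-processing sketch, assuming `pip install html2text`.
# html2text.html2text() takes an HTML string and returns Markdown.
import html2text

def html_to_markdown(html_fragment):
    # html_fragment is an HTML snippet, e.g. one of the pieces
    # written out inside save_blog()
    return html2text.html2text(html_fragment)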