Life is short, I use Python.
Python 的火热并不是因为Python 是一种简单的语言,在我看来Python的简洁优雅,和它应用范围的广泛,以及强大的类库,都是Python 受到欢迎的因素。
任何一种语言都不能说简单,Python 可能入手很简单,但你想要深入研究并写出优秀的代码,不下一段苦功夫是不可能达到的。
我今天要写的就是Python强大功能之一的 爬虫, 我要爬取的是某视频直播网站,抓取主播名字和人气,并作出排名。
话不多说上代码。
import re
from urllib import request
class Spider:
    """Scrape a live-stream listing page and print anchors ranked by popularity.

    ``go`` runs the whole pipeline: fetch the HTML, extract the raw regex
    matches for each anchor card, refine them to plain strings, sort by the
    popularity count (highest first) and print the ranking.
    """

    # The URL may be swapped for another listing page.
    url = 'https://egame.qq.com/livelist?layoutid=1104466820'
    # Seconds to wait for the server, so a stalled connection cannot hang
    # the script forever.
    timeout = 10
    # Raw strings: ``\s`` / ``\S`` are regex classes, not string escapes.
    # ``[\s\S]*?`` lazily matches any characters, including newlines.
    root_pattern = r'<div class="info-anchor">([\s\S]*?)</div>'
    name_pattern = r'<p class="name">([\s\S]*?)</p>'
    number_pattern = r'alt="火">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download the listing page and return it decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url, timeout=self.timeout) as response:
            return response.read().decode('utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches for every anchor card.

        Returns a list of dicts whose ``'name'`` and ``'number'`` values are
        the (possibly empty) lists produced by ``re.findall``.
        """
        return [
            {
                'name': re.findall(self.name_pattern, card),
                'number': re.findall(self.number_pattern, card),
            }
            for card in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Reduce each raw match list to a single string.

        The first name match is kept as-is and the first number match is
        stripped of surrounding whitespace. Cards missing either field are
        skipped instead of raising ``IndexError``.
        """
        return [
            {
                'name': anchor['name'][0],
                'number': anchor['number'][0].strip(),
            }
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return the anchors sorted by popularity, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an anchor's popularity text into a float sort key.

        The first decimal number in the text is used; if the text contains
        '万' (ten thousand) the value is multiplied by 10000. Text without
        any digits yields 0.0 so that such entries sort last.
        """
        # ``\d+(?:\.\d+)?`` keeps the fractional part ("1.5万" -> 1.5) and,
        # unlike ``\d*``, never produces an empty match.
        match = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one ``rank N : name number`` line per anchor, in order."""
        for rank, anchor in enumerate(anchors, start=1):
            print(
                'rank ' + str(rank) + ' : '
                + anchor['name'] + ' '
                + anchor['number']
            )

    def go(self):
        """Run the full pipeline: fetch, extract, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Run the crawl only when executed as a script, so that importing this
# module does not trigger a network request as a side effect.
if __name__ == '__main__':
    spider = Spider()
    spider.go()