Scrape news headlines, summaries, and tags, and write them to a CSV file.
# -*- coding: utf-8 -*-
# __author__ = 'cuiwnehao'
import csv

import requests
from lxml import etree

root_url = 'http://scitech.people.com.cn'
def get_urls():
    """Build the list of paginated index URLs."""
    base_url = 'http://scitech.people.com.cn/index{}.html#fy01'
    urls = []
    for page in range(13):
        urls.append(base_url.format(page))
    return urls
def parse_data(url):
    """Fetch one index page and extract the title, link, summary, and tags of each item."""
    items = []
    req = requests.get(url)
    if req.status_code == 200:
        req.encoding = 'GB2312'  # the site serves GB2312-encoded pages
        selector = etree.HTML(req.text)
        infos = selector.xpath('//div[@class="hdNews clearfix"]')
        for info in infos:
            short_contents = info.xpath('div[@class="on"]/em/a/text()')
            if not short_contents:
                continue  # skip entries without a summary
            item = {}
            item['url'] = url
            item['title_url'] = root_url + info.xpath('div[@class="on"]/h5/a/@href')[0]
            item['title'] = info.xpath('div[@class="on"]/h5/a/text()')[0]
            item['short_content'] = short_contents[0]
            item['tags'] = '-'.join(info.xpath('h6/em[@class="gray"]/a/text()'))
            items.append(item)
    return items
if __name__ == "__main__":
    datas = []
    for url in get_urls():
        datas.extend(parse_data(url))
    if datas:  # avoid an IndexError (and an empty file) when nothing was scraped
        with open('datas1.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(datas[0].keys()))
            writer.writeheader()
            writer.writerows(datas)
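
To sanity-check the output, here is a minimal sketch that reads datas1.csv back with csv.DictReader; it assumes the script above has already been run and produced the file.

import csv

# Read the scraped rows back; the first CSV row is the header written above.
with open('datas1.csv', encoding='utf-8', newline='') as f:
    rows = list(csv.DictReader(f))

print('{} rows scraped'.format(len(rows)))
for row in rows[:3]:  # preview the first few records
    print(row['title'], row['title_url'], row['tags'])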