# import requests
# import requests
#
# response=requests.get("https://s.weibo.com/weibo?q=%E6%B4%9B%E5%A4%A9%E7%84%B6%EF%BC%8C&wvr=6&b=1&Refer=SWeibo_box")
#
# #打印类型
# print(type(response))
# #打印状态码
# print(response.status_code)
# #打印网页源码类型
# #字符串类型,不需要decode(区别于urllib)
# print(type(response.text))
# #打印网页源码
# for x in range(20):
# response.xpath("//*[@id="pl_feedlist_index"]/div[1]/div[3]/div[2]/div[1]/div[2]/div[1]")
#
# print(response.text)
# #打印cookie
# print(response.cookies)
import time

import requests
import xlsxwriter
from lxml import etree
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException
def get_url(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            therefore reported as ``None`` like any other request failure.

    Returns:
        The response text when the server answers with status 200, the
        integer status code for any other status, or ``None`` when the
        request itself fails.
    """
    # An authenticated variant would be
    # requests.post(url, auth=HTTPBasicAuth(user, password)).
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return response.status_code
def get_text(html, output_path=r"C:\Users\Administrator\Desktop\projects\tencent_pro\excel_file\weibo.xlsx"):
    """Parse a Weibo search-result page and print the title of each video post.

    An xlsx workbook is created at *output_path* containing one sheet with
    the header row. Only the header is written: the titles that are found
    are printed, not stored in the sheet.

    Args:
        html: Page source as ``str`` or ``bytes``. Anything else (for example
            the ``None`` or integer status code that ``get_url`` returns on
            failure) or an empty value is rejected without creating a file.
        output_path: Destination of the xlsx file. Defaults to the path the
            script has always used.

    Returns:
        ``None`` in every case.
    """
    if not html or not isinstance(html, (str, bytes)):
        print("get_text: nothing to parse, got", repr(html))
        return None

    tree = etree.HTML(html)
    if tree is None:
        # Defensive: guard against the parser yielding no root element.
        print("get_text: nothing to parse, got", repr(html))
        return None

    workbook_options = {  # workbook-wide settings
        'strings_to_numbers': True,   # write numeric strings as numbers
        'strings_to_urls': False,     # do not turn URL-like strings into hyperlinks
        'constant_memory': False,     # True is meant for very large outputs
        'default_format_properties': {
            'font_name': '微软雅黑',   # default would be "Arial"
            'font_size': 10,          # default would be 11
            'bold': False,
            'border': 1,              # cell border width, default 0
            'align': 'vcenter',
            'valign': 'vcenter',
        },
    }

    # Every result card is a div[@class='card-wrap'] below this container.
    cards_xpath = "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[@class='card-wrap']"
    # Content area of the i-th div in that container (1-based position).
    card_body = "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[{i}]/div/div[1]/div[2]"

    # Candidate XPaths noted for fields that are not extracted yet, relative
    # to the i-th div of the container (unverified against the live page):
    #   video element : div/div[1]/div[2]/div[2]/div[2]/a/div[2]/div/video  (or ".//video")
    #   publish time  : div/div[1]/div[2]/p[3]/a[1]
    #   duration      : div/div[1]/div[2]/div[2]/div[2]/a/div[2]/div/div[5]/div[4]/div/span
    #   publisher     : div/div[1]/div[2]/div[1]/div[2]/a[1]   (id taken from href)
    #   play count    : div/div[2]/ul/li[4]/a/em

    # The context manager guarantees workbook.close(), which is what actually
    # writes the file to disk.
    with xlsxwriter.Workbook(output_path, options=workbook_options) as workbook:
        worksheet = workbook.add_worksheet('first_sheet')
        worksheet.write_row(0, 0, ['链接', '发布时间', '播放次数', '视频时长', '标题', '发布者', '发布者ID', '平台名称', '原本标题', '原版链接'])
        # Same resulting widths as the former overlapping ranges:
        # column A is 10 wide, columns B..I are 30 wide.
        worksheet.set_column('A:A', 10)
        worksheet.set_column('B:I', 30)

        cards = tree.xpath(cards_xpath)
        print(len(cards), cards)

        # enumerate keeps the position in step with the card even when a card
        # is skipped (the manual counter was not advanced on `continue`).
        for i, item in enumerate(cards, start=1):
            base = card_body.format(i=i)

            # A card counts as a video post when it has a media link with an href.
            video_links = item.xpath(base + "/div[2]/div[2]/a")
            if not video_links or not video_links[0].get('href', ''):
                continue

            title_links = item.xpath(base + "/p[1]/a")
            if not title_links:
                continue

            title = title_links[0].text
            print(title)

    return None
def main():
    """Fetch the hard-coded Weibo search URL, parse it and print the result.

    The page is downloaded with ``get_url`` and handed to ``get_text``;
    whatever ``get_text`` returns is printed.
    """
    # q=123%5C is the percent-encoded query `123\`. The earlier variant of the
    # URL that embedded a raw backslash ('\&', an invalid escape sequence)
    # was immediately overwritten and has been removed.
    url = 'https://s.weibo.com/weibo?q=123%5C&xsort=hot&Refer=hotmore'
    html = get_url(url)
    t = get_text(html)
    print(t)


if __name__ == '__main__':
    main()
# Hot posts: candidate XPaths for the "is this post a video" check
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[4]/div[2]/div[1]/div[2]/div[2]/div[2]/a"
#
# "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[3]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[5]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[7]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div/div[1]/div[2]/div[5]/div/div[1]/div[2]/div[2]/div[2]/a"
# Hot articles: candidate XPaths for the "is this post a video" check
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[10]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[18]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[23]/div/div[1]/div[2]/div[2]/div[2]/a"
# "/html/body/div[1]/div[3]/div[2]/div[1]/div[1]/div[25]/div/div[1]/div[2]/div[2]/div[2]/a"