import requests
from lxml import etree
import json
import os
import time
# import numpy as np
# import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
# Configure the echarts asset host — presumably a self-hosted mirror of the
# pyecharts JS bundle on the local network; verify the server is reachable.
from pyecharts.globals import CurrentConfig
CurrentConfig.ONLINE_HOST = "http://172.16.102.9:8087/echarts_lib/"
def requestd(url):
    """Fetch *url* and return the response body as text.

    Sends a fixed desktop-Chrome User-Agent plus a hard-coded Baidu
    cookie so both target sites serve the regular desktop markup.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823",
    }
    response = requests.get(url=url, headers=headers)
    return response.text
def sina2hot(url):
    """Scrape the Sina/Weibo realtime hot-search board.

    Parameters
    ----------
    url : str
        The Weibo hot-search summary page.

    Returns
    -------
    list[dict]
        One dict per entry with keys ``url`` (absolute link), ``name``
        (topic title), ``num`` (int search count), ``type`` (source label)
        and ``strnum`` (count rendered in units of 万 / 10 000).
    """
    html = etree.HTML(requestd(url))
    hrefs = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/@href')
    titles = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/text()')
    counts = html.xpath('//table/tbody/tr/td[@class="td-02"]/span/text()')
    host = 'https://s.weibo.com'
    big_list = []
    # zip() stops at the shortest list, so an entry that lacks one of the
    # three fields (e.g. a pinned row with no <span> count) can no longer
    # raise IndexError the way the old parallel-index loop did.
    for href, title, count in zip(hrefs, titles, counts):
        num = int(count)
        big_list.append({
            'url': host + href,
            'name': title,
            'num': num,
            'type': '新浪热搜',
            'strnum': str(int(num / 10000)) + '万',
        })
    return big_list
def baidu2hot(url):
    """Scrape the Baidu homepage hot-search widget.

    The data sits as a JSON blob inside a hidden
    ``<textarea id="hotsearch_data">`` element; each entry is turned into
    the same dict shape as the Sina scraper, with ``url`` pointing at a
    Baidu news search for the topic title.
    """
    page = etree.HTML(requestd(url))
    search_host = 'http://www.baidu.com/s?tn=news&wd='
    results = []
    for raw in page.xpath('//textarea[@id="hotsearch_data"]/text()'):
        payload = json.loads(raw)
        for entry in payload['hotsearch']:
            title = entry['pure_title']
            score = int(entry['heat_score'])
            results.append({
                'url': search_host + title,
                'name': title,
                'num': score,
                'type': '百度热搜',
                'strnum': str(int(score / 10000)) + '万',
            })
    return results
def getnewdetails(url):
    """Collect article body HTML for a Baidu news search result page.

    Follows every result link that points at ``baijiahao.baidu.com`` and
    concatenates the serialized ``<p>`` elements of each article body.

    Parameters
    ----------
    url : str
        A Baidu news search results page.

    Returns
    -------
    str
        Concatenated ``<p>...</p>`` HTML fragments ('' when nothing matched).
    """
    html = etree.HTML(requestd(url))
    hrefs = html.xpath('//h3[@class="c-title"]/a/@href')
    parts = []
    for href in hrefs:
        # Only baijiahao articles have the article-content layout we parse.
        if 'baijiahao.baidu.com' not in href:
            continue
        article = etree.HTML(requestd(href))
        for p in article.xpath('//*[@class="article-content"]/p'):
            parts.append(etree.tostring(p, encoding='utf-8').decode())
    # ''.join avoids the quadratic cost of repeated += string concatenation.
    return ''.join(parts)
def save2json(file_save_path, file_db_list):
    """Dump *file_db_list* to ``test.json`` inside *file_save_path*.

    Parameters
    ----------
    file_save_path : str
        Target directory; when falsy the file goes to the current directory
        (matching the old behavior).
    file_db_list : list
        JSON-serializable data to persist.
    """
    # Build the target path directly instead of os.chdir(): chdir mutated
    # the working directory for the entire process as a hidden side effect.
    if file_save_path:
        target = os.path.join(file_save_path, 'test.json')
    else:
        target = 'test.json'
    # ensure_ascii=False keeps the Chinese titles human-readable on disk.
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(file_db_list, fp, ensure_ascii=False)
def init2db(file_save_path):
    """Fetch both hot-search boards, merge, sort, dedupe and persist.

    Pipeline: scrape Sina + Baidu → sort by search count (descending) →
    drop duplicate topic names (keeping the highest-count occurrence) →
    render the chart and write the JSON snapshot into *file_save_path*.
    """
    sinaurl = 'https://s.weibo.com/top/summary?cate=realtimehot'
    baidurl = 'http://www.baidu.com/'
    # Merge both sources into one list of uniform dicts.
    all_list = sina2hot(sinaurl) + baidu2hot(baidurl)
    # Highest search volume first.
    new_list = sorted(all_list, key=lambda item: item['num'], reverse=True)
    # Dedupe by topic name; a set gives O(1) membership tests instead of
    # the old O(n) scan of a list, so the whole pass is linear.
    seen = set()
    deduped = []
    for entry in new_list:
        if entry['name'] not in seen:
            seen.add(entry['name'])
            deduped.append(entry)
    print('去重排序后总共', str(len(deduped)))
    # Chart + JSON snapshot.
    json2charts(deduped, file_save_path)
    save2json(file_save_path, deduped)
def json2charts(all_list, file_save_path):
    """Render the merged hot-search list as a pyecharts bar chart.

    Each entry becomes its own one-point series (so the tooltip can show
    the source platform via seriesName); the single x-axis category is
    today's date. Output is written to ``<file_save_path>/index.html``.
    """
    bar = Bar()
    # One series per topic: name/value pairs feed the tooltip formatter.
    for item in all_list:
        bar.add_yaxis(item['type'], [{'name': item['name'], 'value': item['num']}])
    nowtime = time.strftime("%Y-%m-%d")
    bar.add_xaxis([nowtime])
    bar.set_series_opts(
        # Per-bar labels would be unreadable with this many series.
        label_opts=opts.LabelOpts(is_show=False),
        # BUG FIX: the original literal contained raw newlines inside
        # single-line string literals — a SyntaxError that prevented the
        # module from even importing. Tooltips are HTML, so the intended
        # line breaks are expressed with '<br/>'.
        tooltip_opts=opts.TooltipOpts(formatter=JsCode(
            "function(x){return '平台:'+ x.seriesName+'<br/>'"
            "+'事件:'+x.name+'<br/>'"
            "+'搜索量:'+ parseInt(x.value/10000) + '万' }"
        ))
    )
    bar.render(file_save_path + '/index.html')
if __name__ == "__main__":
    # Output directory for index.html / test.json — hoisted to a single
    # variable so the path is no longer duplicated; adjust as needed.
    save_path = 'C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts'
    init2db(save_path)
    # Re-scrape every 10 minutes (600 s) — the old comment claimed
    # "every minute", which contradicted the sleep interval.
    while True:
        time.sleep(600)
        init2db(save_path)