| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- import requests
- from lxml import etree
- import json
- import os
- import time
- # import numpy as np
- # import matplotlib.pyplot as plt
- from pyecharts import options as opts
- from pyecharts.charts import Bar
- from pyecharts.commons.utils import JsCode
- # Configure the host that serves the ECharts JavaScript assets
- from pyecharts.globals import CurrentConfig
- CurrentConfig.ONLINE_HOST = "http://172.16.102.9:8087/echarts_lib/"
def requestd(url, timeout=10):
    """Fetch *url* with HTTP GET and return the response body as text.

    The request is sent with a fixed desktop-browser User-Agent and a
    hard-coded Baidu cookie string.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        # NOTE(review): cookie values look like a captured browser session
        # and may go stale - confirm they are still required.
        "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823",
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text
def sina2hot(url):
    """Scrape the Sina Weibo hot-search table at *url*.

    Args:
        url: Address of the Weibo hot-search page.

    Returns:
        A list of dicts, one per usable table row, with the keys ``url``
        (absolute link), ``name`` (title text), ``num`` (count as ``int``),
        ``type`` (always ``'新浪热搜'``) and ``strnum`` (count in units of
        ten thousand followed by ``'万'``).
    """
    page = etree.HTML(requestd(url))
    host = 'https://s.weibo.com'
    big_list = []
    # Read link, title and count from the SAME cell. Collecting three
    # page-wide lists and pairing them by index mislabels every following
    # entry as soon as one row has no <span> count.
    for cell in page.xpath('//table/tbody/tr/td[@class="td-02"]'):
        hrefs = cell.xpath('a/@href')
        titles = cell.xpath('a/text()')
        counts = cell.xpath('span/text()')
        if not (hrefs and titles and counts):
            # Row lacks a link, a title or a count; nothing to rank.
            continue
        try:
            num = int(counts[0])
        except ValueError:
            # Count text is not a plain integer; skip the row instead of
            # aborting the whole scrape.
            continue
        big_list.append({
            'url': host + hrefs[0],
            'name': titles[0],
            'num': num,
            'type': '新浪热搜',
            'strnum': str(int(num / 10000)) + '万',
        })
    return big_list
def baidu2hot(url):
    """Extract Baidu hot-search entries embedded in the page at *url*.

    The page is expected to carry JSON inside
    ``<textarea id="hotsearch_data">`` with a top-level ``hotsearch`` list
    whose items provide ``pure_title`` and ``heat_score``.

    Args:
        url: Address of the Baidu page to scrape.

    Returns:
        A list of dicts with the keys ``url`` (Baidu news search link for
        the title), ``name``, ``num`` (heat score as ``int``), ``type``
        (always ``'百度热搜'``) and ``strnum`` (score in units of ten
        thousand followed by ``'万'``).
    """
    from urllib.parse import quote

    page = etree.HTML(requestd(url))
    host = 'http://www.baidu.com/s?tn=news&wd='
    big_list = []
    for raw_json in page.xpath('//textarea[@id="hotsearch_data"]/text()'):
        payload = json.loads(raw_json)
        for entry in payload['hotsearch']:
            title = entry['pure_title']
            num = int(entry['heat_score'])
            big_list.append({
                # Percent-encode the title: characters such as '&', '#',
                # '+' or spaces would otherwise truncate or alter the query.
                'url': host + quote(title, safe=''),
                'name': title,
                'num': num,
                'type': '百度热搜',
                'strnum': str(int(num / 10000)) + '万',
            })
    return big_list
def getnewdetails(url):
    """Collect article paragraphs (as HTML) for the news results at *url*.

    Every result link found under ``<h3 class="c-title">`` that points to
    ``baijiahao.baidu.com`` is fetched, and the ``<p>`` children of its
    ``article-content`` element are serialized back to HTML.

    Args:
        url: Address of a search-result page to scan for article links.

    Returns:
        One string containing the serialized paragraphs of all matching
        articles, concatenated in document order; empty if none matched.
    """
    listing = etree.HTML(requestd(url))
    fragments = []
    for link in listing.xpath('//h3[@class="c-title"]/a/@href'):
        if 'baijiahao.baidu.com' not in link:
            continue
        article = etree.HTML(requestd(link))
        paragraphs = article.xpath('//*[@class="article-content"]/p')
        # Serialize each paragraph element back to an HTML string.
        fragments.extend(
            etree.tostring(node, encoding='utf-8').decode()
            for node in paragraphs
        )
    return ''.join(fragments)
def save2json(file_save_path, file_db_list):
    """Write *file_db_list* as JSON to ``test.json``.

    Args:
        file_save_path: Directory that receives ``test.json``. When empty or
            ``None`` the file is written to the current working directory.
        file_db_list: JSON-serializable data to store.

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    # Build the full path instead of os.chdir(): changing the process-wide
    # working directory leaks into every later relative path, and with a
    # relative file_save_path a second call would no longer find it.
    if file_save_path:
        target = os.path.join(file_save_path, 'test.json')
    else:
        target = 'test.json'
    with open(target, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps Chinese text readable (UTF-8) rather than
        # emitting \uXXXX escapes.
        json.dump(file_db_list, handle, ensure_ascii=False)
def init2db(file_save_path):
    """Scrape both hot-search sources, then chart and save the merged list.

    The Weibo and Baidu entries are merged, sorted by ``num`` in descending
    order and de-duplicated by ``name`` (the highest-ranked occurrence is
    kept). The result is rendered by ``json2charts`` and written by
    ``save2json``.

    Args:
        file_save_path: Directory passed on to ``json2charts`` and
            ``save2json`` for the generated files.
    """
    sinaurl = 'https://s.weibo.com/top/summary?cate=realtimehot'
    baidurl = 'http://www.baidu.com/'
    merged = sina2hot(sinaurl) + baidu2hot(baidurl)
    # Highest count first, so de-duplication keeps the hottest entry.
    ranked = sorted(merged, key=lambda entry: entry['num'], reverse=True)
    # A set gives O(1) membership tests; a list made this loop O(n^2).
    seen_names = set()
    unique = []
    for entry in ranked:
        title = entry['name']
        if title in seen_names:
            continue
        seen_names.add(title)
        unique.append(entry)
    print('去重排序后总共', str(len(unique)))
    # Render the chart first, then persist the raw data.
    json2charts(unique, file_save_path)
    save2json(file_save_path, unique)
def json2charts(all_list, file_save_path):
    """Render *all_list* as a pyecharts bar chart in ``index.html``.

    Each entry is added as its own series, named after the entry's ``type``
    and holding a single data point. The x axis has one category: today's
    date.

    Args:
        all_list: Entries with at least the keys ``name``, ``num`` and
            ``type``.
        file_save_path: Directory in which ``index.html`` is written.
    """
    chart = Bar()
    for entry in all_list:
        # ECharts data item: 'name' is what the tooltip shows as the event.
        point = {'name': entry['name'], 'value': entry['num']}
        chart.add_yaxis(entry['type'], [point])
    today = time.strftime("%Y-%m-%d")
    chart.add_xaxis([today])
    hidden_labels = opts.LabelOpts(is_show=False)
    # Tooltip text is built in the browser from the series name (platform),
    # the data item name (event) and its value in units of ten thousand.
    hover_text = opts.TooltipOpts(formatter=JsCode(
        "function(x){return '平台:'+ x.seriesName+'<br/>'+'事件:'+x.name+'<br/>'+'搜索量:'+ parseInt(x.value/10000) + '万' }"
    ))
    chart.set_series_opts(label_opts=hidden_labels, tooltip_opts=hover_text)
    chart.render(file_save_path + '/index.html')
if __name__ == "__main__":
    import traceback

    # Directory that receives index.html and test.json; adjust as needed.
    SAVE_DIR = 'C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts'
    # Pause between scrapes: 600 seconds, i.e. every ten minutes.
    POLL_SECONDS = 600

    while True:
        try:
            init2db(SAVE_DIR)
        except Exception:
            # Top-level boundary of a long-running poller: report the
            # failure and retry on the next cycle instead of exiting.
            # Ctrl-C (KeyboardInterrupt) still stops the script.
            traceback.print_exc()
        time.sleep(POLL_SECONDS)
|