# init.py (4.9 KB)
import requests
from lxml import etree
import json
import os
import time
# import numpy as np
# import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
# Configure the echarts asset path: serve echarts.js from this internal
# host instead of pyecharts' default public CDN.
from pyecharts.globals import CurrentConfig
CurrentConfig.ONLINE_HOST = "http://172.16.102.9:8087/echarts_lib/"
  14. def requestd(url):
  15. # request:
  16. t = requests.get(url=url, headers={
  17. "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
  18. "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823"
  19. }).text
  20. return t
  21. def sina2hot(url):
  22. # 新浪热搜数据:
  23. r = requestd(url)
  24. html = etree.HTML(r)
  25. # 匹配需要的字段
  26. a_herfs = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/@href')
  27. a_text = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/text()')
  28. a_num = html.xpath('//table/tbody/tr/td[@class="td-02"]/span/text()')
  29. # 组装成一个list
  30. big_list = []
  31. host = 'https://s.weibo.com'
  32. for i, value in enumerate(a_num):
  33. obj = dict()
  34. obj['url'] = host + a_herfs[i]
  35. obj['name'] = a_text[i]
  36. obj['num'] = int(a_num[i])
  37. obj['type'] = '新浪热搜'
  38. obj['strnum'] = str(int(obj['num']/10000))+'万'
  39. big_list.append(obj)
  40. return big_list
  41. def baidu2hot(url):
  42. # 百度热搜数据:
  43. b = requestd(url)
  44. html = etree.HTML(b)
  45. hotsearch = html.xpath('//textarea[@id="hotsearch_data"]/text()')
  46. big_list = []
  47. host = 'http://www.baidu.com/s?tn=news&wd='
  48. for item in hotsearch:
  49. josondb = json.loads(item)
  50. for items in josondb['hotsearch']:
  51. obj = dict()
  52. obj['url'] = host + items['pure_title']
  53. obj['name'] = items['pure_title']
  54. obj['num'] = int(items['heat_score'])
  55. obj['type'] = '百度热搜'
  56. obj['strnum'] = str(int(obj['num']/10000))+'万'
  57. # obj['htmls'] = getnewdetails(obj['url'])
  58. big_list.append(obj)
  59. return big_list
  60. def getnewdetails(url):
  61. # 获取新闻详情数据:
  62. t = requestd(url)
  63. html = etree.HTML(t)
  64. aherf = html.xpath('//h3[@class="c-title"]/a/@href')
  65. newdb = ''
  66. for item in aherf:
  67. if 'baijiahao.baidu.com' in item:
  68. s = requestd(item)
  69. htmls = etree.HTML(s)
  70. # # 获取元素=>转成html
  71. result = htmls.xpath('//*[@class="article-content"]/p')
  72. for i in result:
  73. newdb += etree.tostring(i, encoding='utf-8').decode()
  74. # print(newdb)
  75. return newdb
  76. def save2json(file_save_path, file_db_list):
  77. # 数据存json文件:
  78. # 中文禁用ascii,采用UTF8
  79. if file_save_path:
  80. os.chdir(file_save_path)
  81. with open('test.json', 'w', encoding='utf-8') as F:
  82. json.dump(file_db_list, F, ensure_ascii=False)
  83. pass
  84. def init2db(file_save_path):
  85. # 合并数据list:
  86. sinaurl = 'https://s.weibo.com/top/summary?cate=realtimehot'
  87. baidurl = 'http://www.baidu.com/'
  88. # 合并list
  89. all_list = sina2hot(sinaurl) + baidu2hot(baidurl)
  90. # 倒序=》根据浏览量排序
  91. new_list = sorted(all_list, key=lambda num: num['num'], reverse=True)
  92. # print('总共', str(len(new_list)))
  93. # 去重
  94. new_list_1 = []
  95. seen = []
  96. for d in new_list:
  97. t = d['name']
  98. if t not in seen:
  99. seen.append(t)
  100. new_list_1.append(d)
  101. print('去重排序后总共', str(len(new_list_1)))
  102. # 图表展示
  103. json2charts(new_list_1, file_save_path)
  104. save2json(file_save_path, new_list_1)
  105. pass
  106. def json2charts(all_list, file_save_path):
  107. # 图形化:
  108. bar = Bar()
  109. # 重新处理echarts需要的数据
  110. for item in all_list:
  111. db = []
  112. obj = dict()
  113. obj['name'] = item['name']
  114. obj['value'] = item['num']
  115. db.append(obj)
  116. bar.add_yaxis(item['type'], db)
  117. nowtime = time.strftime("%Y-%m-%d")
  118. bar.add_xaxis([nowtime])
  119. bar.set_series_opts(
  120. label_opts=opts.LabelOpts(is_show=False),
  121. # label_opts=opts.LabelOpts(formatter=JsCode(
  122. # "function(x){return parseInt(x.value/10000)+'万'}"
  123. # )),
  124. # 数据要传递过去
  125. tooltip_opts=opts.TooltipOpts(formatter=JsCode(
  126. "function(x){return '平台:'+ x.seriesName+'<br/>'+'事件:'+x.name+'<br/>'+'搜索量:'+ parseInt(x.value/10000) + '万' }"
  127. ))
  128. )
  129. bar.render(file_save_path+'/index.html')
if __name__ == "__main__":
    # Output directory for index.html / test.json — set to your own path.
    init2db('C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts')
    # Poll forever. NOTE(review): the original comment said "fetch once per
    # minute", but time.sleep(600) is a 10-minute interval — confirm which
    # cadence was intended.
    while True:
        time.sleep(600)
        init2db('C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts')
    pass