caner
/
python-spider-baidu


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
							import requests
from lxml import etree
import json
import os
import time
# import numpy as np
# import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
# 配置ehcarts路径
from pyecharts.globals import CurrentConfig
CurrentConfig.ONLINE_HOST = "http://172.16.102.9:8087/echarts_lib/"


def requestd(url):
    # request:
    t = requests.get(url=url, headers={
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823"
    }).text
    return t


def sina2hot(url):
    # 新浪热搜数据:
    r = requestd(url)
    html = etree.HTML(r)
    # 匹配需要的字段
    a_herfs = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/@href')
    a_text = html.xpath('//table/tbody/tr/td[@class="td-02"]/a/text()')
    a_num = html.xpath('//table/tbody/tr/td[@class="td-02"]/span/text()')
    # 组装成一个list
    big_list = []
    host = 'https://s.weibo.com'
    for i, value in enumerate(a_num):
        obj = dict()
        obj['url'] = host + a_herfs[i]
        obj['name'] = a_text[i]
        obj['num'] = int(a_num[i])
        obj['type'] = '新浪热搜'
        obj['strnum'] = str(int(obj['num']/10000))+'万'
        big_list.append(obj)
    return big_list


def baidu2hot(url):
    # 百度热搜数据:
    b = requestd(url)
    html = etree.HTML(b)
    hotsearch = html.xpath('//textarea[@id="hotsearch_data"]/text()')
    big_list = []
    host = 'http://www.baidu.com/s?tn=news&wd='
    for item in hotsearch:
        josondb = json.loads(item)
        for items in josondb['hotsearch']:
            obj = dict()
            obj['url'] = host + items['pure_title']
            obj['name'] = items['pure_title']
            obj['num'] = int(items['heat_score'])
            obj['type'] = '百度热搜'
            obj['strnum'] = str(int(obj['num']/10000))+'万'
            # obj['htmls'] = getnewdetails(obj['url'])
            big_list.append(obj)
    return big_list


def getnewdetails(url):
    # 获取新闻详情数据:
    t = requestd(url)
    html = etree.HTML(t)
    aherf = html.xpath('//h3[@class="c-title"]/a/@href')
    newdb = ''
    for item in aherf:
        if 'baijiahao.baidu.com' in item:
            s = requestd(item)
            htmls = etree.HTML(s)
            # # 获取元素=>转成html
            result = htmls.xpath('//*[@class="article-content"]/p')
            for i in result:
                newdb += etree.tostring(i, encoding='utf-8').decode()
            # print(newdb)
    return newdb


def save2json(file_save_path, file_db_list):
    # 数据存json文件:
    # 中文禁用ascii,采用UTF8
    if file_save_path:
        os.chdir(file_save_path)
    with open('test.json', 'w', encoding='utf-8') as F:
        json.dump(file_db_list, F, ensure_ascii=False)
    pass


def init2db(file_save_path):
    # 合并数据list:
    sinaurl = 'https://s.weibo.com/top/summary?cate=realtimehot'
    baidurl = 'http://www.baidu.com/'
    # 合并list
    all_list = sina2hot(sinaurl) + baidu2hot(baidurl)

    # 倒序=》根据浏览量排序
    new_list = sorted(all_list, key=lambda num: num['num'], reverse=True)
    # print('总共', str(len(new_list)))
    # 去重
    new_list_1 = []
    seen = []
    for d in new_list:
        t = d['name']
        if t not in seen:
            seen.append(t)
            new_list_1.append(d)
    print('去重排序后总共', str(len(new_list_1)))
    # 图表展示
    json2charts(new_list_1, file_save_path)
    save2json(file_save_path, new_list_1)
    pass


def json2charts(all_list, file_save_path):
    # 图形化:
    bar = Bar()
    # 重新处理echarts需要的数据
    for item in all_list:
        db = []
        obj = dict()
        obj['name'] = item['name']
        obj['value'] = item['num']
        db.append(obj)
        bar.add_yaxis(item['type'], db)
    nowtime = time.strftime("%Y-%m-%d")
    bar.add_xaxis([nowtime])
    bar.set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        # label_opts=opts.LabelOpts(formatter=JsCode(
        #     "function(x){return parseInt(x.value/10000)+'万'}"
        # )),
        # 数据要传递过去
        tooltip_opts=opts.TooltipOpts(formatter=JsCode(
            "function(x){return '平台：'+ x.seriesName+'<br/>'+'事件：'+x.name+'<br/>'+'搜索量：'+ parseInt(x.value/10000) + '万' }"
        ))
    )
    bar.render(file_save_path+'/index.html')


if __name__ == "__main__":
    # 保存位置自定
    init2db('C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts')
    # 每分钟抓一次
    while True:
        time.sleep(600)
        init2db('C:/Users/Canner/Desktop/BaiduAndSinaHotSearch2pyecharts')
        pass