| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- import requests
- from lxml import etree
- import json
- import os
- import time
def requestd(url):
    """Fetch *url* and return the response body as text.

    Sends a desktop-Chrome user-agent plus a Baidu session cookie so the
    hot-search board serves its full server-rendered markup.

    :param url: page URL to fetch
    :return: response body decoded as ``str``
    :raises requests.RequestException: on network failure or timeout
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823",
    }
    # timeout added: without one, a stalled socket would hang the caller's
    # polling loop forever
    response = requests.get(url=url, headers=headers, timeout=10)
    return response.text
def baidu2hot(url):
    """Scrape Baidu's realtime hot-search board into a list of dicts.

    Each dict has keys:
      - ``name``: headline with spaces removed
      - ``url``: Baidu news-search link for the headline
      - ``num``: raw view count (int)
      - ``strnum``: view count in units of 10,000, e.g. ``"123万"``
      - ``imgSrc``: thumbnail image URL

    :param url: the hot-search board URL
    :return: list of entry dicts (empty if nothing matched)
    """
    page = requestd(url)
    html = etree.HTML(page)
    imgs = html.xpath('//a[@class="img-wrapper_29V76"]/img/@src')
    titles = html.xpath('//div[@class="c-single-text-ellipsis"]/text()')
    nums = html.xpath('//div[@class="hot-index_1Bl1a"]/text()')
    big_list = []
    # zip stops at the shortest list, so a partial page or shifted markup
    # no longer raises IndexError the way parallel titles[i]/nums[i] did
    for img, title, num in zip(imgs, titles, nums):
        name = title.replace(" ", "")
        count = int(num.strip())
        big_list.append({
            'name': name,
            'url': 'http://www.baidu.com/s?tn=news&wd=' + name,
            'num': count,
            'strnum': str(count // 10000) + '万',
            'imgSrc': img,
        })
    return big_list
def save2json(file_save_path, file_db_list):
    """Serialize *file_db_list* to ``test.json`` under *file_save_path*.

    ``ensure_ascii=False`` keeps the Chinese headlines human-readable in
    the output file (UTF-8 encoded).

    :param file_save_path: target directory; falsy means current directory
    :param file_db_list: JSON-serializable list of hot-search entries
    """
    # Build the target path explicitly instead of os.chdir: chdir mutates
    # process-wide state and silently affects every later relative path.
    target = os.path.join(file_save_path or '', 'test.json')
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(file_db_list, fp, ensure_ascii=False)
def init2db(file_save_path):
    """Fetch the current hot-search list, sort it, print it, and persist it.

    Failures are printed rather than raised so the caller's polling loop
    keeps running across transient network or parsing errors.

    :param file_save_path: directory handed to :func:`save2json`
    """
    baidurl = 'https://top.baidu.com/board?tab=realtime'
    try:
        all_list = baidu2hot(baidurl)
        # descending by view count, hottest first
        new_list_1 = sorted(all_list, key=lambda item: item['num'], reverse=True)
        print('总共', str(len(new_list_1)), new_list_1)
        save2json(file_save_path, new_list_1)
    except Exception as e:
        # Broadened from TypeError: a ConnectionError/Timeout/ValueError here
        # would previously escape and kill the infinite loop in __main__.
        print('报错', e)
- if __name__ == "__main__":
- # 保存位置自定
- init2db('/var/www/web/static/')
- # 每分钟抓一次
- while True:
- time.sleep(300)
- init2db('/var/www/web/static/')
- pass
|