import requests
from lxml import etree
import json
import os
import time


def requestd(url):
    # Fetch the raw HTML for the given URL; the Cookie below is a captured session value and may need refreshing.
    t = requests.get(url=url, headers={
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823"
    }).text
    return t


def baidu2hot(url):
    # Parse the Baidu hot-search board: thumbnail, title, and heat index for each entry.
    b = requestd(url)
    html = etree.HTML(b)
    imgs = html.xpath('//a[@class="img-wrapper_29V76"]/img/@src')
    titles = html.xpath('//div[@class="c-single-text-ellipsis"]/text()')
    nums = html.xpath('//div[@class="hot-index_1Bl1a"]/text()')
    big_list = []
    for i, img in enumerate(imgs):
        obj = dict()
        obj['name'] = titles[i].replace(" ", "")
        obj['url'] = 'http://www.baidu.com/s?tn=news&wd=' + obj['name']
        obj['num'] = int(nums[i])
        obj['strnum'] = str(int(obj['num'] / 10000)) + '万'
        obj['imgSrc'] = img
        big_list.append(obj)
    return big_list


def save2json(file_save_path, file_db_list):
    # Save the data to a JSON file.
    # Disable ASCII escaping so Chinese text is written as UTF-8.
    if file_save_path:
        os.chdir(file_save_path)
    with open('test.json', 'w', encoding='utf-8') as F:
        json.dump(file_db_list, F, ensure_ascii=False)


def init2db(file_save_path):
    # Scrape, sort, and persist the hot-search list.
    baidurl = 'https://top.baidu.com/board?tab=realtime'
    try:
        all_list = baidu2hot(baidurl)
        # Sort in descending order by heat index.
        new_list_1 = sorted(all_list, key=lambda item: item['num'], reverse=True)
        print('Total', str(len(new_list_1)), new_list_1)
        # Write out the sorted list for chart display.
        save2json(file_save_path, new_list_1)
    except TypeError as e:
        print('Error', e)


if __name__ == "__main__":
    # The save location is configurable.
    init2db('/var/www/web/static/')
    # Re-scrape every 5 minutes (300 seconds).
    while True:
        time.sleep(300)
        init2db('/var/www/web/static/')
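
# --- Optional variant (a sketch, not part of the original script) ---
# save2json() above changes the process working directory with os.chdir, which
# affects every later relative path in the program. A side-effect-free sketch
# is to build the full target path instead; the helper name save2json_nochdir
# is hypothetical, while the filename 'test.json' and the UTF-8/ensure_ascii
# behaviour match the original.
import json
import os


def save2json_nochdir(file_save_path, file_db_list):
    # Write the list as UTF-8 JSON without changing the working directory.
    target = os.path.join(file_save_path or '.', 'test.json')
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(file_db_list, f, ensure_ascii=False)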