init.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import requests
  2. from lxml import etree
  3. import json
  4. import os
  5. import time
  6. def requestd(url):
  7. # request:
  8. t = requests.get(url=url, headers={
  9. "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
  10. "Cookie": "BAIDUID=B7F548F09EEC3FB817FEE54D9E4FB734:FG=1; BD_NOT_HTTPS=1; PSTM=1589783384; BIDUPSID=B7F548F09EEC3FB817FEE54D9E4FB734; BDSVRTM=10; BD_HOME=1; H_PS_PSSID=1445_31326_21081_31111_31593_31525_31464_31322_30823"
  11. }).text
  12. return t
  13. def baidu2hot(url):
  14. # 百度热搜数据:
  15. b = requestd(url)
  16. html = etree.HTML(b)
  17. imgs = html.xpath('//a[@class="img-wrapper_29V76"]/img/@src')
  18. titles = html.xpath('//div[@class="c-single-text-ellipsis"]/text()')
  19. nums = html.xpath('//div[@class="hot-index_1Bl1a"]/text()')
  20. big_list = []
  21. for i, j in enumerate(imgs):
  22. obj = dict()
  23. obj['name'] = titles[i].replace(" ", "")
  24. obj['url'] = 'http://www.baidu.com/s?tn=news&wd=' + obj['name']
  25. obj['num'] = int(nums[i])
  26. obj['strnum'] = str(int(obj['num']/10000))+'万'
  27. obj['imgSrc'] = j
  28. big_list.append(obj)
  29. return big_list
  30. def save2json(file_save_path, file_db_list):
  31. # 数据存json文件:
  32. # 中文禁用ascii,采用UTF8
  33. if file_save_path:
  34. os.chdir(file_save_path)
  35. with open('test.json', 'w', encoding='utf-8') as F:
  36. json.dump(file_db_list, F, ensure_ascii=False)
  37. pass
  38. def init2db(file_save_path):
  39. # 合并数据list:
  40. baidurl = 'https://top.baidu.com/board?tab=realtime'
  41. try:
  42. # 合并list
  43. all_list = baidu2hot(baidurl)
  44. # 倒序=》根据浏览量排序
  45. new_list_1 = sorted(all_list, key=lambda num: num['num'], reverse=True)
  46. print('总共', str(len(new_list_1)), new_list_1)
  47. # 图表展示
  48. save2json(file_save_path, new_list_1)
  49. except TypeError as e:
  50. print('报错', e)
  51. pass
  52. if __name__ == "__main__":
  53. # 保存位置自定
  54. init2db('C:/Users/Caner/Desktop/MoreFun/static/')
  55. # 每分钟抓一次
  56. while True:
  57. time.sleep(300)
  58. init2db('C:/Users/Caner/Desktop/MoreFun/static/')
  59. pass