import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import datetime
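# The requests.get calls below send the same headers each time and set no
# timeout, so a stalled connection would hang the script. The helper below is
# an optional hardening sketch, not part of the original flow; its name and
# defaults are assumptions.
def fetch_html(url, headers, timeout=10):
    """Fetch a page, raising on HTTP errors instead of parsing error pages."""
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    return res.text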
 
def get_url():
    """Get the ranks and links of the top-50 hot questions."""
    top_list = []
    url_list = []
    headers = {
        'cookie': 'replace this with your Zhihu login cookie',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'
    }
    home_url = 'https://www.zhihu.com/hot'
    res = requests.get(url=home_url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    for section in soup.find_all("section"):
        # each <section> on the hot page holds one ranked item
        top = int(section.find("div", attrs={"class": "HotItem-rank"}).string)
        top_list.append(top)
        url = section.find("a", attrs={"target": "_blank", "rel": "noopener noreferrer", "data-za-not-track-link": "true"})["href"]
        url_list.append(url)
    return top_list, url_list
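# Usage sketch for get_url(), assuming a valid login cookie has been filled
# in above; the sample values are illustrative, not captured output:
#
#   top_list, url_list = get_url()
#   print(top_list[:3])   # e.g. [1, 2, 3]
#   print(url_list[0])    # e.g. 'https://www.zhihu.com/question/...'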
 
def save_section(link_list):
    """Fetch the detail page of each hot question and save its fields."""
    data_list = []
    headers = {
        'cookie': 'replace this with your Zhihu login cookie',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55',
    }
    for top, url in zip(link_list[0], link_list[1]):
        data = {
            'top': top,
            'url': url,
            'question': None,
            'recommend': None,
            'star': None,
            'watcher': None,
            'date': None,
        }
        print(url)
        print(top)
        res = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        try:
            data['question'] = str(soup.find('h1').string)
            print(data['question'])
        except Exception:  # leave the field as None when the page lacks it
            data['question'] = None
        try:
            # the "good question" button text looks like '好问题 12';
            # strip the label and keep the count
            data['recommend'] = soup.find('button', attrs={'type': "button", "class": "Button GoodQuestionAction-commonBtn Button--plain Button--withIcon Button--withLabel"}).contents[-1]
            print(data['recommend'])
            data['recommend'] = int(re.sub(r'好问题', '', data['recommend']).strip())
            print(data['recommend'])
        except Exception:
            data['recommend'] = None
        try:
            # NumberBoard holds the follower and view counts, e.g. '1,234'
            NumberBoard = soup.find_all('strong', attrs={'class': 'NumberBoard-itemValue'})
            data['star'] = int(str(NumberBoard[0].string).replace(',', ''))
            print('followers', data['star'])
        except Exception:
            data['star'] = None
        try:
            NumberBoard = soup.find_all('strong', attrs={'class': 'NumberBoard-itemValue'})
            data['watcher'] = int(str(NumberBoard[1].string).replace(',', ''))
            print('viewers', data['watcher'])
        except Exception:
            data['watcher'] = None
        try:
            # the question's creation time lives on its /log page
            res_time = requests.get(url=url + '/log', headers=headers)
            soup_time = BeautifulSoup(res_time.text, 'html.parser')
            data['date'] = str(soup_time.find('time').string)
            print(data['date'])
        except Exception:
            data['date'] = None
        data_list.append(data)
    with open('hot_list.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(data_list, indent=4, ensure_ascii=False, separators=(', ', ': ')))
    data_tab = pd.DataFrame(data_list)
    data_tab.to_csv('hot_list.csv', encoding='utf_8_sig')
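# Shape of one record written to hot_list.json by save_section; the values
# here are illustrative placeholders, not real crawl output:
#
#   {
#       "top": 1,
#       "url": "https://www.zhihu.com/question/...",
#       "question": "...",
#       "recommend": 12,
#       "star": 3400,
#       "watcher": 1200000,
#       "date": "2020-11-30 08:00:00"
#   }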
 
def tab_clean():
    """Clean the crawled data: parse dates, fill missing counts, add an age column."""
    with open('hot_list.json', 'r', encoding='utf-8') as f:
        hot_list = json.loads(f.read())
    data_tab = pd.DataFrame(hot_list)
    data_tab['date'] = pd.to_datetime(data_tab['date'], format='%Y-%m-%d %H:%M:%S')
    print(data_tab.dtypes)
    print(data_tab.notnull())
    # fill missing counts with each column's median
    data_tab['recommend'] = data_tab['recommend'].fillna(data_tab['recommend'].median())
    data_tab['star'] = data_tab['star'].fillna(data_tab['star'].median())
    data_tab['watcher'] = data_tab['watcher'].fillna(data_tab['watcher'].median())
    # rows without a creation date cannot be aged, so drop them
    data_tab = data_tab.dropna(subset=['date'])
    # age of each question; the original computed date - now, which is always
    # negative, so the operands are swapped here
    data_tab['update'] = pd.to_datetime(datetime.datetime.now()) - data_tab['date']
    data_tab.to_csv('hot_list2.csv', encoding='utf_8_sig')
    print(data_tab.notnull())
    data_tab = pd.read_csv('hot_list2.csv')
    print(data_tab)
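# If a scraped time string ever deviates from '%Y-%m-%d %H:%M:%S', the
# pd.to_datetime call in tab_clean raises. Passing errors='coerce' instead
# turns bad values into NaT so the dropna(subset=['date']) step discards
# them; shown here as an optional variant, not the original behavior:
#
#   data_tab['date'] = pd.to_datetime(data_tab['date'], errors='coerce')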
 
 
if __name__ == "__main__":
    save_section(get_url())  # crawl first so hot_list.json exists
    tab_clean()
 