import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import datetime
def get_url():
    """Fetch the rank and link of each question on the Zhihu top-50 hot list."""
    top_list = []
    url_list = []
    headers = {
        'cookie': 'replace this with your Zhihu login cookie',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'
    }
    home_url = 'https://www.zhihu.com/hot'
    res = requests.get(url=home_url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    for section in soup.find_all("section"):
        # Each hot item is a <section>; its rank sits in a div.HotItem-rank.
        top = int(section.find("div", attrs={"class": "HotItem-rank"}).string)
        top_list.append(top)
        # The question link is the anchor Zhihu marks as not-tracked.
        url = section.find("a", attrs={"target": "_blank",
                                       "rel": "noopener noreferrer",
                                       "data-za-not-track-link": "true"})["href"]
        url_list.append(url)
    return top_list, url_list
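# Usage sketch (not in the original script): preview the rank/URL pairs before
# launching the per-question requests in save_section(). The helper name is
# hypothetical; it assumes a valid login cookie in the headers above and
# network access to zhihu.com.
def preview_hot_links(n=5):
    top_list, url_list = get_url()
    for top, url in zip(top_list[:n], url_list[:n]):
        print(top, url)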
def save_section(link_list):
    """Scrape each hot question's page and save the fields to JSON and CSV."""
    data_list = []
    headers = {
        'cookie': 'replace this with your Zhihu login cookie',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55',
    }
    for top, url in zip(link_list[0], link_list[1]):
        data = {
            'top': top,
            'url': url,
            'question': None,
            'recommend': None,
            'star': None,
            'watcher': None,
            'date': None,
        }
        print(url)
        print(top)
        res = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        try:
            # The question title is the page's <h1>.
            data['question'] = str(soup.find('h1').string)
            print(data['question'])
        except Exception:
            data['question'] = None
        try:
            # "Good question" vote count; strip the Chinese label "好问题".
            data['recommend'] = soup.find('button', attrs={'type': "button",
                "class": "Button GoodQuestionAction-commonBtn Button--plain Button--withIcon Button--withLabel"}).contents[-1]
            print(data['recommend'])
            data['recommend'] = int(re.sub(r'好问题', '', data['recommend']).strip())
            print(data['recommend'])
        except Exception:
            data['recommend'] = None
        try:
            # The two NumberBoard values are followers and views, in that order.
            NumberBoard = soup.find_all('strong', attrs={'class': 'NumberBoard-itemValue'})
            data['star'] = int(str(NumberBoard[0].string).replace(',', ''))
            print('followers', data['star'])
        except Exception:
            data['star'] = None
        try:
            NumberBoard = soup.find_all('strong', attrs={'class': 'NumberBoard-itemValue'})
            data['watcher'] = int(str(NumberBoard[1].string).replace(',', ''))
            print('views', data['watcher'])
        except Exception:
            data['watcher'] = None
        try:
            # The question's creation time is on its /log (edit history) page.
            res_time = requests.get(url=url + '/log', headers=headers)
            soup_time = BeautifulSoup(res_time.text, 'html.parser')
            data['date'] = str(soup_time.find('time').string)
            print(data['date'])
        except Exception:
            data['date'] = None
        data_list.append(data)
    with open('hot_list.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(data_list, indent=4, ensure_ascii=False, separators=(', ', ': ')))
    data_tab = pd.DataFrame(data_list)
    data_tab.to_csv('hot_list.csv', encoding='utf_8_sig')
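# Shape of one record written to hot_list.json / hot_list.csv by save_section().
# The concrete values below are illustrative placeholders, not real scraped data.
EXAMPLE_RECORD = {
    'top': 1,                                      # rank on the hot list
    'url': 'https://www.zhihu.com/question/1111',  # hypothetical question URL
    'question': 'Example question title',          # <h1> text of the page
    'recommend': 42,                               # "好问题" (good question) votes
    'star': 1234,                                  # followers (NumberBoard[0])
    'watcher': 56789,                              # views (NumberBoard[1])
    'date': '2020-12-01 12:00:00',                 # creation time from the /log page
}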
def tab_clean():
    """Load the scraped records, clean them, and write hot_list2.csv."""
    with open('hot_list.json', 'r', encoding='utf-8') as f:
        hot_list = json.loads(f.read())
    data_tab = pd.DataFrame(hot_list)
    data_tab['date'] = pd.to_datetime(data_tab['date'], format='%Y-%m-%d %H:%M:%S')
    print(data_tab.dtypes)
    print(data_tab.notnull())
    # Impute missing numeric fields with the column median.
    data_tab['recommend'] = data_tab['recommend'].fillna(data_tab['recommend'].median())
    data_tab['star'] = data_tab['star'].fillna(data_tab['star'].median())
    data_tab['watcher'] = data_tab['watcher'].fillna(data_tab['watcher'].median())
    # Rows without a creation date cannot be aged, so drop them.
    data_tab = data_tab.dropna(subset=['date'])
    # Age of each question: now minus its creation time.
    data_tab['update'] = pd.Timestamp.now() - data_tab['date']
    data_tab.to_csv('hot_list2.csv', encoding='utf_8_sig')
    print(data_tab.notnull())
    data_tab = pd.read_csv('hot_list2.csv')
    print(data_tab)
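# Minimal standalone illustration (not part of the original pipeline) of the
# median imputation used in tab_clean(): NaNs are replaced by the column median.
def _median_fill_demo():
    s = pd.Series([10, None, 30, None, 50])
    print(s.fillna(s.median()).tolist())  # -> [10.0, 30.0, 30.0, 30.0, 50.0]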
if __name__ == "__main__":
    # Scrape the hot list first, then clean the saved records.
    save_section(get_url())
    tab_clean()