Scraping the Upvoter List of a Specific Zhihu Answer with Python 3

Packet Capture Analysis

Let's take 「如何看待《万万没想到西游篇》大电影在知乎上被众多大神唾弃的现象? - 张兆杰的回答」 as the example.

Capturing the traffic in Chrome shows that clicking the 「等人赞同」 (upvoter count) link makes the browser send a GET request to the following URL:

https://www.zhihu.com/answer/26236952/voters_profile

26236952 is the aid of this answer:
(screenshot: aid.png)

The returned JSON contains this field:

"next": "/answer/26236952/voters_profile?total=808&offset=10&follows=NYRY--TZ3-l5H4gXp5RoqQVv"

Scroll further down the page and the browser sends another GET request:

https://www.zhihu.com/answer/26236952/voters_profile?total=808&offset=10&follows=NYRY--TZ3-l5H4gXp5RoqQVv

which is exactly the URL that next points to.
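So the crawl just has to keep following next until it comes back empty. A minimal sketch of that loop (assuming session is an already logged-in requests.Session, built as in the full script below; iter_voter_cards is only an illustrative name, and the payload field of each response holds the per-voter HTML snippets shown next):

URL_PREFIX = 'https://www.zhihu.com'

def iter_voter_cards(session, aid):
    """Yield the raw HTML card of every voter, 10 per page, until 'next' runs out."""
    url = '{0}/answer/{1}/voters_profile'.format(URL_PREFIX, aid)
    while True:
        data = session.get(url).json()
        for card in data["payload"]:        # each item is one voter card (an HTML string)
            yield card
        if not data["paging"]["next"]:      # an empty 'next' means this was the last page
            break
        url = URL_PREFIX + data["paging"]["next"]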

The JSON returned by each GET request contains the information of 10 upvoters; a single user's entry looks like this:

<div class="zm-profile-card clearfix no-hovercard">
  <div class="zg-right">
    <button data-follow="m:button" data-id="6b500d547283f3cc186eafff2a8033b4" class="zg-btn zg-btn-follow zm-rich-follow-btn small nth-0">关注她</button>
  </div>
  <a title="薛定谔的喵"
     data-tip="p$t$fang-ting-20-32"
     class="zm-item-link-avatar"
     target="_blank"
     href="/people/fang-ting-20-32">
    <img src="https://pic1.zhimg.com/52ae34056536f4088c7667890f14aaa8_m.jpg" class="zm-item-img-avatar">
  </a>
  <div class="body">
    <div class="author ellipsis">
      <a data-tip="p$t$fang-ting-20-32" href="https://www.zhihu.com/people/fang-ting-20-32" target="_blank" class="zg-link" title="薛定谔的喵">薛定谔的喵</a>
      <span class="bio hidden-phone">世界那么大,我来看看!</span>
    </div>
    <ul class="status">
      <li><span>13 赞同</span></li>
      <li><span>6 感谢</span></li>
      <li class="hidden-phone"><a href="/people/fang-ting-20-32/asks" target="_blank">0 提问</a></li>
      <li class="hidden-phone"><a href="/people/fang-ting-20-32/answers" target="_blank">19 回答</a></li>
    </ul>
  </div>
</div>

Since a user with 0 answers necessarily has 0 upvotes and 0 thanks, it is enough to extract only the question and answer counts.

An anonymous user looks like this instead:

<div class="zm-profile-card clearfix no-hovercard">
  <span class="zm-item-link-avatar">
    <img title="匿名用户" class="zm-item-img-avatar" src="https://pic2.zhimg.com/aadd7b895_m.jpg"/>
  </span>
  <div class="body">
    匿名用户
  </div>
</div>
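A minimal parsing sketch for a single card like the two above (parse_voter and the dict keys are only illustrative names; the full script below does the same job in getVoterInfo):

import re
from bs4 import BeautifulSoup

re_question = re.compile(r'>(\d+)\s*提问<')
re_answer = re.compile(r'>(\d+)\s*回答<')

def parse_voter(card_html):
    # Anonymous cards carry no name, counts or homepage, so return None for them
    if 'img title="匿名用户"' in card_html:
        return None
    soup = BeautifulSoup(card_html, "html.parser")
    link = soup.find("a", class_="zg-link")
    return {
        "name": link["title"],
        "homepage": link["href"],
        "avatar": soup.find("img", class_="zm-item-img-avatar")["src"],
        "questions": int(re_question.search(card_html).group(1)),
        "answers": int(re_answer.search(card_html).group(1)),
    }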

Usage Notes

1. You have to find the answer's aid yourself in the page source.
2. To get the complete results, uncomment the save line below (it appears in getVoters):
    # To see the full results, save them to a CSV file and open it in Excel
    # save(infos, aid)
3. On first use, run ZhiHuClient().login(username, password) once to log in; after that it is not needed.
4. The console output looks like this:
    Total upvotes: 822, four-zero users: 182, ratio: 0.22141119221411193

A "four-zero user" is one with 0 upvotes, 0 thanks, 0 questions, and 0 answers. Some Zhihu users are perpetual lurkers who never ask or answer anything, so their stats are also four zeros. A four-zero user is therefore not necessarily a sockpuppet account. Besides, sockpuppet accounts these days are usually groomed for a while to look like normal accounts before being used, which makes them hard to identify.
(screenshot: 四零用户.png)

Room for improvement: some later testing showed that the GET request URL can be simplified:

https://www.zhihu.com/answer/26236952/voters_profile?offset=10

Changing only the offset value is enough to fetch each page, so the crawl could be parallelized across multiple threads.
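A rough multithreaded sketch along those lines (fetch_page and fetch_all are illustrative names; it assumes session is a logged-in requests.Session and that the total vote count is already known, e.g. read from the total= parameter of the first response's next URL):

from concurrent.futures import ThreadPoolExecutor

def fetch_page(session, aid, offset):
    url = 'https://www.zhihu.com/answer/{0}/voters_profile?offset={1}'.format(aid, offset)
    return session.get(url).json()["payload"]           # 10 voter cards per page

def fetch_all(session, aid, total, workers=4):
    offsets = range(0, total, 10)                       # one request per block of 10 voters
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pages = list(pool.map(lambda off: fetch_page(session, aid, off), offsets))
    return [card for page in pages for card in page]    # flatten back into one list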


Script Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: loveNight
# @Date: 2015-12-17 00:52:27
# @Last Modified by: loveNight
# @Last Modified time: 2015-12-17 02:50:20
import requests
import time
import json
import os
import re
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):

    """Helper class for talking to Zhihu; maintains a single Session.
    2015.11.11

    Usage:

    client = ZhiHuClient()

    # On first use, call this method once to log in and generate the cookie file.
    # After that this step can be skipped.
    client.login("username", "password")

    # Use this session for any further requests; see the requests library for details.
    session = client.getSession()
    """


    # The URL parameter is the account type
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"
    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        os.chdir(sys.path[0])  # use the script's directory as the working directory

        self.__session = requests.Session()
        self.__session.headers = self.headers  # accessed via self so a future class rename won't break this
        # If a cookie file already exists, log in with it directly
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("Cookie file found, logging in with the saved cookie")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            print("Logged in as: %s" % soup.find("span", class_="name").getText())
        else:
            print("No cookie file found, please call the login method once!")

    # Log in
    def login(self, username, password):
        """
        On a wrong captcha the server returns:
        {'errcode': 1991829, 'r': 1, 'data': {'captcha': '请提交正确的验证码 :('}, 'msg': '请提交正确的验证码 :('}
        On success it returns:
        {'r': 0, 'msg': '登陆成功'}
        """

        self.__username = username
        self.__password = password
        self.__loginURL = self.loginURL.format(self.__getUsernameType())
        # Open any page to obtain the _xsrf token needed for logging in
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]
        # Download the captcha image
        while True:
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)
            # Have a human read it
            print("=" * 50)
            print("The captcha image has been opened, please read it!")
            subprocess.call(self.captchaFile, shell=True)
            captcha = input("Enter the captcha: ")
            os.remove(self.captchaFile)
            # Send the POST request
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                self.__getUsernameType(): self.__username,
                "captcha": captcha
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            # print(res.text)  # raw server response, for debugging
            if res.json()["r"] == 0:
                print("Login succeeded")
                self.__saveCookie()
                break
            else:
                print("Login failed")
                print("Error message --->", res.json()["msg"])

    def __getUsernameType(self):
        """Determine the account type.
        Testing shows the site treats an all-digit username as phone_num and anything else as email.
        """

        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the cookies to a file,
        i.e. dump the cookie dict as a string and save it.
        """

        with open(self.cookieFile, "w") as output:
            cookies = self.__session.cookies.get_dict()
            json.dump(cookies, output)
            print("=" * 50)
            print("Cookie file written next to the script:", self.cookieFile)

    def __loadCookie(self):
        """Read the cookie file and return the deserialized dict, or None if the file does not exist."""
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                cookie = json.load(f)
                return cookie
        return None

    def open(self, url, delay=0, timeout=10):
        """Open a URL and return the Response object."""
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        return self.__session


URL_PREFIX = r'https://www.zhihu.com'
re_question = re.compile(r'>(\d+)\s*?提问<')
re_answer = re.compile(r'>(\d+)\s*?回答<')
session = ZhiHuClient().getSession()


def getVoters(aid):
    url = r'https://www.zhihu.com/answer/{aid}/voters_profile'.format(aid=aid)
    infos = []
    while True:
        # print("Fetching:", url)
        data = session.get(url).json()
        url = URL_PREFIX + data["paging"]["next"]
        infos.extend([getVoterInfo(x) for x in data["payload"]])
        if url == URL_PREFIX:  # 'next' was empty, so this was the last page
            break
    # To see the full results, save them to a CSV file and open it in Excel
    # save(infos, aid)
    count40(infos)


def count40(infos):
    total = len(infos)
    count = 0
    for info in infos:
        if info:  # anonymous users are None and are skipped
            if info[1] == info[2] == 0:
                count += 1
    print("Total upvotes: %s, four-zero users: %s, ratio: %s" % (total, count, count / total))


def save(infos, aid):
    result_file = os.path.join(sys.path[0], str(aid) + ".csv")
    with open(result_file, "w", encoding='gbk') as output:
        output.write("name,questions,answers,avatar,homepage\n")
        for info in infos:
            if info:
                info = map(str, info)
                output.write(",".join(info) + "\n")
            else:
                output.write("Anonymous user,,,,\n")
    print("Done, results written to", result_file)


def getVoterInfo(html):
    soup = BS(html, "lxml")
    if 'img title="匿名用户"' not in html:  # anonymous cards fall through and return None
        tmp = soup.find("a", class_="zg-link")
        name = tmp["title"]
        user_url = tmp["href"]
        avatar_url = soup.find("img", class_="zm-item-img-avatar")["src"]
        question_count = int(re_question.findall(html)[0])
        answer_count = int(re_answer.findall(html)[0])
        return name, question_count, answer_count, avatar_url, user_url


if __name__ == '__main__':
    # On first use, call this method once to log in and generate the cookie file.
    # After that this step can be skipped.
    # ZhiHuClient().login(username, password)

    # 如何看待《万万没想到西游篇》大电影在知乎上被众多大神唾弃的现象? - 张兆杰的回答
    # https://www.zhihu.com/question/38490150/answer/77029343
    # getVoters(26236952)

    # 能利用爬虫技术做到哪些很酷很有趣很有用的事情? - Emily L 的回答
    # https://www.zhihu.com/question/27621722/answer/48177710
    # getVoters(14668992)

    # 如何评价杭州女生郭文景被哈佛录取? - 匿名用户的回答
    # https://www.zhihu.com/question/38444099/answer/76550778
    # getVoters(26045034)

    # 行列式的本质是什么? - 童哲的回答
    # https://www.zhihu.com/question/36966326/answer/70687817
    # getVoters(23695740)

    # 为什么男生比女生多,却有很多优质女生找不到男朋友? - Mingo鸣哥的回答
    # https://www.zhihu.com/question/37373471/answer/77068061
    # getVoters(26252564)

    # 什么时候才是开掉「技术合伙人」的最佳时机? - 匿名用户的回答
    # https://www.zhihu.com/question/38531356/answer/76871265
    getVoters(26173592)