Scraping the Upvoter List of a Specific Zhihu Answer with Python 3

Packet Capture Analysis

Let's take 「如何看待《万万没想到西游篇》大电影在知乎上被众多大神唾弃的现象? - 张兆杰的回答」 as the example.

Capturing the traffic in Chrome shows that clicking the 「等人赞同」 (upvoter count) link makes the browser send a GET request to the following URL:

https://www.zhihu.com/answer/26236952/voters_profile

26236952 is the aid of this answer:
(screenshot: aid.png)

The returned JSON contains this field:

"next": "/answer/26236952/voters_profile?total=808&offset=10&follows=NYRY--TZ3-l5H4gXp5RoqQVv"

Scroll further down the page and the browser sends another GET request:

https://www.zhihu.com/answer/26236952/voters_profile?total=808&offset=10&follows=NYRY--TZ3-l5H4gXp5RoqQVv

which is exactly the URL that next points to.
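So the crawl just has to keep following next until it comes back empty. A minimal sketch of that loop (assuming session is an already logged-in requests.Session, built as in the full script below; iter_voter_cards is only an illustrative name, and the payload field of each response holds the per-voter HTML snippets shown next):

URL_PREFIX = 'https://www.zhihu.com'

def iter_voter_cards(session, aid):
    """Yield the raw HTML card of every voter, 10 per page, until 'next' runs out."""
    url = '{0}/answer/{1}/voters_profile'.format(URL_PREFIX, aid)
    while True:
        data = session.get(url).json()
        for card in data["payload"]:        # each item is one voter card (an HTML string)
            yield card
        if not data["paging"]["next"]:      # an empty 'next' means this was the last page
            break
        url = URL_PREFIX + data["paging"]["next"]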

The JSON returned by each GET request contains the information of 10 upvoters; a single user's entry looks like this:

<div class="zm-profile-card clearfix no-hovercard">
  <div class="zg-right">
    <button data-follow="m:button" data-id="6b500d547283f3cc186eafff2a8033b4" class="zg-btn zg-btn-follow zm-rich-follow-btn small nth-0">关注她</button>
  </div>
  <a title="薛定谔的喵"
     data-tip="p$t$fang-ting-20-32"
     class="zm-item-link-avatar"
     target="_blank"
     href="/people/fang-ting-20-32">
    <img src="https://pic1.zhimg.com/52ae34056536f4088c7667890f14aaa8_m.jpg" class="zm-item-img-avatar">
  </a>
  <div class="body">
    <div class="author ellipsis">
      <a data-tip="p$t$fang-ting-20-32" href="https://www.zhihu.com/people/fang-ting-20-32" target="_blank" class="zg-link" title="薛定谔的喵">薛定谔的喵</a>
      <span class="bio hidden-phone">世界那么大,我来看看!</span>
    </div>
    <ul class="status">
      <li><span>13 赞同</span></li>
      <li><span>6 感谢</span></li>
      <li class="hidden-phone"><a href="/people/fang-ting-20-32/asks" target="_blank">0 提问</a></li>
      <li class="hidden-phone"><a href="/people/fang-ting-20-32/answers" target="_blank">19 回答</a></li>
    </ul>
  </div>
</div>

Since a user with 0 answers necessarily has 0 upvotes and 0 thanks, it is enough to extract only the question and answer counts.

An anonymous user looks like this instead:

<div class="zm-profile-card clearfix no-hovercard">
  <span class="zm-item-link-avatar">
    <img title="匿名用户" class="zm-item-img-avatar" src="https://pic2.zhimg.com/aadd7b895_m.jpg"/>
  </span>
  <div class="body">
    匿名用户
  </div>
</div>
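A minimal parsing sketch for a single card like the two above (parse_voter and the dict keys are only illustrative names; the full script below does the same job in getVoterInfo):

import re
from bs4 import BeautifulSoup

re_question = re.compile(r'>(\d+)\s*提问<')
re_answer = re.compile(r'>(\d+)\s*回答<')

def parse_voter(card_html):
    # Anonymous cards carry no name, counts or homepage, so return None for them
    if 'img title="匿名用户"' in card_html:
        return None
    soup = BeautifulSoup(card_html, "html.parser")
    link = soup.find("a", class_="zg-link")
    return {
        "name": link["title"],
        "homepage": link["href"],
        "avatar": soup.find("img", class_="zm-item-img-avatar")["src"],
        "questions": int(re_question.search(card_html).group(1)),
        "answers": int(re_answer.search(card_html).group(1)),
    }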

Usage Notes

1. You have to find the answer's aid yourself in the page source.
2. To get the complete results, uncomment the save line below (it appears in getVoters):
    # To see the full results, save them to a CSV file and open it in Excel
    # save(infos, aid)
3. On first use, run ZhiHuClient().login(username, password) once to log in; after that it is not needed.
4. The console output looks like this:
    Total upvotes: 822, four-zero users: 182, ratio: 0.22141119221411193

A "four-zero user" is one with 0 upvotes, 0 thanks, 0 questions, and 0 answers. Some Zhihu users are perpetual lurkers who never ask or answer anything, so their stats are also four zeros. A four-zero user is therefore not necessarily a sockpuppet account. Besides, sockpuppet accounts these days are usually groomed for a while to look like normal accounts before being used, which makes them hard to identify.
(screenshot: 四零用户.png)

Room for improvement: some later testing showed that the GET request URL can be simplified:

https://www.zhihu.com/answer/26236952/voters_profile?offset=10

Changing only the offset value is enough to fetch each page, so the crawl could be parallelized across multiple threads.
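A rough multithreaded sketch along those lines (fetch_page and fetch_all are illustrative names; it assumes session is a logged-in requests.Session and that the total vote count is already known, e.g. read from the total= parameter of the first response's next URL):

from concurrent.futures import ThreadPoolExecutor

def fetch_page(session, aid, offset):
    url = 'https://www.zhihu.com/answer/{0}/voters_profile?offset={1}'.format(aid, offset)
    return session.get(url).json()["payload"]           # 10 voter cards per page

def fetch_all(session, aid, total, workers=4):
    offsets = range(0, total, 10)                       # one request per block of 10 voters
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pages = list(pool.map(lambda off: fetch_page(session, aid, off), offsets))
    return [card for page in pages for card in page]    # flatten back into one list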


Script Source

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: loveNight
# @Date: 2015-12-17 00:52:27
# @Last Modified by: loveNight
# @Last Modified time: 2015-12-17 02:50:20
import requests
import time
import json
import os
import re
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):

    """Helper class for talking to Zhihu; maintains a single Session.
    2015.11.11

    Usage:

    client = ZhiHuClient()

    # On first use, call this method once to log in and generate the cookie file.
    # After that this step can be skipped.
    client.login("username", "password")

    # Use this session for any further requests; see the requests library for details.
    session = client.getSession()
    """


    # The URL parameter is the account type
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"
    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        os.chdir(sys.path[0])  # use the script's directory as the working directory

        self.__session = requests.Session()
        self.__session.headers = self.headers  # accessed via self so a future class rename won't break this
        # If a cookie file already exists, log in with it directly
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("Cookie file found, logging in with the saved cookie")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            print("Logged in as: %s" % soup.find("span", class_="name").getText())
        else:
            print("No cookie file found, please call the login method once!")

    # Log in
    def login(self, username, password):
        """
        On a wrong captcha the server returns:
        {'errcode': 1991829, 'r': 1, 'data': {'captcha': '请提交正确的验证码 :('}, 'msg': '请提交正确的验证码 :('}
        On success it returns:
        {'r': 0, 'msg': '登陆成功'}
        """

        self.__username = username
        self.__password = password
        self.__loginURL = self.loginURL.format(self.__getUsernameType())
        # Open any page to obtain the _xsrf token needed for logging in
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]
        # Download the captcha image
        while True:
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)
            # Have a human read it
            print("=" * 50)
            print("The captcha image has been opened, please read it!")
            subprocess.call(self.captchaFile, shell=True)
            captcha = input("Enter the captcha: ")
            os.remove(self.captchaFile)
            # Send the POST request
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                self.__getUsernameType(): self.__username,
                "captcha": captcha
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            # print(res.text)  # raw server response, for debugging
            if res.json()["r"] == 0:
                print("Login succeeded")
                self.__saveCookie()
                break
            else:
                print("Login failed")
                print("Error message --->", res.json()["msg"])

    def __getUsernameType(self):
        """Determine the account type.
        Testing shows the site treats an all-digit username as phone_num and anything else as email.
        """

        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the cookies to a file,
        i.e. dump the cookie dict as a string and save it.
        """

        with open(self.cookieFile, "w") as output:
            cookies = self.__session.cookies.get_dict()
            json.dump(cookies, output)
            print("=" * 50)
            print("Cookie file written next to the script:", self.cookieFile)

    def __loadCookie(self):
        """Read the cookie file and return the deserialized dict, or None if the file does not exist."""
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                cookie = json.load(f)
                return cookie
        return None

    def open(self, url, delay=0, timeout=10):
        """Open a URL and return the Response object."""
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        return self.__session


URL_PREFIX = r'https://www.zhihu.com'
re_question = re.compile(r'>(\d+)\s*?提问<')
re_answer = re.compile(r'>(\d+)\s*?回答<')
session = ZhiHuClient().getSession()


def getVoters(aid):
    url = r'https://www.zhihu.com/answer/{aid}/voters_profile'.format(aid=aid)
    infos = []
    while True:
        # print("Fetching:", url)
        data = session.get(url).json()
        url = URL_PREFIX + data["paging"]["next"]
        infos.extend([getVoterInfo(x) for x in data["payload"]])
        if url == URL_PREFIX:  # 'next' was empty, so this was the last page
            break
    # To see the full results, save them to a CSV file and open it in Excel
    # save(infos, aid)
    count40(infos)


def count40(infos):
    total = len(infos)
    count = 0
    for info in infos:
        if info:  # anonymous users are None and are skipped
            if info[1] == info[2] == 0:
                count += 1
    print("Total upvotes: %s, four-zero users: %s, ratio: %s" % (total, count, count / total))


def save(infos, aid):
    result_file = os.path.join(sys.path[0], str(aid) + ".csv")
    with open(result_file, "w", encoding='gbk') as output:
        output.write("name,questions,answers,avatar,homepage\n")
        for info in infos:
            if info:
                info = map(str, info)
                output.write(",".join(info) + "\n")
            else:
                output.write("Anonymous user,,,,\n")
    print("Done, results written to", result_file)


def getVoterInfo(html):
    soup = BS(html, "lxml")
    if 'img title="匿名用户"' not in html:  # anonymous cards fall through and return None
        tmp = soup.find("a", class_="zg-link")
        name = tmp["title"]
        user_url = tmp["href"]
        avatar_url = soup.find("img", class_="zm-item-img-avatar")["src"]
        question_count = int(re_question.findall(html)[0])
        answer_count = int(re_answer.findall(html)[0])
        return name, question_count, answer_count, avatar_url, user_url


if __name__ == '__main__':
    # On first use, call this method once to log in and generate the cookie file.
    # After that this step can be skipped.
    # ZhiHuClient().login(username, password)

    # 如何看待《万万没想到西游篇》大电影在知乎上被众多大神唾弃的现象? - 张兆杰的回答
    # https://www.zhihu.com/question/38490150/answer/77029343
    # getVoters(26236952)

    # 能利用爬虫技术做到哪些很酷很有趣很有用的事情? - Emily L 的回答
    # https://www.zhihu.com/question/27621722/answer/48177710
    # getVoters(14668992)

    # 如何评价杭州女生郭文景被哈佛录取? - 匿名用户的回答
    # https://www.zhihu.com/question/38444099/answer/76550778
    # getVoters(26045034)

    # 行列式的本质是什么? - 童哲的回答
    # https://www.zhihu.com/question/36966326/answer/70687817
    # getVoters(23695740)

    # 为什么男生比女生多,却有很多优质女生找不到男朋友? - Mingo鸣哥的回答
    # https://www.zhihu.com/question/37373471/answer/77068061
    # getVoters(26252564)

    # 什么时候才是开掉「技术合伙人」的最佳时机? - 匿名用户的回答
    # https://www.zhihu.com/question/38531356/answer/76871265
    getVoters(26173592)