A Python Beginner's Scraping Journey: Crawling Interesting Zhihu Topics and Their Detailed Answers


Classic hands-on Python scraping projects are said to include Douban, Zhihu, NetEase Cloud Music, and Qiushibaike. Today we'll tackle Zhihu.

1. Analyzing the Zhihu page

I'm using the Chrome browser. Open the Zhihu home page at www.zhihu.com/, press F12 to open the developer tools, switch to the Network tab, and refresh the page. You'll notice one request whose response is a blob of odd-looking JSON. Copy that JSON into www.json.cn/ to view it; this is exactly the data we want. Also take a look at the request's header section.
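
To confirm what the endpoint returns before writing the full crawler, here is a minimal sketch that fetches the recommend feed once and pretty-prints the JSON. The headers are placeholders (copy the real cookie from DevTools, as described in the next section), and the session_token in the URL is whatever appears in the captured request:

import json
import requests

# Placeholder headers; copy the real User-Agent and cookie from DevTools
headers = {
    "User-Agent": "Mozilla/5.0",
    "cookie": "REPLACE_WITH_YOUR_COOKIE",
}

# session_token is taken from the request captured in the Network tab
probe_url = ("https://www.zhihu.com/api/v3/feed/topstory/recommend"
             "?session_token=REPLACE_ME&desktop=true&page_number=1"
             "&limit=6&action=down&after_id=5")

resp = requests.get(probe_url, headers=headers, timeout=10)
print(resp.status_code)
# Pretty-print the start of the JSON to see its structure:
# data -> [ { target: { question, author, content, ... } } ]
print(json.dumps(resp.json(), ensure_ascii=False, indent=2)[:2000])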

2. Scraping Zhihu's recommended topics

Dependencies

import os
from bs4 import BeautifulSoup
import requests
import json
import sys
import random

Request headers: the cookie is copied directly from the cookie field of the request headers in DevTools.

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'referer': 'https://www.zhihu.com/',
    'cookie': '_zap=5be1e1ef-530e-4b98-a2c5-a74abc65328c; __DAYU_PP=AnJrB73abrJaaAUnRnbqffffffffd4f021478a79; d_c0="ACDgi6Ziaw2PTutstmH1EZlTofQ8yy_sDcI=|1523324808"; _xsrf=vRONRqkfHequ3JiJ6UySgM2Hx5NnU9XO; __utmv=51854390.100--|2=registration_date=20150709=1^3=entry_date=20150709=1; __utma=51854390.1655730157.1544059342.1547004017.1553246228.4; z_c0="2|1:0|10:1561618697|4:z_c0|92:Mi4xZlhQVkFRQUFBQUFBSU9DTHBtSnJEU1lBQUFCZ0FsVk5DYmNCWGdBY3F3SjlTcjdtejdFWXUtQkFZU09JNFI1OVp3|c9c13407693c7aa1191c89b04336502a27175b5410e17829bae2465d8c07bc0f"; tst=r; q_c1=52fdb38e9fc747d599734bd7aad8309f|1573643201000|1522743249000; tgw_l7_route=64ba0a179156dda09fec37a3b2d556ed; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1574772879,1574842181,1574845078,1574845587; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1574845886'
}

Analyzing the URL
First page:

Request URL: https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=79c35cffed3a414440c35037c8d49d2f&desktop=true&page_number=2&limit=6&action=down&after_id=5

Second page:

Request URL: https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=79c35cffed3a414440c35037c8d49d2f&desktop=true&page_number=2&limit=6&action=down&after_id=10

The trailing after_id is the paging parameter: after_id=5 for the first page, after_id=10 for the second, after_id=15 for the third, and so on.
Crawling heavily from your own IP is an easy way to get it banned, so we use proxy IPs here (free ones from www.goubanjia.com/, which only stay valid for a limited time). The request logic is wrapped as follows:

poxy_list=[]

def initPoxy():
    poxy_list.append({"http":"223.111.131.100:8080"})
    poxy_list.append({"http": "218.60.8.99:3129"})
    poxy_list.append({"http": "39.137.107.98:80"})
    poxy_list.append({"http": "118.89.234.236:8787"})
    poxy_list.append({"http": "218.22.7.62:53281"})
    poxy_list.append({"http": "117.57.91.235:9999"})
    poxy_list.append({"http": "110.243.23.117:9999"})

def getZhiHu():
    baseurl = "https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=266ffeaebe639cb2631838b446eccd67&desktop=true&page_number=4&limit=6&action=down&after_id="
    url_list = []
    for num in range(1,3):  # only request the first two pages of recommendations; change as you like
        url_list.append(baseurl + str(num*5))
    for url in url_list:
        regPoxy(url,1)

# Check whether the request through the proxy succeeds; if not, retry with another random proxy.
# num == 1 means crawl the recommended topics, num == 2 means crawl all answers to one question
def regPoxy(url,num):
    response = requests.get(url, headers=headers, proxies=random.choice(poxy_list))
    if response.status_code == 200:
        if num == 1:
            getzhihutitle(response)
        else:
            getZhiHuItemDetail(response)
    else:
        regPoxy(url, num)
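
Free proxies die quickly, so it can help to weed out dead ones before crawling. Below is a minimal sketch (the checkPoxyList name is made up here, not part of the original code) that reuses the poxy_list and headers defined above and keeps only proxies able to reach Zhihu within a few seconds:

def checkPoxyList(timeout=5):
    """Keep only the proxies that can actually reach Zhihu within `timeout` seconds."""
    alive = []
    for poxy in poxy_list:
        try:
            r = requests.get("https://www.zhihu.com", headers=headers,
                             proxies=poxy, timeout=timeout)
            if r.status_code == 200:
                alive.append(poxy)
        except requests.RequestException:
            # dead or unreachable proxy, drop it
            pass
    poxy_list[:] = alive
    print("usable proxies: " + str(len(poxy_list)))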

The request returns JSON data; next we parse it.


# Get the titles of Zhihu's recommended topics
def getzhihutitle(response):

    html = response.text
    dict_json = json.loads(html)
    dit_list = dict_json['data']
    for ditc in dit_list:
        ditTarget = ditc['target']
        # Title: recommended answers carry the question nested inside 'target';
        # other card types only have their own 'title'
        dict_question = ditTarget.get('question')
        if dict_question:
            print(dict_question['title'])
        else:
            print(ditTarget['title'])

        # Answerer
        print('回答者: ' + ditTarget['author']['name'])
        # Answerer's headline
        print('个人签名: ' + ditTarget['author']['headline'])

        if dict_question and dict_question['type'] == 'question':
            # Link to the question's own page
            print('问题详细url:  https://www.zhihu.com/question/' + str(dict_question['id']))

        # ques_id_list.append(dict_question['id'])

        # Answer content
        htmls = BeautifulSoup(ditTarget['content'], "html.parser")
        print(htmls.get_text())
        print('')

The data above is Zhihu's recommended feed: each item comes with a title and one answer.
Next, let's crawl all of the answers under a single question.

3. Scraping all answers to one question

Analysis:
Open the answer page of any question, press F12 to check the Network tab, and refresh the page.

You'll find:

Request URL: https://www.zhihu.com/api/v4/questions/36789686/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=&limit=3&sort_by=default&platform=desktop

The JSON returned in this URL's response is exactly the data we want.

The URL above splits into two parts:
www.zhihu.com/api/v4/ques… + id + a long query string, where id is the dict_question['id'] we extracted above.
Looking through the returned JSON, we find that the link to further answers is in the 'next' attribute.
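
The crawler below follows 'next' recursively through regPoxy; as a quick illustration of the same paging structure, here is a minimal iterative sketch (walkAnswers is a made-up name) that reuses the headers and poxy_list defined earlier and keeps requesting paging['next'] until paging['is_end'] is true:

def walkAnswers(first_url, max_pages=10):
    """Minimal sketch: follow paging['next'] until paging['is_end'] (or max_pages)."""
    url = first_url
    for _ in range(max_pages):
        resp = requests.get(url, headers=headers, proxies=random.choice(poxy_list))
        data = resp.json()
        for answer in data['data']:
            print(answer['author']['name'])
        if data['paging']['is_end']:
            break
        url = data['paging']['next']  # the API returns the URL of the next page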

Getting the detailed answers to one question:

# Crawl a single question
def getZhiHuItemDetail(response):
    global count

    html = response.text
    dict_json = json.loads(html)
    dit_total = dict_json['paging']
    print('总共回答数:' + str(dit_total['totals']))

    dit_list = dict_json['data']
    for dictBean in dit_list:
        question_info = []  # everything we collect for this answer
        if count == 0:
            print('问题: ' + dictBean['question']['title'])
            count = count + 1
        print('回答者:' + dictBean['author']['name'])
        print('个人签名:' + dictBean['author']['headline'])

        question_info.append('回答者:' + dictBean['author']['name'])
        question_info.append('个人签名:' + dictBean['author']['headline'])

        # Answer content
        htmls = BeautifulSoup(dictBean['content'], "html.parser")
        print(htmls.get_text())
        question_info.append(htmls.get_text())
        print('')
        saveQuesInfo(question_info, dictBean['question']['title'], dit_total['totals'])

    is_end = dit_total['is_end']
    if is_end:
        print('.................结束......................')
        # exit()
        startSpider()
    else:
        # getZhiHuItemDetail(dit_total['next'])
        regPoxy(dit_total['next'], 2)

Saving a question's answers to the local machine

# Save the details of one question to a text file named after its title
def saveQuesInfo(question_info, title, totalNum):

    basePath = r"C:/Users/fp/Desktop/zhihu/"
    with open(os.path.join(basePath, "{}.txt".format(title)), "a", encoding='utf-8') as file:
        for ques in question_info:
            file.write(ques)
            file.write('\n')
        file.write('\n')
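
One caveat: question titles often contain characters such as ? or : that Windows forbids in file names, so the open() call above can fail. Here is a minimal sanitizer sketch (the sanitizeTitle name is made up here) that could be applied to title before building the path:

import re

def sanitizeTitle(title, max_len=80):
    """Strip characters Windows forbids in file names and cap the length."""
    safe = re.sub(r'[\\/:*?"<>|]', '_', title)
    return safe[:max_len] or 'untitled'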

The full code:

#!/usr/bin/python3
# _*_ coding: utf-8 _*_
import os
from bs4 import BeautifulSoup
import requests
import json
import sys
import random
sys.setrecursionlimit(10000)

# question ids
ques_id_list =[]

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'referer': 'https://www.zhihu.com/',
    'cookie': '_zap=5be1e1ef-530e-4b98-a2c5-a74abc65328c; __DAYU_PP=AnJrB73abrJaaAUnRnbqffffffffd4f021478a79; d_c0="ACDgi6Ziaw2PTutstmH1EZlTofQ8yy_sDcI=|1523324808"; _xsrf=vRONRqkfHequ3JiJ6UySgM2Hx5NnU9XO; __utmv=51854390.100--|2=registration_date=20150709=1^3=entry_date=20150709=1; __utma=51854390.1655730157.1544059342.1547004017.1553246228.4; z_c0="2|1:0|10:1561618697|4:z_c0|92:Mi4xZlhQVkFRQUFBQUFBSU9DTHBtSnJEU1lBQUFCZ0FsVk5DYmNCWGdBY3F3SjlTcjdtejdFWXUtQkFZU09JNFI1OVp3|c9c13407693c7aa1191c89b04336502a27175b5410e17829bae2465d8c07bc0f"; tst=r; q_c1=52fdb38e9fc747d599734bd7aad8309f|1573643201000|1522743249000; tgw_l7_route=64ba0a179156dda09fec37a3b2d556ed; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1574772879,1574842181,1574845078,1574845587; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1574845886'
}

poxy_list=[]

def initPoxy():
    poxy_list.append({"http":"223.111.131.100:8080"})
    poxy_list.append({"http": "218.60.8.99:3129"})
    poxy_list.append({"http": "39.137.107.98:80"})
    poxy_list.append({"http": "118.89.234.236:8787"})
    poxy_list.append({"http": "218.22.7.62:53281"})
    poxy_list.append({"http": "117.57.91.235:9999"})
    poxy_list.append({"http": "110.243.23.117:9999"})

def getZhiHu():
    baseurl = "https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=266ffeaebe639cb2631838b446eccd67&desktop=true&page_number=4&limit=6&action=down&after_id="
    url_list = []
    for num in range(1,3):
        url_list.append(baseurl + str(num*5))
    for url in url_list:
        regPoxy(url,1)

def regPoxy(url,num):
    response = requests.get(url, headers=headers, proxies=random.choice(poxy_list))
    if response.status_code == 200:
        if num == 1:
            getzhihutitle(response)
        else:
            getZhiHuItemDetail(response)
    else:
        regPoxy(url, num)

# Get the titles of Zhihu's recommended topics
def getzhihutitle(response):

    html = response.text
    dict_json = json.loads(html)
    dit_list = dict_json['data']
    for ditc in dit_list:
        ditTarget = ditc['target']
        # Title: recommended answers carry the question nested inside 'target';
        # other card types only have their own 'title'
        dict_question = ditTarget.get('question')
        if dict_question:
            print(dict_question['title'])
        else:
            print(ditTarget['title'])

        # Answerer
        print('回答者: ' + ditTarget['author']['name'])
        # Answerer's headline
        print('个人签名: ' + ditTarget['author']['headline'])

        if dict_question and dict_question['type'] == 'question':
            # Link to the question's own page
            print('问题详细url:  https://www.zhihu.com/question/' + str(dict_question['id']))

        # ques_id_list.append(dict_question['id'])

        # Answer content
        htmls = BeautifulSoup(ditTarget['content'], "html.parser")
        print(htmls.get_text())
        print('')

base_item_url_start = 'https://www.zhihu.com/api/v4/questions/'
base_item_url_end = '/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset=&limit=3&sort_by=default&platform=desktop'

count = 0
# Crawl a single question
def getZhiHuItemDetail(response):
    global count

    html = response.text
    dict_json = json.loads(html)
    dit_total = dict_json['paging']
    print('总共回答数:' + str(dit_total['totals']))

    dit_list = dict_json['data']
    for dictBean in dit_list:
        question_info = []  # everything we collect for this answer
        if count == 0:
            print('问题: ' + dictBean['question']['title'])
            count = count + 1
        print('回答者:' + dictBean['author']['name'])
        print('个人签名:' + dictBean['author']['headline'])

        question_info.append('回答者:' + dictBean['author']['name'])
        question_info.append('个人签名:' + dictBean['author']['headline'])

        # Answer content
        htmls = BeautifulSoup(dictBean['content'], "html.parser")
        print(htmls.get_text())
        question_info.append(htmls.get_text())
        print('')
        saveQuesInfo(question_info, dictBean['question']['title'], dit_total['totals'])

    is_end = dit_total['is_end']
    if is_end:
        print('.................结束......................')
        # exit()
        startSpider()
    else:
        # getZhiHuItemDetail(dit_total['next'])
        regPoxy(dit_total['next'], 2)

# Save the details of one question to a text file named after its title
def saveQuesInfo(question_info, title, totalNum):

    basePath = r"C:/Users/fp/Desktop/zhihu/"  # directory for the saved questions; change to suit
    with open(os.path.join(basePath, "{}.txt".format(title)), "a", encoding='utf-8') as file:
        for ques in question_info:
            file.write(ques)
            file.write('\n')
        file.write('\n')


# Crawl one specific question
def startSpider():
    global count
    count = 0  # reset so the next question's title gets printed
    id = input('请输入想要查看的问题id:')
    if id != 'exit':
        url = base_item_url_start + id + base_item_url_end
        regPoxy(url, 2)
    else:
        print('.................结束......................')
        exit()

if __name__ == "__main__":
    initPoxy()
    getZhiHu()
    startSpider()

Finally

If Zhihu's recommended topics don't interest you and you want to crawl a specific question to your local machine instead, open that question's page in the browser, copy the question id from the URL, and enter it; the script will then crawl all of that question's answers as well.
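
If you would rather paste the whole question URL instead of just the id, a small helper like the one below (extractQuestionId is a made-up name) could pull the id out with a regular expression:

import re

def extractQuestionId(url):
    """Return the numeric question id from a Zhihu question URL, or None."""
    m = re.search(r'zhihu\.com/question/(\d+)', url)
    return m.group(1) if m else None

# e.g. extractQuestionId('https://www.zhihu.com/question/36789686') -> '36789686'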