爬取知乎某专栏的数据并进行数据可视化

在存储时间戳数据时，可以转为时间字符串后存入或者不做转换直接存入，此处考虑到之后要根据年进行分组，便没做转换。

需要安装的库

requests
pandas
simplejson
matplotlib
numpy

导入要用的库

import requests
import simplejson as json
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np

分析页面数据来源

F12后查看XHR，找到数据来源的请求

查看请求的header，初步确认请求的参数，以及请求的方法

去除无用的参数

下拉网页加载更多，注意请求的增加，通过分析三次请求传递的参数，确定有效的参数为limit和offset

获取数据

class MYSpider(object):
    """Crawler that downloads a paginated article listing page by page.

    Each successfully fetched page is handed to ``self.parse_onePage`` as
    decoded UTF-8 text.
    """

    # Step between consecutive offsets; it should match the ``limit`` query
    # parameter contained in ``base_url``.
    PAGE_SIZE = 10
    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, base_url, url_headers):
        """Store the request settings.

        Args:
            base_url: URL template with one ``{}`` placeholder that is filled
                with the offset of the page being requested.
            url_headers: HTTP headers sent with every request.
        """
        self.base_url = base_url
        self.headers = url_headers

    def get_onePage(self, start_num):
        """Fetch and parse up to ``start_num`` consecutive pages.

        Offsets start at 0 so that the first page of results is included.
        Crawling stops early at the first response whose status code is
        not 200. A ``start_num`` of zero or less performs no requests.

        Args:
            start_num: Number of pages to download.

        Returns:
            None.
        """
        page_index = 0
        while page_index < start_num:
            url = self.base_url.format(page_index * self.PAGE_SIZE)
            response = requests.get(
                url=url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                # Give up on the remaining pages once the server refuses one.
                return None

            self.parse_onePage(response.content.decode('utf-8'))
            page_index += 1

        return None
                
                
if __name__ == "__main__":

    # Send a browser-like User-Agent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.83 Safari/537.36 Edg/81.0.416.41"
    }
    # Article-list endpoint; the "{}" placeholder receives the offset.
    base_url = 'https://zhuanlan.zhihu.com/api/columns/pythoneer/articles?limit=10&offset={}'

    # Build the crawler and download five pages.
    myspider = MYSpider(base_url=base_url, url_headers=headers)
    myspider.get_onePage(start_num=5)

解析数据

在全局设置一个字典变量用来存储数据

# Module-level store for the scraped articles: one independent, initially
# empty list per column.
data_dict = {column: [] for column in ('name', 'p_date', 'like', 'comment')}

在存储时间戳数据时，可以转为时间字符串后存入或者不做转换直接存入，此处考虑到之后要根据年进行分组，便没做转换。

# 解析数据
    def parse_onePage(self, res):
        """Parse one JSON response body and append its articles to ``data_dict``.

        Args:
            res: JSON text whose top-level ``data`` entry is a sequence of
                article records providing the keys ``title``,
                ``voteup_count``, ``comment_count`` and ``created``.
        """
        payload = json.loads(res)
        articles = payload['data']

        # (column in data_dict, key in the article record), in storage order.
        column_sources = (
            ('name', 'title'),             # article title
            ('like', 'voteup_count'),      # number of likes
            ('comment', 'comment_count'),  # number of comments
            # The creation time is stored unconverted (presumably a Unix
            # timestamp in seconds) so it can be formatted later as needed.
            ('p_date', 'created'),
        )

        for article in articles:
            for column, field in column_sources:
                data_dict[column].append(article[field])

绘制评论数随时间变化的折线图

通过map来修改某列的值，将时间那列的所有时间戳转换为年-月-日格式

    def to_zhe(self, df):
        """Draw a line chart of comment counts against publication date.

        The caller's DataFrame is left untouched, so the same frame can be
        passed to the other plotting methods afterwards.

        Args:
            df: DataFrame with a ``p_date`` column of timestamps accepted by
                ``time.localtime`` and a ``comment`` column.
        """
        # Format the dates on a copy: overwriting df['p_date'] in place would
        # leave strings behind that time.localtime() rejects on the next call.
        plot_df = df.assign(
            p_date=df['p_date'].map(
                lambda x: time.strftime('%Y-%m-%d', time.localtime(x))
            )
        )

        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        ax = plot_df.plot(y='comment', x='p_date', title="文章评论量趋势", figsize=(9, 6))
        # Label the y axis and leave the x axis unlabeled.
        ax.set_ylabel("评论量")
        ax.set_xlabel("")
        # Hide the legend.
        ax.legend().set_visible(False)
        plt.show()

结果

绘制不同年份文章发布量的条形图

需要按年分组，所以将所有时间戳转换为年格式

    # 发布量条形图
    def to_bar(self, df):
        """Draw a bar chart of the number of articles published per year.

        The caller's DataFrame is left untouched, so the same frame can be
        passed to the other plotting methods afterwards.

        Args:
            df: DataFrame with a ``p_date`` column of timestamps accepted by
                ``time.localtime``.
        """
        # Derive the year on a copy: overwriting df['p_date'] in place would
        # leave strings behind that time.localtime() rejects on the next call.
        years = df['p_date'].map(lambda x: time.strftime('%Y', time.localtime(x)))
        year_df = df.assign(p_date=years).groupby('p_date').size().reset_index(name='total')

        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        ax = year_df.plot(x='p_date', y='total', kind='bar', figsize=(9, 6), fontsize=15)
        ax.set_ylabel("文章数")
        ax.set_xlabel("")
        ax.legend().set_visible(False)
        # Write each bar's height at the top of the bar.
        for p in ax.patches:
            ax.annotate(str(p.get_height()), xy=(p.get_x(), p.get_height()))

        plt.show()

结果

绘制赞同数排名前十的文章的横向条形图

列排序取前十

 #获取赞同数排名前十的文章
    def get_top(self, df):
        """Draw a horizontal bar chart of the ten articles with the most likes.

        Args:
            df: DataFrame with ``name`` and ``like`` columns.
        """
        # ascending=False sorts in descending order, so the first ten rows
        # are the ones with the highest like counts.
        ranked = df.sort_values(by=['like'], ascending=False)
        top_liked = ranked[:10]

        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        plot_options = dict(
            x='name',
            y='like',
            kind='barh',
            figsize=(9, 6),
            fontsize=14,
        )
        ax = top_liked.plot(**plot_options)
        ax.set_xlabel("赞同数")
        ax.set_ylabel("")
        ax.legend().set_visible(False)

        plt.show()

结果

绘制评论数与赞同数的散点图

    #评论数与赞同数散点图
    def to_scatter(self, df):
        """Draw likes against comments as a scatter plot with a fitted line.

        Args:
            df: DataFrame with ``comment`` and ``like`` columns.
        """
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels

        # Scatter plot: comments on the x axis, likes on the y axis.
        ax = df.plot(kind="scatter", x='comment', y='like', s=10, figsize=(9, 6), fontsize=15)
        ax.set_xlabel("评论量")
        ax.set_ylabel("赞同数")

        # Degree-1 polynomial fit, drawn as a dashed red line.
        comments = df.comment
        trend = np.poly1d(np.polyfit(comments, df.like, 1))
        plt.plot(comments, trend(comments), "r--")
        plt.show()

结果

生成所有文章标题的词云

需要jieba库和wordcloud库

from wordcloud import WordCloud
import jieba
    
    #文章标题词云
    def to_wordcloud(self, df, font_path=r"c:\windows\Fonts\simhei.ttf"):
        """Render a word cloud built from all article titles.

        Args:
            df: DataFrame with a ``name`` column holding the titles.
            font_path: Path of the font file handed to ``WordCloud``. The
                default is the SimHei font at its usual Windows location;
                pass another path on systems where that file does not exist.
        """
        # Segment every title with jieba and re-join its tokens with spaces
        # before handing the text to WordCloud.
        words = [
            " ".join(jieba.cut(title, cut_all=False))
            for title in df['name']
        ]

        cloud = WordCloud(
            font_path=font_path,
            background_color="white",
            max_words=80,
        ).generate(" ".join(words))

        plt.figure(figsize=(9, 6))
        plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

结果