通过爬虫看看如何在掘金写出爆款文章

645 阅读 · 4分钟

数据爬取

category.png

前期分析

通过浏览器Network分析出以下API:

  1. 获取首页全部标签

api.juejin.cn/tag_api/v1/…

  2. 子级标签页全部标签

api.juejin.cn/recommend_a…

  3. 获取指定标签页文章列表

api.juejin.cn/recommend_a…

数据爬取

获取首页标签信息

# Fetch the top-level category list from the Juejin index page.
def get_index_categories():
    """Return the list of index-page category dicts from the tag API.

    Raises Exception when the HTTP request does not succeed.
    """
    resp = requests.get(INDEX_TAGS_URL)
    if not resp.ok:
        raise Exception("无法获取到主页标签信息:{0}".format(INDEX_TAGS_URL))

    categories = json.loads(resp.text)["data"]

    # Log a compact "name id" summary of every category.
    summary = ",".join(
        "{0} {1}".format(c["category_name"], c["category_id"]) for c in categories
    )
    logging.info("首页标签:{0}".format(summary))

    return categories

获取子标签页全部标签

# Fetch every sub-tag that belongs to one category.
def get_sub_tags(category):
    """Return the list of tag dicts under *category*.

    category: dict containing at least "category_id" and "category_name".
    Raises Exception when the HTTP request does not succeed.
    """
    data = {
        "cate_id": category["category_id"]
    }

    response = requests.post(url=SUB_TAGS_URL, json=data)
    if not response.ok:
        # BUGFIX: the message was copy-pasted from get_index_categories and
        # wrongly claimed the *index* tags could not be fetched.
        raise Exception("无法获取到子标签信息:{0}, {1}".format(SUB_TAGS_URL, data))

    tags = json.loads(response.text)["data"]
    logging.info(
        "{0}:{1}".format(category["category_name"], ",".join(map(lambda t: t["tag_name"] + " " + t["tag_id"], tags))))

    return tags

根据标签获取文章列表

# Fetch the first page of articles for one (category, tag) pair.
def get_articles(category, tag):
    """Return the article list for *tag* inside *category*.

    Only the first page (PAGE_SIZE items, cursor "0") is requested.
    Raises Exception when the HTTP request does not succeed.
    """
    payload = {
        "id_type": 2,
        "sort_type": 200,
        "cate_id": category["category_id"],
        "tag_id": tag["tag_id"],
        "cursor": "0",
        "limit": PAGE_SIZE,
    }

    response = requests.post(url=ARTICLE_URL, json=payload)
    if not response.ok:
        raise Exception("无法获取到列表标签信息:{0}, {1}".format(ARTICLE_URL, payload))

    articles = json.loads(response.text)["data"]

    # Log every fetched title, one per line.
    titles = "\n".join(a["article_info"]["title"] for a in articles)
    logging.info("{0}-{1}:\n {2}".format(category["category_name"],
                                         tag["tag_name"],
                                         titles))

    return articles

输出文章列表文件

# Crawl all categories/tags and write one CSV row per article.
def output_article_list(result_file_path):
    """Crawl every category, every tag, every article and dump them as CSV.

    result_file_path: destination CSV path; overwritten on each run.
    """
    # PERF: keep the file open for the whole crawl instead of re-opening it
    # in append mode once per article, as the original version did.
    with open(result_file_path, 'w') as result_file:
        # Header row.
        result_file.write("领域,标签,热度,文章标题,浏览数,收藏数,点赞数,评论数,文章链接\n")

        for category in get_index_categories():
            for tag in get_sub_tags(category):
                for article in get_articles(category, tag):
                    info = article["article_info"]
                    article_url = "{0}/{1}".format(ARTICLE_DETAIL_URL, article["article_id"])
                    # Commas are stripped from titles so they cannot break
                    # the comma-separated layout.
                    line = "{0},{1},{2},{3},{4},{5},{6},{7},{8}\n".format(
                        category["category_name"],
                        tag["tag_name"],
                        info["hot_index"],
                        info["title"].replace(",", ""),
                        info["view_count"],
                        info["collect_count"],
                        info["digg_count"],
                        info["comment_count"],
                        article_url)
                    result_file.write(line)

    logging.info("全部爬取完成并输出至{0}!".format(result_file_path))

分析输出词云图和柱状图

# Generate word-cloud images of hot words, grouped by category and by tag.
def output_hot_words(titles_file_path, word_cloud_dir):
    """Read the crawled CSV and emit word-cloud PNGs.

    For every distinct category (column 0) and tag (column 1) two images are
    produced: one weighted by the article hot index and one unweighted.

    titles_file_path: CSV produced by output_article_list.
    word_cloud_dir: parent directory for the "领域" and "标签" sub-folders.
    """
    # Prepare output directories.
    category_dir = "{0}/领域".format(word_cloud_dir)
    tag_dir = "{0}/标签".format(word_cloud_dir)

    if not os.path.isdir(category_dir):
        os.mkdir(category_dir)
        logging.warning("{0}目录不存在,已创建!".format(category_dir))

    if not os.path.isdir(tag_dir):
        os.mkdir(tag_dir)
        logging.warning("{0}标签不存在,已创建!".format(tag_dir))

    # Load article rows, skipping the CSV header line.
    with open(titles_file_path, 'r') as titles_file:
        reader = csv.reader(titles_file)
        next(reader)
        articles = list(reader)

    df = pd.DataFrame(articles)

    # The original category and tag loops were near-identical copies;
    # both now go through one helper.
    _render_word_clouds(df, 0, category_dir, "领域")
    _render_word_clouds(df, 1, tag_dir, "标签")


# Render one weighted and one unweighted word cloud per group value.
def _render_word_clouds(df, group_col, out_dir, label):
    """For each distinct value in *group_col*, segment the titles with jieba
    and save two word-cloud PNGs into *out_dir* (weighted / unweighted)."""
    for name in df.get(group_col).unique():
        with_index_file_path = "{0}/{1}(热度加权).png".format(out_dir, name)
        without_index_file_path = "{0}/{1}(无加权).png".format(out_dir, name)

        rows = df.loc[df[group_col] == name].values
        weighted, plain = [], []
        for row in rows:
            # Weight a title by repeating it hot_index (column 2) times
            # before segmentation; column 3 is the title text.
            weighted.append(row[3] * int(row[2]))
            plain.append(row[3])

        for titles, file_path in ((weighted, with_index_file_path),
                                  (plain, without_index_file_path)):
            segments = jieba.cut("".join(titles), cut_all=True)
            # Drop stop words and words contained in the group name itself.
            words = " ".join(w for w in segments
                             if w.lower() not in EXCLUDE_WORDS
                             and w.lower() not in name.lower())
            WC.generate(words).to_file(file_path)

        logging.info("{0}-{1}热词图已输出至:{2}, {3}.".format(
            label, name, with_index_file_path, without_index_file_path))


# Draw a 2x3 grid of bar charts and save it as one image.
def create_bar(counter_by_hot, counter_by_view,
               counter_by_collect, counter_by_digg,
               counter_by_comment, result_file_path):
    """Render five bar charts (hot / view / collect / digg / comment) into a
    single figure saved to *result_file_path*.

    Each counter is a mapping from group name to an accumulated total.
    """
    plt.rcParams["font.family"] = u"Arial Unicode MS"
    plt.figure(figsize=(45, 30))
    # BUGFIX: subplots_adjust must run AFTER figure() — the original called it
    # first, so the spacing applied to a throwaway implicit figure and was lost.
    plt.subplots_adjust(wspace=0.2, hspace=0.2)

    # One (data, color, title) entry per panel replaces five copy-pasted stanzas.
    panels = [
        (counter_by_hot, 'y', "按热度"),
        (counter_by_view, 'r', "按浏览数"),
        (counter_by_collect, 'c', "按收藏数"),
        (counter_by_digg, 'm', "按点赞数"),
        (counter_by_comment, 'g', "按评论数"),
    ]
    for position, (counter, color, title) in enumerate(panels, start=1):
        axes = plt.subplot(2, 3, position)
        pd.Series(counter).plot(color=color, kind='bar')
        axes.set_title(title, y=0.9)

    # The original trailing plt.sca(p1)..plt.sca(p5) chain only switched the
    # current axes with nothing drawn afterwards, so it is dropped.
    plt.savefig(result_file_path)
    plt.close()  # free the figure so repeated calls do not accumulate memory
    logging.info("{0}输出完成!".format(result_file_path))


# Aggregate per-category and per-tag totals and plot them as bar charts.
def output_category_and_tags(titles_file_path, bar_dir):
    """Sum hot/view/collect/digg/comment counts per category and per tag from
    the crawled CSV, then render one bar-chart image for each grouping.

    titles_file_path: CSV produced by output_article_list.
    bar_dir: output directory for the two PNG files.
    """
    # Prepare output paths.
    category_file_path = "{0}/领域分布.png".format(bar_dir)
    tag_file_path = "{0}/标签分布.png".format(bar_dir)

    if not os.path.isdir(bar_dir):
        os.mkdir(bar_dir)
        logging.warning("{0}目录不存在,已创建!".format(bar_dir))

    # Load article rows, skipping the CSV header line.
    articles = list()
    with open(titles_file_path, 'r') as titles_file:
        lines = csv.reader(titles_file)
        next(lines)
        for line in lines:
            articles.append(line)

    df = pd.DataFrame(articles)

    categories_counter_by_hot = defaultdict(int)
    categories_counter_by_view = defaultdict(int)
    categories_counter_by_collect = defaultdict(int)
    categories_counter_by_digg = defaultdict(int)
    categories_counter_by_comment = defaultdict(int)

    tags_counter_by_hot = defaultdict(int)
    tags_counter_by_view = defaultdict(int)
    tags_counter_by_collect = defaultdict(int)
    tags_counter_by_digg = defaultdict(int)
    tags_counter_by_comment = defaultdict(int)

    # Column layout: 0=category, 1=tag, 2=hot, 4=view, 5=collect, 6=digg, 7=comment.
    for _, v in df.iterrows():

        # Sum per category.
        # BUGFIX: digg and comment previously accumulated from the *collect*
        # counter (copy-paste error), producing wrong totals in the charts.
        categories_counter_by_hot[v[0]] += int(v[2])
        categories_counter_by_view[v[0]] += int(v[4])
        categories_counter_by_collect[v[0]] += int(v[5])
        categories_counter_by_digg[v[0]] += int(v[6])
        categories_counter_by_comment[v[0]] += int(v[7])

        # Sum per tag (same copy-paste bug fixed here too).
        tags_counter_by_hot[v[1]] += int(v[2])
        tags_counter_by_view[v[1]] += int(v[4])
        tags_counter_by_collect[v[1]] += int(v[5])
        tags_counter_by_digg[v[1]] += int(v[6])
        tags_counter_by_comment[v[1]] += int(v[7])

    # Category distribution chart.
    create_bar(counter_by_hot=categories_counter_by_hot,
               counter_by_view=categories_counter_by_view,
               counter_by_collect=categories_counter_by_collect,
               counter_by_digg=categories_counter_by_digg,
               counter_by_comment=categories_counter_by_comment,
               result_file_path=category_file_path)

    # Tag distribution chart.
    create_bar(counter_by_hot=tags_counter_by_hot,
               counter_by_view=tags_counter_by_view,
               counter_by_collect=tags_counter_by_collect,
               counter_by_digg=tags_counter_by_digg,
               counter_by_comment=tags_counter_by_comment,
               result_file_path=tag_file_path)

分析

领域-Android

按掘金热度加权

Android(热度加权).png

不加权

Android(无加权).png

标签-大数据

按掘金热度加权

大数据(热度加权).png

不加权

大数据(无加权).png

领域分布

领域分布.png

标签分布

标签分布.png

结论

  1. 掘金目前的主推方向还是前端
  2. 大家关注比较多的是面试、算法相关文章
  3. 掘金还是比较注重流量扶持的,会给很多文章流量热度,有助于更多新人入驻和成长
  4. 其他结论可以通过各标签和领域的词云图深入分析