数据爬取
前期分析
通过浏览器Network分析出以下API:
- 获取首页全部标签
- 子级标签页全部标签
- 获取指定标签页文章列表
数据爬取
获取首页标签信息
# 获取首页标签信息
def get_index_categories():
    """Fetch the list of top-level categories from the index-page API.

    Returns the "data" list from the JSON response; each entry is a dict
    carrying at least "category_name" and "category_id".
    Raises Exception when the HTTP request does not succeed.
    """
    response = requests.get(INDEX_TAGS_URL)
    if not response.ok:
        raise Exception("无法获取到主页标签信息:{0}".format(INDEX_TAGS_URL))
    categories = json.loads(response.text)["data"]
    # Log "name id" pairs for every category found on the index page.
    labels = [c["category_name"] + " " + c["category_id"] for c in categories]
    logging.info("首页标签:{0}".format(",".join(labels)))
    return categories
获取子标签页全部标签
# 获取子标签页全部标签
def get_sub_tags(category):
    """Fetch every tag that belongs to one category page.

    Parameters:
        category: dict with at least "category_id" and "category_name".
    Returns:
        The "data" list from the JSON response; each entry carries
        "tag_name" and "tag_id".
    Raises:
        Exception when the HTTP request does not succeed.
    """
    data = {
        "cate_id": category["category_id"]
    }
    response = requests.post(url=SUB_TAGS_URL, json=data)
    if not response.ok:
        # BUG FIX: the original message said "主页标签信息" (index-page tags),
        # a copy-paste from get_index_categories; this call fetches sub-tags.
        raise Exception("无法获取到子级标签信息:{0}, {1}".format(SUB_TAGS_URL, data))
    tags = response.json()["data"]
    logging.info(
        "{0}:{1}".format(category["category_name"],
                         ",".join(t["tag_name"] + " " + t["tag_id"] for t in tags)))
    return tags
根据标签获取文章列表
# 根据标签获取文章列表
def get_articles(category, tag):
    """Fetch the first page of articles for one category/tag pair.

    Returns the "data" list from the JSON response; each entry carries an
    "article_info" dict with title/hot_index/view_count/etc. fields.
    Raises Exception when the HTTP request does not succeed.
    """
    payload = {
        "id_type": 2,
        "sort_type": 200,
        "cate_id": category["category_id"],
        "tag_id": tag["tag_id"],
        "cursor": "0",
        "limit": PAGE_SIZE,
    }
    response = requests.post(url=ARTICLE_URL, json=payload)
    if not response.ok:
        raise Exception("无法获取到列表标签信息:{0}, {1}".format(ARTICLE_URL, payload))
    articles = json.loads(response.text)["data"]
    # Log every fetched title, one per line, under a "category-tag" header.
    titles = "\n".join(a["article_info"]["title"] for a in articles)
    logging.info("{0}-{1}:\n {2}".format(category["category_name"],
                                         tag["tag_name"],
                                         titles))
    return articles
输出文章列表文件
# 输出文章列表至指定文件
# Crawl every category/tag combination and dump the article list as CSV.
def output_article_list(result_file_path):
    """Crawl all categories, their tags and articles, writing one CSV row
    per article to *result_file_path*.

    Columns: category, tag, hot index, title, views, collects, diggs,
    comments, article URL. Commas are stripped from titles so the naive
    comma-joined format stays parseable.
    Raises Exception (from the fetch helpers) on any failed HTTP request.
    """
    # BUG FIX: the original reopened the file in 'a+' mode once per article
    # inside the innermost loop; open it a single time and stream the rows.
    with open(result_file_path, 'w') as result_file:
        # Header row.
        result_file.write("领域,标签,热度,文章标题,浏览数,收藏数,点赞数,评论数,文章链接\n")
        for category in get_index_categories():
            for tag in get_sub_tags(category):
                for article in get_articles(category, tag):
                    info = article["article_info"]
                    article_url = "{0}/{1}".format(ARTICLE_DETAIL_URL, article["article_id"])
                    line = "{cate_name},{tag_name},{hot_index},{title}," \
                           "{view_count},{collect_count},{digg_count}," \
                           "{comment_count},{article_url}\n".format(
                               cate_name=category["category_name"],
                               tag_name=tag["tag_name"],
                               hot_index=info["hot_index"],
                               # strip commas so the title cannot break the CSV row
                               title=info["title"].replace(",", ""),
                               view_count=info["view_count"],
                               collect_count=info["collect_count"],
                               digg_count=info["digg_count"],
                               comment_count=info["comment_count"],
                               article_url=article_url)
                    result_file.write(line)
    logging.info("全部爬取完成并输出至{0}!".format(result_file_path))
分析输出词云图和柱状图
# 根据领域和标签输出词云图
def output_hot_words(titles_file_path, word_cloud_dir):
# 准备目录
category_dir = "{0}/领域".format(word_cloud_dir)
tag_dir = "{0}/标签".format(word_cloud_dir)
if not os.path.isdir(category_dir):
os.mkdir(category_dir)
logging.warning("{0}目录不存在,已创建!".format(category_dir))
if not os.path.isdir(tag_dir):
os.mkdir(tag_dir)
logging.warning("{0}标签不存在,已创建!".format(tag_dir))
# 载入文章列表
articles = list()
with open(titles_file_path, 'r') as titles_file:
lines = csv.reader(titles_file)
next(lines)
for line in lines:
articles.append(line)
# 获取领域和标签列表
df = pd.DataFrame(articles)
categories = df.get(0).unique()
tags = df.get(1).unique()
# 按领域输出热词图
for category in categories:
with_index_file_path = "{0}/{1}(热度加权).png".format(category_dir, category)
without_index_file_path = "{0}/{1}(无加权).png".format(category_dir, category)
titles = df.loc[df[0] == category].values
titles_with_index, titles_without_index = [],[]
for title in titles:
titles_with_index.append(title[3]*int(title[2]))
titles_without_index.append(title[3])
with_index_segments = list(jieba.cut("".join(titles_with_index), cut_all=True))
without_index_segments = list(jieba.cut("".join(titles_without_index), cut_all=True))
result = " ".join(filter(lambda w: w.lower() not in EXCLUDE_WORDS and w.lower() not in category.lower(), with_index_segments))
WC.generate(result).to_file(with_index_file_path)
result = " ".join(filter(lambda w: w.lower() not in EXCLUDE_WORDS and w.lower() not in category.lower(), without_index_segments))
WC.generate(result).to_file(without_index_file_path)
logging.info("领域-{0}热词图已输出至:{1}, {2}.".format(category, with_index_file_path, without_index_file_path))
# 按输出热词图
for tag in tags:
with_index_file_path = "{0}/{1}(热度加权).png".format(tag_dir, tag)
without_index_file_path = "{0}/{1}(无加权).png".format(tag_dir, tag)
titles = df.loc[df[1] == tag].values
titles_with_index, titles_without_index = [],[]
for title in titles:
titles_with_index.append(title[3]*int(title[2]))
titles_without_index.append(title[3])
with_index_segments = list(jieba.cut("".join(titles_with_index), cut_all=True))
without_index_segments = list(jieba.cut("".join(titles_without_index), cut_all=True))
result = " ".join(filter(lambda w: w.lower() not in EXCLUDE_WORDS and w.lower() not in tag.lower(), with_index_segments))
WC.generate(result).to_file(with_index_file_path)
result = " ".join(filter(lambda w: w.lower() not in EXCLUDE_WORDS and w.lower() not in tag.lower(), without_index_segments))
WC.generate(result).to_file(without_index_file_path)
logging.info("标签-{0}热词图已输出至:{1},{2}.".format(tag, with_index_file_path, without_index_file_path))
# 根据数据创建柱状图
def create_bar(counter_by_hot, counter_by_view,
               counter_by_collect, counter_by_digg,
               counter_by_comment, result_file_path):
    """Render five bar charts (hot / view / collect / digg / comment totals)
    into a 2x3 subplot grid and save the figure to *result_file_path*.

    Each counter maps a category or tag name to an integer total.
    """
    plt.rcParams["font.family"] = u"Arial Unicode MS"
    plt.figure(figsize=(45, 30))
    # BUG FIX: subplots_adjust must run after figure() — the original called
    # it first, so the spacing was applied to the previous figure (or an
    # implicit empty one) and had no effect on this chart.
    plt.subplots_adjust(wspace=0.2, hspace=0.2)
    charts = [
        (counter_by_hot, 'y', "按热度"),
        (counter_by_view, 'r', "按浏览数"),
        (counter_by_collect, 'c', "按收藏数"),
        (counter_by_digg, 'm', "按点赞数"),
        (counter_by_comment, 'g', "按评论数"),
    ]
    for position, (counter, color, title) in enumerate(charts, start=1):
        axes = plt.subplot(2, 3, position)
        pd.Series(counter).plot(color=color, kind='bar')
        axes.set_title(title, y=0.9)
    # The original ended with plt.sca(p1)..plt.sca(p5) — dead code (selecting
    # the current axes with no subsequent drawing); removed.
    plt.savefig(result_file_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()
    logging.info("{0}输出完成!".format(result_file_path))
# 根据领域和标签输出柱状图
def output_category_and_tags(titles_file_path, bar_dir):
    """Read the crawled article CSV and write two distribution bar charts
    under *bar_dir*: one grouped by category, one grouped by tag.
    """
    # Prepare the output directory and file paths.
    category_file_path = "{0}/领域分布.png".format(bar_dir)
    tag_file_path = "{0}/标签分布.png".format(bar_dir)
    if not os.path.isdir(bar_dir):
        os.mkdir(bar_dir)
        logging.warning("{0}目录不存在,已创建!".format(bar_dir))
    # Load the article rows, skipping the CSV header.
    articles = list()
    with open(titles_file_path, 'r') as titles_file:
        lines = csv.reader(titles_file)
        next(lines)
        for line in lines:
            articles.append(line)
    df = pd.DataFrame(articles)
    # CSV columns: 0=category, 1=tag, 2=hot, 4=view, 5=collect, 6=digg, 7=comment.
    categories_counter_by_hot = defaultdict(int)
    categories_counter_by_view = defaultdict(int)
    categories_counter_by_collect = defaultdict(int)
    categories_counter_by_digg = defaultdict(int)
    categories_counter_by_comment = defaultdict(int)
    tags_counter_by_hot = defaultdict(int)
    tags_counter_by_view = defaultdict(int)
    tags_counter_by_collect = defaultdict(int)
    tags_counter_by_digg = defaultdict(int)
    tags_counter_by_comment = defaultdict(int)
    for _, row in df.iterrows():
        # Sums per category.
        # BUG FIX: the original accumulated the digg and comment sums into the
        # *_by_collect counters (copy-paste error), so two of the five charts
        # plotted wrong data; each metric now feeds its own counter.
        categories_counter_by_hot[row[0]] += int(row[2])
        categories_counter_by_view[row[0]] += int(row[4])
        categories_counter_by_collect[row[0]] += int(row[5])
        categories_counter_by_digg[row[0]] += int(row[6])
        categories_counter_by_comment[row[0]] += int(row[7])
        # Sums per tag (same fix applied here).
        tags_counter_by_hot[row[1]] += int(row[2])
        tags_counter_by_view[row[1]] += int(row[4])
        tags_counter_by_collect[row[1]] += int(row[5])
        tags_counter_by_digg[row[1]] += int(row[6])
        tags_counter_by_comment[row[1]] += int(row[7])
    # Distribution chart per category.
    create_bar(counter_by_hot=categories_counter_by_hot,
               counter_by_view=categories_counter_by_view,
               counter_by_collect=categories_counter_by_collect,
               counter_by_digg=categories_counter_by_digg,
               counter_by_comment=categories_counter_by_comment,
               result_file_path=category_file_path)
    # Distribution chart per tag.
    create_bar(counter_by_hot=tags_counter_by_hot,
               counter_by_view=tags_counter_by_view,
               counter_by_collect=tags_counter_by_collect,
               counter_by_digg=tags_counter_by_digg,
               counter_by_comment=tags_counter_by_comment,
               result_file_path=tag_file_path)
分析
领域-Android
按掘金热度加权
不加权
标签-大数据
按掘金热度加权
不加权
领域分布
标签分布
结论
- 掘金目前的主推方向还是前端
- 大家关注比较多的是面试、算法相关文章
- 掘金还是比较注重流量扶持的,会给很多文章流量热度,有助于更多新人入驻和成长
- 其他结论可以通过各标签和领域的词云图深入分析