在电商数据分析领域,商品评论作为用户反馈的核心载体,蕴含着消费偏好、产品缺陷、服务体验等关键信息。京东作为国内领先的电商平台,其评论数据的深度挖掘对商家优化产品策略、提升服务质量具有重要意义。本文将系统介绍京东评论数据的抓取、存储与分析全流程,并提供可落地的技术实现方案。
一、京东评论数据抓取技术实现
京东评论数据采用动态加载机制,需通过分析接口参数实现批量获取。我们以 Python 作为开发语言,结合 Requests 库与 JSON 解析技术完成数据抓取。
1. 接口分析与参数构造
京东商品评论接口为 `https://club.jd.com/comment/productPageComments.action`,核心参数包括:
- `productId`:商品 ID
- `score`:评分筛选(0-5)
- `page`:页码(从 0 开始)
- `pageSize`:每页数据量
- `isShadowSku`:是否包含影子商品
2. 爬虫实现代码
python
import requests
import json
import time
from fake_useragent import UserAgent
class JDCommentSpider:
    """Crawl product comments from JD's paginated JSON comment endpoint.

    The spider requests one page at a time, keeps a fixed subset of the
    fields of every comment and returns them as plain dictionaries.
    """

    def __init__(self, product_id):
        """Prepare the endpoint URL and request headers for *product_id*."""
        self.product_id = product_id
        self.base_url = "https://club.jd.com/comment/productPageComments.action"
        self.ua = UserAgent()
        self.headers = {
            "User-Agent": self.ua.random,
            "Referer": f"https://item.jd.com/{product_id}.html",
            "Accept": "application/json, text/plain, */*",
        }

    def get_comments(self, page=0, score=0, page_size=10):
        """Fetch one page of comments.

        Args:
            page: Zero-based page index.
            score: Value sent as the ``score`` filter parameter.
            page_size: Number of comments requested per page.

        Returns:
            The decoded JSON payload, or ``None`` when the request fails,
            the status code is not 200, or the body is not valid JSON.
        """
        params = {
            "productId": self.product_id,
            "score": score,
            "sortType": 5,
            "page": page,
            "pageSize": page_size,
            "isShadowSku": 0,
            "fold": 1,
        }
        try:
            response = requests.get(
                self.base_url,
                params=params,
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as e:
            print(f"获取评论异常:{str(e)}")
            return None

        if response.status_code != 200:
            print(f"请求失败:{response.status_code}")
            return None

        try:
            return response.json()
        except ValueError as e:
            # A 200 response with an empty or non-JSON body ends up here.
            print(f"获取评论异常:{str(e)}")
            return None

    @staticmethod
    def _extract_comment(comment):
        """Return the subset of fields kept for one raw comment dict.

        Missing keys fall back to a neutral default instead of raising, so
        one incomplete record cannot abort the whole crawl.
        """
        return {
            "id": comment.get("id"),
            "content": comment.get("content", ""),
            "creationTime": comment.get("creationTime"),
            "score": comment.get("score"),
            "nickname": comment.get("nickname", ""),
            "productColor": comment.get("productColor", ""),
            "productSize": comment.get("productSize", ""),
            "usefulVoteCount": comment.get("usefulVoteCount", 0),
        }

    def crawl_all_comments(self, max_pages=10, score=0, page_size=10, delay=1):
        """Crawl up to *max_pages* pages and return the collected comments.

        Crawling stops early when a page cannot be fetched or contains no
        comments.

        Args:
            max_pages: Upper bound on the number of pages requested.
            score: Forwarded to :meth:`get_comments` as the score filter.
            page_size: Forwarded to :meth:`get_comments`.
            delay: Seconds to wait between two consecutive requests.

        Returns:
            A list of dictionaries as produced by ``_extract_comment``.
        """
        all_comments = []
        for page in range(max_pages):
            # Throttle between requests only; no wait before the first page
            # and none after the last one.
            if page and delay:
                time.sleep(delay)
            print(f"正在抓取第{page+1}页评论...")
            data = self.get_comments(page=page, score=score, page_size=page_size)
            if not isinstance(data, dict):
                break
            comments = data.get("comments")
            if not comments:
                break
            all_comments.extend(self._extract_comment(item) for item in comments)
        return all_comments
# Usage example: crawl the first five comment pages of one product.
if __name__ == "__main__":
    # Swap in a real JD product ID before running.
    spider = JDCommentSpider("100012345678")
    comments = spider.crawl_all_comments(5)
    print(f"共抓取{len(comments)}条评论")
3. 反爬策略应对
- 使用随机 User-Agent 避免被识别为爬虫
- 设置请求间隔(1-2 秒)降低服务器压力
- 采用 IP 代理池(可选亿牛云代理)解决 IP 封禁问题
- 解析动态加载的 JSON 数据而非 HTML 页面
二、评论数据存储方案
抓取的评论数据需进行结构化存储,以便后续分析。我们提供两种主流存储方案:
1. MongoDB 存储(非结构化数据)
python
from pymongo import MongoClient
class CommentStorage:
    """Store and query crawled comments in a MongoDB collection."""

    def __init__(
        self,
        uri="mongodb://localhost:27017/",
        db_name="jd_comments",
        collection_name="product_comments",
    ):
        """Connect to MongoDB and select the target collection.

        The defaults reproduce the previously hard-coded local settings.
        """
        self.client = MongoClient(uri)
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]

    def save_comments(self, comments):
        """Insert *comments* in one batch and return how many were stored.

        Returns 0 without touching the database when *comments* is empty
        or ``None``.
        """
        if not comments:
            return 0
        # insert_many() adds an "_id" key to every dict it is handed. Insert
        # shallow copies so the caller's records stay untouched for later
        # steps that reuse the same list.
        documents = [dict(comment) for comment in comments]
        result = self.collection.insert_many(documents)
        return len(result.inserted_ids)

    def get_comments_by_score(self, score):
        """Return every stored comment whose ``score`` equals *score*."""
        return list(self.collection.find({"score": score}))

    def close(self):
        """Close the underlying MongoDB client."""
        self.client.close()
# Storage example: open a CommentStorage with its default connection
# settings and insert the crawled comments in a single batch.
# NOTE(review): `comments` is presumably the list built by the crawler
# example earlier in the article — confirm it is in scope when run.
storage = CommentStorage()
storage.save_comments(comments)
2. MySQL 存储(结构化数据)
python
import pymysql
class MySQLStorage:
    """Persist crawled comments in a MySQL ``comments`` table.

    Instances can be used as context managers so the cursor and the
    connection are released when the block exits.
    """

    def __init__(
        self,
        host="localhost",
        user="root",
        password="password",
        database="jd_comments",
    ):
        """Open the connection and make sure the ``comments`` table exists.

        The defaults reproduce the previously hard-coded settings; pass real
        credentials instead of relying on them.
        """
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            database=database,
            charset="utf8mb4",
        )
        self.cursor = self.conn.cursor()
        self._create_table()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the cursor and the connection."""
        self.cursor.close()
        self.conn.close()

    def _create_table(self):
        """Create the ``comments`` table if it does not exist yet."""
        sql = """
        CREATE TABLE IF NOT EXISTS comments (
            id VARCHAR(50) PRIMARY KEY,
            content TEXT,
            creation_time DATETIME,
            score TINYINT,
            nickname VARCHAR(50),
            product_color VARCHAR(50),
            product_size VARCHAR(50),
            useful_vote_count INT
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        self.cursor.execute(sql)
        self.conn.commit()

    @staticmethod
    def _to_row(comment):
        """Map a comment dict to the column order of the INSERT statement.

        ``productColor`` and ``productSize`` are optional and default to an
        empty string; the remaining keys are required.
        """
        return (
            comment["id"],
            comment["content"],
            comment["creationTime"],
            comment["score"],
            comment["nickname"],
            comment.get("productColor", ""),
            comment.get("productSize", ""),
            comment["usefulVoteCount"],
        )

    def save_comment(self, comment):
        """Insert one comment, refreshing ``content`` if the id already exists.

        Returns:
            ``True`` when the row was committed, ``False`` when the insert
            failed and the transaction was rolled back.
        """
        sql = """
        INSERT INTO comments
        (id, content, creation_time, score, nickname, product_color, product_size, useful_vote_count)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content);
        """
        try:
            # Building the row stays inside the try block so a malformed
            # record is reported like any other storage failure.
            self.cursor.execute(sql, self._to_row(comment))
            self.conn.commit()
        except Exception as e:
            # Deliberate best-effort boundary: report, roll back, carry on.
            print(f"存储失败:{str(e)}")
            self.conn.rollback()
            return False
        return True
三、评论数据分析实践
1. 基础统计分析
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
class CommentAnalyzer:
    """Descriptive statistics and simple charts over crawled comments.

    NOTE(review): the chart titles and labels are Chinese; matplotlib
    presumably needs a CJK-capable font configured to render them — confirm
    in the target environment.
    """

    # Every entry is a single character, so the length filter in
    # _count_words already drops them; kept as the baseline set.
    DEFAULT_STOP_WORDS = frozenset(
        {"的", "了", "是", "我", "也", "都", "很", "在", "有", "就"}
    )
    # Columns the analysis methods read.
    _EXPECTED_COLUMNS = ("content", "creationTime", "score")

    def __init__(self, comments):
        """Load *comments* into a DataFrame and parse ``creationTime``.

        Empty input yields an empty frame that still has the expected
        columns, so the analysis methods return empty results instead of
        raising ``KeyError``.
        """
        self.df = pd.DataFrame(comments)
        if self.df.empty:
            self.df = pd.DataFrame(columns=list(self._EXPECTED_COLUMNS))
        self.df["creationTime"] = pd.to_datetime(self.df["creationTime"])

    def _score_counts(self):
        """Return the number of comments per score, ordered by score."""
        return self.df["score"].value_counts().sort_index()

    def _daily_counts(self):
        """Return the number of comments per calendar day.

        Adds a ``date`` column to ``self.df``.
        """
        self.df["date"] = self.df["creationTime"].dt.date
        return self.df.groupby("date").size()

    @staticmethod
    def _count_words(words, stop_words=None):
        """Count tokens longer than one character that are not stop words.

        Args:
            words: Iterable of tokens.
            stop_words: Extra words to exclude on top of
                ``DEFAULT_STOP_WORDS``.
        """
        excluded = CommentAnalyzer.DEFAULT_STOP_WORDS | set(stop_words or ())
        stripped = (word.strip() for word in words)
        return Counter(
            token for token in stripped if len(token) > 1 and token not in excluded
        )

    def score_distribution(self):
        """Plot the score distribution as a bar chart and return the counts."""
        score_count = self._score_counts()
        if score_count.empty:
            # Nothing to draw.
            return score_count
        plt.figure(figsize=(10, 6))
        score_count.plot(kind="bar", color="skyblue")
        plt.title("京东商品评论评分分布")
        plt.xlabel("评分")
        plt.ylabel("评论数量")
        plt.xticks(rotation=0)
        plt.show()
        return score_count

    def time_trend(self):
        """Plot the number of comments per day and return the daily counts."""
        daily_comments = self._daily_counts()
        if daily_comments.empty:
            # Nothing to draw.
            return daily_comments
        plt.figure(figsize=(12, 6))
        daily_comments.plot(kind="line", marker="o", color="orange")
        plt.title("评论时间发布趋势")
        plt.xlabel("日期")
        plt.ylabel("评论数量")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        return daily_comments

    def keyword_analysis(self, top_n=10, font_path="simhei.ttf", stop_words=None):
        """Show a word cloud of the comment text and return the top words.

        Requires the third-party ``jieba`` and ``wordcloud`` packages.

        Args:
            top_n: Number of ``(word, count)`` pairs to return.
            font_path: Font file handed to ``WordCloud``.
            stop_words: Extra words to exclude from the counts.

        Returns:
            The *top_n* most common ``(word, count)`` pairs, or an empty
            list when no usable word is found.
        """
        import jieba
        from wordcloud import WordCloud

        # Skip missing values and coerce the rest to str before joining.
        texts = self.df["content"].dropna().astype(str)
        all_content = " ".join(texts.tolist())
        word_freq = self._count_words(jieba.lcut(all_content), stop_words)
        if not word_freq:
            # No words to render; skip the word cloud.
            return []

        wordcloud = WordCloud(
            font_path=font_path,
            width=800,
            height=600,
            background_color="white",
        ).generate_from_frequencies(word_freq)
        plt.figure(figsize=(12, 8))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.show()
        return word_freq.most_common(top_n)
# Analysis example: draw both charts, then print the most frequent words.
analyzer = CommentAnalyzer(comments)
analyzer.score_distribution()
analyzer.time_trend()
top_keywords = analyzer.keyword_analysis()
print("评论高频词汇:", top_keywords)
2. 情感分析应用
结合机器学习模型可实现评论情感倾向判断:
from snownlp import SnowNLP
def sentiment_analysis(comment, positive_threshold=0.6, negative_threshold=0.4):
    """Label one comment as positive, negative or neutral.

    Args:
        comment: Comment dict; its ``content`` value is the text to score.
        positive_threshold: Scores above this value are labelled "正面".
        negative_threshold: Scores below this value are labelled "负面".

    Returns:
        A copy of *comment* with two extra keys: ``sentiment`` (the label)
        and ``sentiment_score`` (the SnowNLP score, or ``None`` when there
        is no text to score).
    """
    content = comment.get("content")
    if not content or not str(content).strip():
        # No text to score: report neutral without calling the model.
        return {**comment, "sentiment": "中性", "sentiment_score": None}

    # Read the score once; the label and the stored score both reuse it.
    score = SnowNLP(content).sentiments
    if score > positive_threshold:
        label = "正面"
    elif score < negative_threshold:
        label = "负面"
    else:
        label = "中性"
    return {**comment, "sentiment": label, "sentiment_score": score}
# Sentiment example: score every comment, then tally the labels.
sentiment_comments = list(map(sentiment_analysis, comments))
sentiment_count = Counter(item["sentiment"] for item in sentiment_comments)
print("情感分布:", sentiment_count)
四、技术挑战与解决方案
- 动态数据加载:通过分析 XHR 请求获取真实数据接口
- 数据量大:采用分批抓取 + 异步存储提升效率
- 语义分析精度:结合领域词典优化关键词提取效果
- 数据更新:定时任务(Airflow)实现增量抓取
五、商业应用价值
通过京东评论数据分析,企业可实现:
- 产品缺陷识别:高频负面关键词定位产品问题
- 用户需求挖掘:提取用户潜在需求与改进建议
- 竞品对比分析:多商品评论数据横向对比
- 服务优化方向:从物流、售后等评论维度提升服务质量