一、准备工作
1.1 环境要求
- PHP 7.4+
- cURL扩展
- DOM扩展(用于HTML解析)
- PDO(pdo_mysql)与 JSON 扩展(第四节的数据存储和接口响应解析需要)
- 可选:Guzzle HTTP客户端库
1.2 淘宝反爬机制认知
- 动态cookie验证
- 请求频率限制
- 用户行为检测
- 数据加密渲染
二、基础爬取方案
2.1 模拟浏览器请求
PHP 示例代码:
/**
 * Fetch one page of the review feed for a Taobao item.
 *
 * Sends a GET request to rate.taobao.com/feedRateList.htm with browser-like
 * headers and a cookie jar persisted in taobao_cookies.txt, strips the JSONP
 * wrapper when the body has one, and decodes the JSON payload.
 *
 * @param int|string $itemId Item id, sent as the `auctionNumId` query parameter.
 * @param int        $page   Page number, sent as `currentPageNum`.
 *
 * @return mixed Decoded payload (an associative array for a JSON object), or
 *               null when the request fails or the body is not valid JSON.
 */
function getTaobaoComments($itemId, $page = 1) {
    $url = 'https://rate.taobao.com/feedRateList.htm';
    $params = [
        'auctionNumId' => $itemId,
        'currentPageNum' => $page,
        'pageSize' => 20,
        // Per the original author's note: 1 = all, 2 = positive, 3 = neutral,
        // 4 = negative (not verified against the endpoint).
        'rateType' => 1,
        'orderType' => 'sort_weight', // sort order
    ];
    $headers = [
        'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer: https://item.taobao.com/item.htm?id='.$itemId,
        'Accept: application/json',
    ];

    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL, $url.'?'.http_build_query($params));
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_COOKIEFILE, 'taobao_cookies.txt');
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'taobao_cookies.txt');
    // Keep certificate verification on: the request carries session cookies,
    // so accepting any certificate would expose them to a man-in-the-middle.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    // Without timeouts a stalled connection would block the crawl forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $response = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on transport failure.
    if (!is_string($response) || $response === '') {
        return null;
    }

    // Unwrap a JSONP envelope such as `jsonp123({...})` or a bare `({...})`,
    // tolerating surrounding whitespace and a trailing semicolon. A body that
    // is already plain JSON does not match and is decoded as-is.
    $payload = trim($response);
    if (preg_match('/^[\w$.]*\s*\((.*)\)\s*;?$/s', $payload, $matches) === 1) {
        $payload = $matches[1];
    }

    return json_decode($payload, true);
}
2.2 解析评论数据
PHP 示例代码:
/**
 * Convert the decoded review-feed payload into a flat list of comment records.
 *
 * Fields missing from an entry become null instead of raising warnings, and
 * entries that are not arrays are skipped.
 *
 * @param mixed $rawData Decoded payload; expected to hold a `comments` list.
 *
 * @return array List of records with keys id, author, content, date, rate,
 *               photos (list of absolute URLs) and append (null, or an array
 *               with content and date for a follow-up review).
 */
function parseComments($rawData) {
    if (!is_array($rawData) || !isset($rawData['comments']) || !is_array($rawData['comments'])) {
        return [];
    }

    $comments = [];
    foreach ($rawData['comments'] as $item) {
        if (!is_array($item)) {
            continue;
        }

        $comment = [
            'id' => $item['id'] ?? null,
            'author' => $item['user']['nick'] ?? null,
            'content' => $item['content'] ?? null,
            'date' => $item['date'] ?? null,
            'rate' => $item['rate'] ?? null,
            'photos' => [],
            'append' => null,
        ];

        // Attached photos
        if (isset($item['photos']) && is_array($item['photos'])) {
            foreach ($item['photos'] as $photo) {
                $photoUrl = is_array($photo) ? ($photo['url'] ?? null) : null;
                if (!is_string($photoUrl) || $photoUrl === '') {
                    continue;
                }
                // Only protocol-relative URLs need the scheme added; prefixing
                // an absolute URL would produce "https:https://...".
                $hasScheme = preg_match('#^https?://#i', $photoUrl) === 1;
                $comment['photos'][] = $hasScheme ? $photoUrl : 'https:'.$photoUrl;
            }
        }

        // Follow-up (appended) review
        if (!empty($item['appendComment']) && is_array($item['appendComment'])) {
            $comment['append'] = [
                'content' => $item['appendComment']['content'] ?? null,
                'date' => $item['appendComment']['date'] ?? null,
            ];
        }

        $comments[] = $comment;
    }

    return $comments;
}
三、高级反反爬策略
3.1 动态请求头生成
PHP 示例代码:
/**
 * Build a User-Agent string for a randomly chosen browser and version.
 *
 * Each browser family has its own template. Safari previously fell through to
 * the Firefox template and produced strings such as "Firefox/537.36"; it now
 * gets a Safari template, and its list holds Safari release versions rather
 * than WebKit build numbers.
 *
 * @return string A Chrome, Firefox or Safari desktop User-Agent string.
 */
function generateRandomUserAgent() {
    $browsers = [
        'Chrome' => ['91.0.4472', '92.0.4515', '93.0.4577'],
        'Firefox' => ['89.0', '90.0', '91.0'],
        'Safari' => ['14.0.3', '14.1.1', '14.1.2'],
    ];

    $browser = array_rand($browsers);
    $version = $browsers[$browser][array_rand($browsers[$browser])];

    switch ($browser) {
        case 'Chrome':
            return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'.$version.' Safari/537.36';
        case 'Firefox':
            return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:'.$version.') Gecko/20100101 Firefox/'.$version;
        default:
            // Safari: only the Version/ token varies here; the WebKit token is fixed.
            return 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/'.$version.' Safari/605.1.15';
    }
}
3.2 IP代理池集成
PHP 示例代码:
/**
 * Ask the proxy provider for a proxy endpoint.
 *
 * The provider URL and the credentials below are placeholders to replace with
 * real values. The response is expected to be a JSON object with `ip` and
 * `port` keys.
 *
 * @return array Array with keys ip, port and auth ("user:password").
 *
 * @throws RuntimeException When the provider cannot be reached or its response
 *                          lacks an ip/port pair. Previously this produced a
 *                          proxy with a null ip and port.
 */
function getProxy() {
    // Replace with your own proxy-provider endpoint.
    $proxyApi = 'http://your-proxy-service.com/get';

    // Bound the wait so a dead provider cannot hang the crawl.
    $context = stream_context_create(['http' => ['timeout' => 10]]);
    $body = file_get_contents($proxyApi, false, $context);
    if ($body === false) {
        throw new RuntimeException('Proxy service request failed: '.$proxyApi);
    }

    $proxy = json_decode($body, true);
    if (!is_array($proxy) || empty($proxy['ip']) || empty($proxy['port'])) {
        throw new RuntimeException('Proxy service returned an unusable response');
    }

    return [
        'ip' => $proxy['ip'],
        'port' => $proxy['port'],
        'auth' => 'username:password', // only needed when the proxy requires authentication
    ];
}
/**
 * Perform a GET request through a proxy obtained from getProxy().
 *
 * @param string $url     Target URL.
 * @param array  $headers Request header lines ("Name: value").
 *
 * @return string|false Response body, or false when the transfer fails.
 */
function makeRequestWithProxy($url, $headers) {
    $proxy = getProxy();

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // The headers argument was previously accepted but never sent.
    if (is_array($headers) && $headers !== []) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    }
    // Return the body instead of echoing it and returning true.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_PROXY, $proxy['ip'].':'.$proxy['port']);
    if (!empty($proxy['auth'])) {
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy['auth']);
    }
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    // ...add any further cURL options here

    $response = curl_exec($ch);
    // Release the handle; it was previously left open.
    curl_close($ch);

    return $response;
}
四、数据存储方案
4.1 MySQL存储结构
SQL 建表语句:
-- One row per crawled review. `id` has no AUTO_INCREMENT, so the writer must
-- supply it; saveCommentsToDB() stores the review's own id there, which is what
-- lets a re-crawl hit the primary key instead of inserting a second row.
CREATE TABLE `taobao_comments` (
`id` bigint(20) NOT NULL, -- review id as supplied by the crawler
`item_id` bigint(20) NOT NULL, -- id of the item the review belongs to
`author` varchar(100) DEFAULT NULL, -- reviewer nickname
`content` text, -- review text
`rating` tinyint(1) DEFAULT NULL, -- receives the parsed `rate` value
`comment_date` datetime DEFAULT NULL, -- receives the parsed `date` value
`has_photos` tinyint(1) DEFAULT '0', -- 1 when the review has at least one photo, else 0
`append_content` text, -- follow-up review text, NULL when there is none
`append_date` datetime DEFAULT NULL, -- follow-up review date, NULL when there is none
`crawl_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, -- filled by the database at insert time
PRIMARY KEY (`id`),
KEY `idx_item_id` (`item_id`), -- lookup of all reviews for one item
KEY `idx_date` (`comment_date`) -- lookup / ordering by review date
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- One row per photo attached to a review. No FOREIGN KEY or UNIQUE constraint
-- is declared, so the schema itself neither enforces that `comment_id` exists
-- in taobao_comments nor prevents duplicate (comment_id, url) rows.
CREATE TABLE `comment_photos` (
`id` int(11) NOT NULL AUTO_INCREMENT, -- surrogate key
`comment_id` bigint(20) NOT NULL, -- id of the owning review (taobao_comments.id)
`url` varchar(255) NOT NULL, -- photo URL
`local_path` varchar(255) DEFAULT NULL, -- not written by the visible insert code; presumably for an optional local download - TODO confirm
PRIMARY KEY (`id`),
KEY `idx_comment_id` (`comment_id`) -- lookup of all photos for one review
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
4.2 数据存储实现
PHP 示例代码:
/**
 * Persist one batch of parsed comments and their photo URLs.
 *
 * The batch is written in a single transaction, so a failure leaves no
 * half-saved page behind. Re-saving a comment refreshes its text, photo flag
 * and follow-up review, and does not duplicate its photo rows.
 *
 * @param int|string $itemId   Item the comments belong to.
 * @param array      $comments Records with keys id, author, content, date,
 *                             rate, photos and append.
 *
 * @return void
 *
 * @throws PDOException When the connection or any statement fails.
 */
function saveCommentsToDB($itemId, $comments) {
    if (empty($comments)) {
        return;
    }

    // charset=utf8mb4 matches the table definition; without it the connection
    // charset may differ and emoji in review text get mangled.
    $db = new PDO(
        'mysql:host=localhost;dbname=crawl_data;charset=utf8mb4',
        'username',
        'password',
        [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION] // before PHP 8 failures are silent by default
    );

    // Prepare once and reuse for every row.
    // COALESCE keeps an already stored follow-up review if a later crawl omits it.
    $commentStmt = $db->prepare("INSERT INTO taobao_comments
        (id, item_id, author, content, rating, comment_date, has_photos, append_content, append_date)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON DUPLICATE KEY UPDATE
            content = VALUES(content),
            has_photos = VALUES(has_photos),
            append_content = COALESCE(VALUES(append_content), append_content),
            append_date = COALESCE(VALUES(append_date), append_date)");
    $photoExistsStmt = $db->prepare("SELECT 1 FROM comment_photos WHERE comment_id = ? AND url = ? LIMIT 1");
    $photoStmt = $db->prepare("INSERT INTO comment_photos (comment_id, url) VALUES (?, ?)");

    $db->beginTransaction();
    try {
        foreach ($comments as $comment) {
            $photos = !empty($comment['photos']) ? $comment['photos'] : [];

            // Main review row
            $commentStmt->execute([
                $comment['id'],
                $itemId,
                $comment['author'],
                $comment['content'],
                $comment['rate'],
                $comment['date'],
                $photos ? 1 : 0,
                $comment['append']['content'] ?? null,
                $comment['append']['date'] ?? null,
            ]);

            // Photo rows: skip URLs already stored for this comment so that
            // crawling the same page twice does not duplicate them.
            foreach ($photos as $photoUrl) {
                $photoExistsStmt->execute([$comment['id'], $photoUrl]);
                $alreadyStored = $photoExistsStmt->fetchColumn() !== false;
                $photoExistsStmt->closeCursor();
                if ($alreadyStored) {
                    continue;
                }
                $photoStmt->execute([$comment['id'], $photoUrl]);
                // Optional: download the image locally
                // downloadImage($photoUrl, 'path/to/save/');
            }
        }
        $db->commit();
    } catch (Throwable $e) {
        // Undo the partial batch, then let the caller see the original error.
        $db->rollBack();
        throw $e;
    }
}
五、完整爬取流程
5.1 分页爬取控制
PHP 示例代码:
/**
 * Crawl review pages for an item one by one, saving each page as it arrives.
 *
 * Stops at the first page that has no comments. A page that throws is retried
 * after a one-minute cool-down instead of being skipped straight away; once
 * the retries are used up the crawl moves on to the next page.
 *
 * @param int|string $itemId     Item to crawl.
 * @param int        $maxPage    Highest page number to request.
 * @param int        $maxRetries Extra attempts for a page that throws.
 *
 * @return array All parsed comments collected across the crawled pages.
 */
function crawlAllComments($itemId, $maxPage = 100, $maxRetries = 3) {
    $allComments = [];

    for ($page = 1; $page <= $maxPage; $page++) {
        echo "正在爬取第 {$page} 页...";

        $comments = null;
        for ($attempt = 0; $attempt <= $maxRetries; $attempt++) {
            try {
                $rawData = getTaobaoComments($itemId, $page);
                if (empty($rawData['comments'])) {
                    echo "没有更多评论了\n";
                    return $allComments;
                }
                $comments = parseComments($rawData);
                saveCommentsToDB($itemId, $comments);
                break;
            } catch (Exception $e) {
                echo "第 {$page} 页爬取失败: ".$e->getMessage()."\n";
                sleep(60); // cool down for one minute after a failure
            }
        }

        // Merge only after the retry loop, so a retried page is counted once.
        // Parsed comments are still returned when only the save step failed.
        if ($comments !== null) {
            $allComments = array_merge($allComments, $comments);
        }

        // Random delay to avoid being blocked; pointless after the last page.
        if ($page < $maxPage) {
            sleep(rand(3, 10));
        }
    }

    return $allComments;
}
六、异常处理与日志
6.1 错误处理增强
PHP 示例代码:
/**
 * Fetch and parse one page of comments, logging failures instead of throwing.
 *
 * @param int|string $itemId Item to crawl.
 * @param int        $page   Page number to request.
 *
 * @return array|false Parsed comments, or false when the request failed, the
 *                     response could not be decoded, or it carried an `error`.
 */
function safeCrawl($itemId, $page) {
    try {
        $rawData = getTaobaoComments($itemId, $page);

        // A null or scalar result means the fetch or JSON decode failed.
        // Report it as a failure rather than as an empty page.
        if (!is_array($rawData)) {
            logError("响应无法解析 - 商品ID: {$itemId} 页码: {$page}");
            return false;
        }

        if (isset($rawData['error'])) {
            // The error field may be structured; serialise it rather than
            // concatenating an array.
            $detail = is_scalar($rawData['error'])
                ? $rawData['error']
                : json_encode($rawData['error'], JSON_UNESCAPED_UNICODE);
            logError("API返回错误: ".$detail);
            return false;
        }

        return parseComments($rawData);
    } catch (Exception $e) {
        logError("爬取失败: ".$e->getMessage()." - 商品ID: {$itemId} 页码: {$page}");
        return false;
    }
}
/**
 * Append a timestamped line to crawl_errors.log in the working directory.
 *
 * @param string $message Text to record after the "[Y-m-d H:i:s]" prefix.
 *
 * @return void
 */
function logError($message) {
    $timestamp = date('[Y-m-d H:i:s]');
    $entry = sprintf('%s %s%s', $timestamp, $message, PHP_EOL);
    file_put_contents('crawl_errors.log', $entry, FILE_APPEND);
}
七、优化建议
- 分布式爬取:使用消息队列分发任务到多台服务器
- 验证码处理:集成打码平台自动识别验证码
- 增量爬取:记录最后爬取时间,只获取新评论
- 数据去重:使用Redis存储已爬取的评论ID
- 浏览器自动化:对JavaScript渲染的页面使用Selenium
八、法律声明
- 本代码仅用于学习研究,请勿用于商业用途
- 淘宝用户协议禁止未经授权的数据抓取
- 爬取频率过高可能导致IP被封禁
- 建议控制爬取速度(每秒不超过1次请求)