1. 准备工作
在开始之前,请确保:
- 你有基本的PHP编程知识
- 服务器已安装PHP(建议7.0+版本)
- 了解HTTP请求和HTML解析基础
2. 分析1688搜索接口
1688的搜索页面是:https://s.1688.com/selloffer/offer_search.htm
通过浏览器开发者工具分析,我们可以发现:
- 搜索请求可以用GET方式发起(参数放在URL查询字符串中),下文示例均使用GET
- 需要传递的关键参数包括:keywords(搜索词)、beginPage(页码)等
3. 基本PHP爬虫实现
方法一:使用file_get_contents和正则表达式
php
<?php
/**
 * Search 1688 offers by downloading the public search page with
 * file_get_contents() and extracting items with a regular expression.
 *
 * @param string $keywords Search phrase; URL-encoded before it is sent.
 * @param int    $page     Result page, sent as the beginPage query parameter.
 *
 * @return array List of ['url' => string, 'title' => string, 'price' => string]
 *               rows (empty when nothing matched), or ['error' => message]
 *               when the page could not be downloaded or parsed.
 */
function search1688($keywords, $page = 1) {
    // Cast the page to int so that nothing but a number can reach the query string.
    $url = "https://s.1688.com/selloffer/offer_search.htm?keywords=" . urlencode($keywords) . "&beginPage=" . (int) $page;

    // Send a browser-like User-Agent with the request.
    $options = [
        'http' => [
            'method' => 'GET',
            'header' => "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\r\n"
        ]
    ];
    $context = stream_context_create($options);

    // Download the page.
    $html = file_get_contents($url, false, $context);
    if ($html === false) {
        return ['error' => '无法获取页面内容'];
    }

    // "~" is used as the delimiter because the pattern itself contains
    // "</span>" and "</div>". With "/" as the delimiter the first "/" inside
    // the pattern ends it, and PCRE rejects the remainder as unknown modifiers.
    $pattern = '~<div class="item">.*?<a href="(.*?)" title="(.*?)">.*?<span class="price">([^<]+)</span>.*?</div>~s';
    $matchCount = preg_match_all($pattern, $html, $matches, PREG_SET_ORDER);
    if ($matchCount === false) {
        // PCRE gave up (for example the backtrack limit was hit on a large page).
        return ['error' => '页面解析失败'];
    }

    $results = [];
    foreach ($matches as $match) {
        $results[] = [
            'url' => $match[1],
            'title' => $match[2],
            'price' => $match[3]
        ];
    }
    return $results;
}
// Usage example: run one search and dump the parsed rows.
$phoneCaseOffers = search1688('手机壳');
print_r($phoneCaseOffers);
?>
方法二:使用cURL和DOM解析(推荐)
php
<?php
/**
 * Search 1688 offers by downloading the public search page with cURL and
 * reading items out of the parsed DOM with XPath.
 *
 * @param string $keywords Search phrase; URL-encoded before it is sent.
 * @param int    $page     Result page, sent as the beginPage query parameter.
 *
 * @return array List of ['title' => string, 'price' => string, 'url' => string]
 *               rows (a field is '' when its element is missing), or
 *               ['error' => message] when the page could not be downloaded.
 */
function search1688WithCurl($keywords, $page = 1) {
    // Cast the page to int so that nothing but a number can reach the query string.
    $url = "https://s.1688.com/selloffer/offer_search.htm?keywords=" . urlencode($keywords) . "&beginPage=" . (int) $page;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
    // cURL sets no overall transfer timeout by default, so bound the wait explicitly.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $html = curl_exec($ch);
    curl_close($ch);

    if (!$html) {
        return ['error' => '无法获取页面内容'];
    }

    // Collect libxml parse warnings internally instead of silencing them with "@";
    // real-world HTML is rarely well-formed.
    $previousLibxmlMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // Without an encoding hint loadHTML() decodes the bytes as ISO-8859-1 and
    // garbles Chinese text. NOTE(review): this assumes the page is UTF-8 - confirm.
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlMode);

    $xpath = new DOMXPath($dom);

    // Every <div> whose class attribute contains "item" is treated as one offer.
    $items = $xpath->query("//div[contains(@class, 'item')]");

    $results = [];
    foreach ($items as $item) {
        $titleElement = $xpath->query(".//a[contains(@class, 'title')]", $item)->item(0);
        $priceElement = $xpath->query(".//span[contains(@class, 'price')]", $item)->item(0);

        $results[] = [
            'title' => $titleElement ? trim($titleElement->nodeValue) : '',
            'price' => $priceElement ? trim($priceElement->nodeValue) : '',
            'url' => $titleElement ? $titleElement->getAttribute('href') : ''
        ];
    }
    return $results;
}
// Usage example: run one search through the cURL/DOM variant and dump the rows.
$phoneCaseOffers = search1688WithCurl('手机壳');
print_r($phoneCaseOffers);
?>
4. 处理反爬机制
1688有反爬机制,可能需要处理:
4.1 设置合理的请求头
php
// Request headers resembling what a desktop browser sends.
// $ch is the cURL handle created by the surrounding request code.
$headers = [
    // Browser identification
    'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    // Accepted content types and languages
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    // Referring page
    'Referer: https://www.1688.com/',
    // Do-Not-Track flag
    'DNT: 1',
];

curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
4.2 使用代理IP
php
// Route the request through a proxy; replace the placeholder with a real "host:port".
$proxyAddress = 'proxy_ip:port';
curl_setopt($ch, CURLOPT_PROXY, $proxyAddress);

// Only needed when the proxy requires authentication ("user:password").
$proxyCredentials = 'username:password';
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxyCredentials);
4.3 限制请求频率
php
// Pause for a random 1-3 seconds between consecutive requests.
$pauseSeconds = rand(1, 3);
sleep($pauseSeconds);
5. 使用API接口(推荐)
如果频繁爬取,建议使用1688官方API(需要申请权限):
php
/**
 * Search 1688 offers through the open-platform gateway instead of scraping.
 *
 * The request is signed by sorting the parameters by key, concatenating every
 * key and value, wrapping that string in the app secret on both sides and
 * taking the upper-cased MD5 of the result.
 *
 * @param string $keywords Search phrase.
 * @param int    $page     Result page, sent as page_no.
 *
 * @return array Decoded JSON response, or ['error' => message] when the
 *               request failed or the body was not valid JSON.
 */
function search1688ViaAPI($keywords, $page = 1) {
    // Placeholders: replace with the credentials issued for your application.
    $appKey = '你的APP_KEY';
    $appSecret = '你的APP_SECRET';

    $params = [
        'method' => 'alibaba.offer.search',
        'app_key' => $appKey,
        'timestamp' => date('Y-m-d H:i:s'),
        'format' => 'json',
        'v' => '2.0',
        'keywords' => $keywords,
        'page_no' => $page
    ];

    // Build the signature over the key-sorted parameters.
    ksort($params);
    $stringToSign = $appSecret;
    foreach ($params as $k => $v) {
        $stringToSign .= $k . $v;
    }
    $stringToSign .= $appSecret;
    $params['sign'] = strtoupper(md5($stringToSign));

    // NOTE(review): the gateway is addressed over plain HTTP, so the app key and
    // signature travel unencrypted - confirm whether an HTTPS endpoint is available.
    $url = 'http://gw.api.1688.com/openapi?' . http_build_query($params);

    $response = file_get_contents($url);
    if ($response === false) {
        // Network or HTTP failure: report it instead of decoding a boolean.
        return ['error' => 'API请求失败'];
    }

    $decoded = json_decode($response, true);
    if (!is_array($decoded)) {
        // The body was not a JSON object or array.
        return ['error' => 'API请求失败'];
    }
    return $decoded;
}
6. 完整示例代码
php
<?php
/**
 * Searches 1688 offers either by scraping the public search page or through
 * the open-platform API, and returns both in one common row shape.
 *
 * A successful call returns a list of
 * ['title' => string, 'price' => string, 'sales' => int, 'url' => string]
 * rows ('sales' from the API is passed through as received); a failed call
 * returns ['error' => message].
 */
class Ali1688Crawler {
    /** @var string Browser-like User-Agent sent with scraping requests. */
    private $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

    /** @var string Search page that is scraped; also the base for resolving relative links. */
    private $searchUrl = 'https://s.1688.com/selloffer/offer_search.htm';

    /** @var int Seconds to pause after each scraping request. */
    private $delay;

    /** @var string Application key used for API requests. */
    private $appKey;

    /** @var string Application secret used to sign API requests. */
    private $appSecret;

    /**
     * All arguments are optional, so `new Ali1688Crawler()` keeps working.
     *
     * @param string $appKey    API application key (placeholder by default).
     * @param string $appSecret API application secret (placeholder by default).
     * @param int    $delay     Seconds to pause after each scraping request.
     */
    public function __construct($appKey = 'your_app_key', $appSecret = 'your_app_secret', $delay = 2) {
        $this->appKey = $appKey;
        $this->appSecret = $appSecret;
        // sleep() does not accept negative values, so clamp at zero.
        $this->delay = max(0, (int) $delay);
    }

    /**
     * Run one search.
     *
     * @param string $keywords Search phrase.
     * @param int    $page     Result page.
     * @param bool   $useApi   True to call the API, false to scrape the search page.
     *
     * @return array Offer rows, or ['error' => message].
     */
    public function search($keywords, $page = 1, $useApi = false) {
        if ($useApi) {
            return $this->searchViaAPI($keywords, $page);
        }
        return $this->searchViaCrawler($keywords, $page);
    }

    /**
     * Download the search page with cURL and read the offers out of the DOM.
     *
     * @param string $keywords Search phrase; URL-encoded before it is sent.
     * @param int    $page     Result page, sent as beginPage.
     *
     * @return array Offer rows, or ['error' => message] when the download failed.
     */
    private function searchViaCrawler($keywords, $page) {
        $url = $this->searchUrl . '?keywords=' . urlencode($keywords) . '&beginPage=' . (int) $page;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        // Certificate verification is deliberately left at cURL's default (enabled);
        // turning it off would let anyone on the network path alter the response.
        // cURL sets no overall transfer timeout by default, so bound the wait explicitly.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $html = curl_exec($ch);
        curl_close($ch);

        if (!$html) {
            return ['error' => '无法获取页面内容'];
        }

        // Collect libxml parse warnings internally instead of silencing them with "@".
        $previousLibxmlMode = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        // The prefix tells libxml to decode the markup as UTF-8; without a hint it
        // assumes ISO-8859-1 and garbles Chinese text. It replaces the
        // mb_convert_encoding(..., 'HTML-ENTITIES', ...) call, which PHP 8.2 deprecates.
        // NOTE(review): this assumes the page is UTF-8 - confirm.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlMode);

        $xpath = new DOMXPath($dom);
        // Every <div> whose class attribute contains "item" is treated as one offer.
        $items = $xpath->query("//div[contains(@class, 'item')]");

        $results = [];
        foreach ($items as $item) {
            $titleElement = $xpath->query(".//a[contains(@class, 'title')]", $item)->item(0);
            $priceElement = $xpath->query(".//span[contains(@class, 'price')]", $item)->item(0);
            $salesElement = $xpath->query(".//span[contains(@class, 'sale-num')]", $item)->item(0);

            $results[] = [
                'title' => $titleElement ? trim($titleElement->nodeValue) : '',
                'price' => $priceElement ? trim($priceElement->nodeValue) : '',
                // Keep only the digits and always return an int.
                // NOTE(review): digits are simply concatenated, so text such as
                // "1.2万" becomes 12.
                'sales' => $salesElement ? (int) preg_replace('/[^0-9]/', '', $salesElement->nodeValue) : 0,
                'url' => $titleElement ? $this->getAbsoluteUrl($titleElement->getAttribute('href')) : ''
            ];
        }

        // Pause after the request to lower the risk of being blocked.
        sleep($this->delay);
        return $results;
    }

    /**
     * Turn an href taken from the search page into an absolute URL.
     *
     * @param string $relativeUrl Raw href attribute value.
     *
     * @return string '' for an empty href; the href unchanged when it already
     *                has a scheme; otherwise the href resolved against the
     *                search page URL.
     */
    private function getAbsoluteUrl($relativeUrl) {
        $href = trim($relativeUrl);
        if ($href === '') {
            return '';
        }
        // Already carries a scheme (http:, https:, javascript:, ...): leave it alone.
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href)) {
            return $href;
        }
        // Protocol-relative ("//host/path"): only the scheme is missing.
        if (strpos($href, '//') === 0) {
            return 'https:' . $href;
        }

        $parts = parse_url($this->searchUrl);
        $origin = $parts['scheme'] . '://' . $parts['host'];
        // Root-relative ("/path"): attach to the search host.
        if ($href[0] === '/') {
            return $origin . $href;
        }
        // Path-relative ("offer.htm"): resolve against the search page's directory.
        $directory = substr($parts['path'], 0, strrpos($parts['path'], '/') + 1);
        return $origin . $directory . $href;
    }

    /**
     * Query the open-platform API and map its offers to the common row shape.
     *
     * The request is signed by sorting the parameters by key, concatenating every
     * key and value, wrapping that string in the app secret on both sides and
     * taking the upper-cased MD5 of the result.
     *
     * @param string $keywords Search phrase.
     * @param int    $page     Result page, sent as page_no.
     *
     * @return array Offer rows, or ['error' => message] when the request failed,
     *               the body was not valid JSON, or the API reported an error.
     */
    private function searchViaAPI($keywords, $page) {
        $params = [
            'method' => 'alibaba.offer.search',
            'app_key' => $this->appKey,
            'timestamp' => date('Y-m-d H:i:s'),
            'format' => 'json',
            'v' => '2.0',
            'keywords' => $keywords,
            'page_no' => $page,
            'fields' => 'title,price,sale_num,detail_url'
        ];

        // Build the signature over the key-sorted parameters.
        ksort($params);
        $stringToSign = $this->appSecret;
        foreach ($params as $k => $v) {
            $stringToSign .= $k . $v;
        }
        $stringToSign .= $this->appSecret;
        $params['sign'] = strtoupper(md5($stringToSign));

        // NOTE(review): the gateway is addressed over plain HTTP, so the app key and
        // signature travel unencrypted - confirm whether an HTTPS endpoint is available.
        $url = 'http://gw.api.1688.com/openapi?' . http_build_query($params);

        $response = file_get_contents($url);
        if ($response === false) {
            // Network or HTTP failure: report it instead of decoding a boolean.
            return ['error' => 'API请求失败'];
        }

        $result = json_decode($response, true);
        if (!is_array($result)) {
            // The body was not a JSON object or array.
            return ['error' => 'API请求失败'];
        }
        if (isset($result['error_response'])) {
            return ['error' => $result['error_response']['sub_msg'] ?? 'API请求失败'];
        }

        $products = [];
        $offers = $result['response']['offers']['offer'] ?? [];
        if (is_array($offers)) {
            foreach ($offers as $offer) {
                $products[] = [
                    'title' => $offer['title'] ?? '',
                    'price' => $offer['price'] ?? '',
                    'sales' => $offer['sale_num'] ?? 0,
                    'url' => $offer['detail_url'] ?? ''
                ];
            }
        }
        return $products;
    }
}
// Usage example.
$crawler = new Ali1688Crawler();

// Scrape the public search page (third argument false = do not use the API).
$firstPage = 1;
$useOfficialApi = false;
$products = $crawler->search('手机壳', $firstPage, $useOfficialApi);
print_r($products);

// To go through the official API instead (requires approved access):
// $apiProducts = $crawler->search('手机壳', 1, true);
// print_r($apiProducts);
?>
7. 注意事项
- 法律合规性:爬取1688数据前请确保遵守1688的robots.txt和服务条款,避免法律风险
- 请求频率:控制请求频率,避免被封IP
- 反爬机制:1688可能会更新反爬策略,需要持续关注和调整
- API优先:如果可能,优先使用官方API接口
- 数据解析:1688页面结构可能变化,需要定期检查并更新解析逻辑
8. 替代方案
如果遇到爬取困难,可以考虑:
- 使用Selenium等浏览器自动化工具模拟真实用户行为
- 使用现成的爬虫框架如Goutte、Panther等
- 购买商业数据服务
希望这个指南对你有所帮助!记得在实际应用中遵守相关法律法规和网站的使用条款。