利用PHP爬虫按关键字搜索1688商品:实战指南

阅读量:2 · 预计阅读时间:3分钟

1. 准备工作

在开始之前,请确保:

  • 你有基本的PHP编程知识
  • 服务器已安装PHP(建议7.0+版本),并启用curl、dom、mbstring扩展
  • 了解HTTP请求和HTML解析基础

2. 分析1688搜索接口

1688的搜索页面是:https://s.1688.com/selloffer/offer_search.htm

通过浏览器开发者工具分析,我们可以发现:

  • 搜索请求可以用GET方式发起,参数放在URL查询字符串中(下文示例均采用这种方式)
  • 需要传递的关键参数包括:keywords(搜索词)、beginPage(页码)等

3. 基本PHP爬虫实现

方法一:使用file_get_contents和正则表达式

php

<?php
/**
 * Search 1688 offers by keyword using file_get_contents() and a regular expression.
 *
 * @param string $keywords Search phrase; it is URL-encoded before being sent.
 * @param int    $page     Result page sent as "beginPage"; values below 1 are clamped to 1.
 *
 * @return array List of ['url' => string, 'title' => string, 'price' => string] rows,
 *               or ['error' => string] when the page cannot be fetched or parsed.
 */
function search1688($keywords, $page = 1)
{
    // Cast the page so a non-numeric value can never smuggle extra query parameters into the URL.
    $page = max(1, (int) $page);
    $url = 'https://s.1688.com/selloffer/offer_search.htm?keywords=' . urlencode($keywords)
        . '&beginPage=' . $page;

    // Send a browser-like User-Agent and bound the wait so a stalled server cannot hang the script.
    $options = [
        'http' => [
            'method' => 'GET',
            'header' => "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\r\n",
            'timeout' => 10,
        ],
    ];
    $context = stream_context_create($options);

    $html = file_get_contents($url, false, $context);
    if ($html === false) {
        return ['error' => '无法获取页面内容'];
    }

    // The pattern contains "</span>" and "</div>", so "/" cannot be the delimiter: PCRE would end
    // the pattern at the first "/" and reject the remainder as unknown modifiers. "#" does not
    // occur in the pattern, which makes it a safe delimiter.
    $pattern = '#<div class="item">.*?<a href="(.*?)" title="(.*?)">.*?<span class="price">([^<]+)</span>.*?</div>#s';
    $matches = [];
    if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER) === false) {
        // false signals a PCRE failure (for example the backtrack limit), not "no products".
        return ['error' => '页面解析失败'];
    }

    $results = [];
    foreach ($matches as $match) {
        $results[] = [
            'url' => $match[1],
            'title' => $match[2],
            'price' => $match[3],
        ];
    }

    return $results;
}
// Example: fetch the first result page for the keyword and dump the rows.
$products = search1688('手机壳');
echo print_r($products, true);
?>

方法二:使用cURL和DOM解析(推荐)

php

<?php
/**
 * Search 1688 offers by keyword using cURL for transport and DOM/XPath for parsing.
 *
 * @param string $keywords Search phrase; it is URL-encoded before being sent.
 * @param int    $page     Result page sent as "beginPage"; values below 1 are clamped to 1.
 *
 * @return array List of ['title' => string, 'price' => string, 'url' => string] rows,
 *               or ['error' => string] when the page cannot be fetched or parsed.
 */
function search1688WithCurl($keywords, $page = 1)
{
    $page = max(1, (int) $page);
    $url = 'https://s.1688.com/selloffer/offer_search.htm?keywords=' . urlencode($keywords)
        . '&beginPage=' . $page;

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        // Bound both phases so an unresponsive server cannot block the caller forever.
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT => 15,
    ]);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on transport failure; an empty body is equally useless.
    if (!$html) {
        return ['error' => '无法获取页面内容'];
    }

    return parse1688SearchHtml($html);
}

/**
 * Extract offer rows from a search-result HTML document.
 *
 * @param string $html Raw page markup. Assumed to be UTF-8 encoded; confirm against the
 *                     response's Content-Type header if titles come out garbled.
 *
 * @return array List of ['title' => string, 'price' => string, 'url' => string] rows,
 *               or ['error' => string] when the markup cannot be loaded.
 */
function parse1688SearchHtml($html)
{
    // Collect libxml warnings internally instead of silencing everything with "@";
    // real-world HTML is rarely valid and would otherwise flood the log.
    $previousSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // Without an encoding hint DOMDocument decodes the bytes as ISO-8859-1, which
    // mangles Chinese text. The XML declaration prefix makes libxml read UTF-8.
    $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if (!$loaded) {
        return ['error' => '页面解析失败'];
    }

    // Match a whole class token. A plain contains(@class, 'item') is a substring test
    // and would also select elements whose class is e.g. "item-info" or "items".
    $hasClass = function ($className) {
        return "contains(concat(' ', normalize-space(@class), ' '), ' " . $className . " ')";
    };

    $xpath = new DOMXPath($dom);
    $results = [];
    foreach ($xpath->query('//div[' . $hasClass('item') . ']') as $item) {
        $titleElement = $xpath->query('.//a[' . $hasClass('title') . ']', $item)->item(0);
        $priceElement = $xpath->query('.//span[' . $hasClass('price') . ']', $item)->item(0);

        // A container with neither a title nor a price is not an offer card; skip it
        // rather than returning a row of empty strings.
        if (!$titleElement && !$priceElement) {
            continue;
        }

        $results[] = [
            'title' => $titleElement ? trim($titleElement->nodeValue) : '',
            'price' => $priceElement ? trim($priceElement->nodeValue) : '',
            'url' => $titleElement ? $titleElement->getAttribute('href') : '',
        ];
    }

    return $results;
}
// Example: fetch the first result page through the cURL/DOM variant and dump the rows.
$products = search1688WithCurl('手机壳');
echo print_r($products, true);
?>

4. 处理反爬机制

1688有反爬机制,抓取时通常需要做以下几方面的处理:

4.1 设置合理的请求头

php

// Browser-like request headers, kept as name => value pairs for readability and
// then flattened into the "Name: value" strings that CURLOPT_HTTPHEADER expects.
$headerFields = [
    'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language' => 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Referer' => 'https://www.1688.com/',
    'DNT' => '1',
];
$headers = array_map(
    function ($name, $value) {
        return $name . ': ' . $value;
    },
    array_keys($headerFields),
    $headerFields
);
// $ch is the cURL handle created by the surrounding request code.
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

4.2 使用代理IP

php

// Route the request through a proxy; $ch is the cURL handle created by the surrounding code.
curl_setopt_array($ch, [
    CURLOPT_PROXY => 'proxy_ip:port',
    CURLOPT_PROXYUSERPWD => 'username:password', // only needed when the proxy requires authentication
]);

4.3 限制请求频率

php

// Pause for a random 1 to 3 seconds between consecutive requests.
$pauseSeconds = rand(1, 3);
sleep($pauseSeconds);

5. 使用API接口(推荐)

如果需要频繁获取数据,建议改用1688开放平台的官方API(需要申请权限)。下面示例中的网关地址、方法名和签名方式仅作示意,请以官方文档为准:

php

/**
 * Search offers through the signed API gateway instead of scraping HTML.
 *
 * The signature is the upper-cased MD5 of: secret + every "key" . "value" pair
 * (sorted by key) + secret.
 *
 * @param string $keywords  Search phrase.
 * @param int    $page      Result page sent as "page_no"; values below 1 are clamped to 1.
 * @param string $appKey    Application key; defaults to the placeholder used in the sample.
 * @param string $appSecret Application secret used for signing; defaults to the placeholder.
 *
 * @return array The decoded JSON response, or ['error' => string] when the request
 *               fails or the body is not a JSON object/array.
 */
function search1688ViaAPI($keywords, $page = 1, $appKey = '你的APP_KEY', $appSecret = '你的APP_SECRET')
{
    $params = [
        'method' => 'alibaba.offer.search',
        'app_key' => $appKey,
        // NOTE(review): date() uses the server's default timezone; confirm which
        // timezone the gateway expects for the timestamp.
        'timestamp' => date('Y-m-d H:i:s'),
        'format' => 'json',
        'v' => '2.0',
        'keywords' => $keywords,
        'page_no' => max(1, (int) $page),
    ];

    // Sign over the parameters in key order so the server can rebuild the same string.
    ksort($params);
    $stringToSign = $appSecret;
    foreach ($params as $name => $value) {
        $stringToSign .= $name . $value;
    }
    $stringToSign .= $appSecret;
    $params['sign'] = strtoupper(md5($stringToSign));

    // NOTE(review): the gateway is addressed over plain HTTP, so app_key and sign travel
    // unencrypted; switch to https:// if the gateway supports it.
    $url = 'http://gw.api.1688.com/openapi?' . http_build_query($params);

    // Bound the wait so a stalled gateway cannot hang the caller.
    $context = stream_context_create(['http' => ['timeout' => 10]]);
    $response = file_get_contents($url, false, $context);
    if ($response === false) {
        return ['error' => '无法获取API响应'];
    }

    // json_decode() returns null for malformed JSON; report that instead of passing
    // null on to a caller that expects an array.
    $decoded = json_decode($response, true);
    if (!is_array($decoded)) {
        return ['error' => 'API响应解析失败'];
    }

    return $decoded;
}

6. 完整示例代码

php

<?php
/**
 * Keyword search against 1688, either by scraping the public search page or by
 * calling the signed API gateway.
 *
 * Both strategies return rows shaped as
 * ['title' => string, 'price' => string, 'sales' => int, 'url' => string].
 * Failures are reported as ['error' => string] rather than thrown.
 */
class Ali1688Crawler
{
    /** @var string Browser-like User-Agent sent with scraping requests. */
    private $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

    /** @var int Minimum number of seconds between two scraping requests. */
    private $delay = 2;

    /** @var float Unix time (with microseconds) when the last scraping request finished; 0 if none yet. */
    private $lastRequestAt = 0.0;

    /** @var string */
    private $appKey;

    /** @var string */
    private $appSecret;

    /**
     * @param string $appKey    API application key; replace the placeholder with a real key.
     * @param string $appSecret API application secret used to sign requests.
     */
    public function __construct($appKey = 'your_app_key', $appSecret = 'your_app_secret')
    {
        $this->appKey = $appKey;
        $this->appSecret = $appSecret;
    }

    /**
     * Search offers by keyword.
     *
     * @param string $keywords Search phrase.
     * @param int    $page     Result page; values below 1 are clamped to 1.
     * @param bool   $useApi   true to call the API gateway, false to scrape the search page.
     *
     * @return array List of product rows, or ['error' => string] on failure.
     */
    public function search($keywords, $page = 1, $useApi = false)
    {
        return $useApi
            ? $this->searchViaAPI($keywords, $page)
            : $this->searchViaCrawler($keywords, $page);
    }

    /**
     * Fetch the HTML search page with cURL and extract the offers from it.
     */
    private function searchViaCrawler($keywords, $page)
    {
        $url = 'https://s.1688.com/selloffer/offer_search.htm?keywords=' . urlencode($keywords)
            . '&beginPage=' . max(1, (int) $page);

        // Throttle before the request rather than after it, so a single lookup
        // returns as soon as its data is ready.
        $this->waitForNextSlot();

        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => $this->userAgent,
            // TLS certificate verification stays at cURL's default (enabled):
            // turning it off would let anyone on the network path tamper with the page.
            CURLOPT_CONNECTTIMEOUT => 5,
            CURLOPT_TIMEOUT => 15,
        ]);
        $html = curl_exec($ch);
        curl_close($ch);
        $this->lastRequestAt = microtime(true);

        // curl_exec() yields false on transport failure; an empty body is equally useless.
        if (!$html) {
            return ['error' => '无法获取页面内容'];
        }

        return $this->parseSearchResults($html);
    }

    /**
     * Sleep just long enough to keep $delay seconds between scraping requests.
     */
    private function waitForNextSlot()
    {
        if ($this->lastRequestAt <= 0) {
            return;
        }

        $remaining = $this->delay - (microtime(true) - $this->lastRequestAt);
        if ($remaining > 0) {
            usleep((int) ceil($remaining * 1000000));
        }
    }

    /**
     * Turn search-page markup into product rows.
     *
     * @param string $html Page markup, treated as UTF-8.
     */
    private function parseSearchResults($html)
    {
        // Collect libxml warnings internally instead of silencing everything with "@".
        $previousSetting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        // Non-ASCII characters are rewritten as numeric entities so DOMDocument does not
        // misread UTF-8 as ISO-8859-1. mb_encode_numericentity() replaces the
        // mb_convert_encoding(..., 'HTML-ENTITIES') idiom deprecated in PHP 8.2.
        $loaded = $dom->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if (!$loaded) {
            return ['error' => '页面解析失败'];
        }

        $xpath = new DOMXPath($dom);
        $results = [];
        foreach ($xpath->query('//div[' . $this->hasClass('item') . ']') as $item) {
            $titleElement = $xpath->query('.//a[' . $this->hasClass('title') . ']', $item)->item(0);
            $priceElement = $xpath->query('.//span[' . $this->hasClass('price') . ']', $item)->item(0);
            $salesElement = $xpath->query('.//span[' . $this->hasClass('sale-num') . ']', $item)->item(0);

            // A container with neither a title nor a price is not an offer card.
            if (!$titleElement && !$priceElement) {
                continue;
            }

            $results[] = [
                'title' => $titleElement ? trim($titleElement->nodeValue) : '',
                'price' => $priceElement ? trim($priceElement->nodeValue) : '',
                // NOTE(review): every non-digit is stripped, so a label such as "1.2万"
                // becomes 12; add unit handling if the page uses such abbreviations.
                'sales' => $salesElement ? (int) preg_replace('/[^0-9]/', '', $salesElement->nodeValue) : 0,
                'url' => $titleElement ? $this->getAbsoluteUrl($titleElement->getAttribute('href')) : '',
            ];
        }

        return $results;
    }

    /**
     * Build an XPath predicate that matches a whole class token.
     *
     * A plain contains(@class, 'item') is a substring test and would also select
     * elements whose class is e.g. "item-info" or "items".
     */
    private function hasClass($className)
    {
        return "contains(concat(' ', normalize-space(@class), ' '), ' " . $className . " ')";
    }

    /**
     * Normalise an href taken from the search page into an absolute URL.
     *
     * @param string $relativeUrl Absolute, protocol-relative ("//host/path") or path-only URL.
     *
     * @return string Absolute URL, or '' when the input is empty.
     */
    private function getAbsoluteUrl($relativeUrl)
    {
        $relativeUrl = trim((string) $relativeUrl);
        if ($relativeUrl === '') {
            // Prefixing an empty href would yield the meaningless string "https:".
            return '';
        }
        if (preg_match('#^https?://#i', $relativeUrl) === 1) {
            return $relativeUrl;
        }
        if (strpos($relativeUrl, '//') === 0) {
            return 'https:' . $relativeUrl;
        }

        // Path-only links are resolved against the host the search page was fetched from.
        return 'https://s.1688.com/' . ltrim($relativeUrl, '/');
    }

    /**
     * Query the API gateway and map its offers onto the common row shape.
     */
    private function searchViaAPI($keywords, $page)
    {
        $params = [
            'method' => 'alibaba.offer.search',
            'app_key' => $this->appKey,
            // NOTE(review): date() uses the server's default timezone; confirm which
            // timezone the gateway expects for the timestamp.
            'timestamp' => date('Y-m-d H:i:s'),
            'format' => 'json',
            'v' => '2.0',
            'keywords' => $keywords,
            'page_no' => max(1, (int) $page),
            'fields' => 'title,price,sale_num,detail_url',
        ];
        $params['sign'] = $this->sign($params);

        // NOTE(review): the gateway is addressed over plain HTTP, so app_key and sign travel
        // unencrypted; switch to https:// if the gateway supports it.
        $url = 'http://gw.api.1688.com/openapi?' . http_build_query($params);

        $context = stream_context_create(['http' => ['timeout' => 10]]);
        $response = file_get_contents($url, false, $context);
        if ($response === false) {
            return ['error' => 'API请求失败'];
        }

        // A body that is not valid JSON must surface as an error, not as "no products".
        $result = json_decode($response, true);
        if (!is_array($result)) {
            return ['error' => 'API请求失败'];
        }
        if (isset($result['error_response'])) {
            return ['error' => $result['error_response']['sub_msg'] ?? 'API请求失败'];
        }

        $products = [];
        if (isset($result['response']['offers']['offer'])) {
            foreach ($result['response']['offers']['offer'] as $offer) {
                $products[] = [
                    'title' => $offer['title'] ?? '',
                    'price' => $offer['price'] ?? '',
                    // Cast so "sales" is an int on both the API and the scraping path.
                    'sales' => (int) ($offer['sale_num'] ?? 0),
                    'url' => $offer['detail_url'] ?? '',
                ];
            }
        }

        return $products;
    }

    /**
     * Compute the request signature: upper-cased MD5 of
     * secret + every "key" . "value" pair (sorted by key) + secret.
     */
    private function sign(array $params)
    {
        ksort($params);
        $stringToSign = $this->appSecret;
        foreach ($params as $name => $value) {
            $stringToSign .= $name . $value;
        }
        $stringToSign .= $this->appSecret;

        return strtoupper(md5($stringToSign));
    }
}
// 使用示例
$crawler = new Ali1688Crawler();
// Scrape the public search page (third argument false selects the crawler).
$products = $crawler->search('手机壳', 1, false);
echo print_r($products, true);
// API variant (requires approved API credentials):
// $apiProducts = $crawler->search('手机壳', 1, true);
// print_r($apiProducts);
?>

7. 注意事项

  1. 法律合规性:爬取1688数据前请确保遵守1688的robots.txt和服务条款,避免法律风险
  2. 请求频率:控制请求频率,避免被封IP
  3. 反爬机制:1688可能会更新反爬策略,需要持续关注和调整
  4. API优先:如果可能,优先使用官方API接口
  5. 数据解析:1688页面结构可能变化,需要定期检查并更新解析逻辑

8. 替代方案

如果遇到爬取困难,可以考虑:

  • 使用Selenium等浏览器自动化工具模拟真实用户行为
  • 使用现成的爬虫框架如Goutte、Panther等
  • 购买商业数据服务

希望这个指南对你有所帮助!记得在实际应用中遵守相关法律法规和网站的使用条款。