PHP数据采集技术实战指南
一、基础采集:file_get_contents与cURL
1.1 简单页面抓取
<?php
// Basic approach: file_get_contents() is enough for a simple GET request.
$html = file_get_contents('https://example.com');
if ($html === false) {
    // file_get_contents() signals failure by returning false (plus a warning),
    // so it must be checked before the result is used.
    throw new RuntimeException('Failed to fetch https://example.com');
}
echo $html;

// Same request, but with a stream context so the server sees a browser-like
// User-Agent and the call cannot hang longer than 30 seconds.
$opts = [
    'http' => [
        'method'  => 'GET',
        'header'  => 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0',
        'timeout' => 30,
    ],
];
$context = stream_context_create($opts);
$html = file_get_contents('https://example.com', false, $context);
if ($html === false) {
    throw new RuntimeException('Failed to fetch https://example.com');
}
1.2 cURL高级采集(推荐)
<?php
class HttpClient {
    /** @var bool Whether TLS peer and host verification is enabled. */
    private $verifySsl;

    /**
     * @param bool $verifySsl Verify the server certificate and host name.
     *                        Defaults to false, which matches the previous
     *                        hard-coded behaviour; pass true whenever the
     *                        target has a valid certificate, because disabled
     *                        verification allows man-in-the-middle tampering.
     */
    public function __construct($verifySsl = false) {
        $this->verifySsl = (bool) $verifySsl;
    }

    /**
     * Send a single HTTP request.
     *
     * @param string       $url     Target URL.
     * @param string       $method  HTTP method (case-insensitive). GET, POST and HEAD
     *                              are handled natively; any other verb is sent as a
     *                              custom request.
     * @param array|string $data    Request body. Arrays are URL-encoded. Ignored for GET.
     * @param array        $headers Extra "Name: value" header lines appended after the defaults.
     * @return array{code:int, body:string} HTTP status code and response body.
     * @throws Exception When the transfer itself fails (DNS, connect, timeout, ...).
     */
    public function request($url, $method = 'GET', $data = [], $headers = []) {
        $ch = curl_init();
        if ($ch === false) {
            throw new Exception('cURL Error: failed to initialise handle');
        }

        try {
            curl_setopt_array($ch, $this->baseOptions($url));

            // Browser-like defaults; caller-supplied headers are appended after them.
            $defaultHeaders = [
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8'
            ];
            curl_setopt($ch, CURLOPT_HTTPHEADER, array_merge($defaultHeaders, $headers));

            $method = strtoupper($method);
            if ($method === 'POST') {
                curl_setopt($ch, CURLOPT_POST, true);
                curl_setopt($ch, CURLOPT_POSTFIELDS, is_array($data) ? http_build_query($data) : $data);
            } elseif ($method === 'HEAD') {
                curl_setopt($ch, CURLOPT_NOBODY, true);
            } elseif ($method !== 'GET') {
                // Previously every non-POST verb was silently sent as GET.
                curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
                if ($data !== [] && $data !== null && $data !== '') {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, is_array($data) ? http_build_query($data) : $data);
                }
            }

            $response = curl_exec($ch);
            if ($response === false) {
                throw new Exception('cURL Error: ' . curl_error($ch));
            }

            return [
                'code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
                'body' => $response
            ];
        } finally {
            // Release the handle on both the success and the exception path.
            curl_close($ch);
        }
    }

    /**
     * Fetch several URLs in parallel with GET.
     *
     * @param array $urls Map of key => URL.
     * @return array Map of the same keys => response body as returned by
     *               curl_multi_getcontent() (no status codes are reported).
     */
    public function multiRequest(array $urls) {
        if ($urls === []) {
            return [];
        }

        $mh = curl_multi_init();
        $handles = [];
        $results = [];

        foreach ($urls as $key => $url) {
            $ch = curl_init();
            // Same transfer options as request(), so redirects and TLS settings agree.
            curl_setopt_array($ch, $this->baseOptions($url));
            curl_multi_add_handle($mh, $ch);
            $handles[$key] = $ch;
        }

        try {
            $running = 0;
            do {
                // Older libcurl asks to be called again immediately via CURLM_CALL_MULTI_PERFORM.
                do {
                    $status = curl_multi_exec($mh, $running);
                } while ($status === CURLM_CALL_MULTI_PERFORM);

                if ($status !== CURLM_OK) {
                    break;
                }
                // curl_multi_select() returns -1 when it has nothing to wait on;
                // pause briefly instead of spinning at 100% CPU.
                if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
                    usleep(1000);
                }
            } while ($running > 0);

            foreach ($handles as $key => $ch) {
                $results[$key] = curl_multi_getcontent($ch);
            }
        } finally {
            foreach ($handles as $ch) {
                curl_multi_remove_handle($mh, $ch);
                curl_close($ch);
            }
            curl_multi_close($mh);
        }

        return $results;
    }

    /**
     * Transfer options shared by single and parallel requests.
     *
     * @param string $url Target URL.
     * @return array cURL option constant => value.
     */
    private function baseOptions($url) {
        return [
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_SSL_VERIFYPEER => $this->verifySsl,
            CURLOPT_SSL_VERIFYHOST => $this->verifySsl ? 2 : 0,
        ];
    }
}
二、数据解析:正则与DOM操作
2.1 正则表达式提取
<?php
class RegexParser {
    /**
     * Return every match of $pattern in $html.
     *
     * @param string $html    Text to search.
     * @param string $pattern PCRE pattern including delimiters.
     * @return array The first capture group of each match, or the whole
     *               match when the pattern has no capture group. Empty when
     *               nothing matches or the pattern fails.
     */
    public function extractAll($html, $pattern) {
        $matches = [];
        $count = preg_match_all($pattern, (string) $html, $matches);
        if ($count === false || $count === 0) {
            return [];
        }
        return $matches[1] ?? $matches[0] ?? [];
    }

    /**
     * Return the first match of $pattern in $html.
     *
     * @param string $html    Text to search.
     * @param string $pattern PCRE pattern including delimiters.
     * @return string|null First capture group (or whole match when there is
     *                     no group); null when nothing matches.
     */
    public function extractOne($html, $pattern) {
        $match = [];
        if (preg_match($pattern, (string) $html, $match) !== 1) {
            return null;
        }
        return $match[1] ?? $match[0] ?? null;
    }

    /**
     * Ready-made patterns for common scraping targets.
     *
     * Inside the '/' delimiters every literal '/' is escaped, and a literal
     * single quote is written as \' because the patterns live in
     * single-quoted PHP strings.
     *
     * @return array<string, string> Pattern name => PCRE pattern.
     */
    public function commonPatterns() {
        return [
            'title' => '/<title[^>]*>(.*?)<\/title>/is',
            'links' => '/<a[^>]+href=["\']([^"\']+)["\'][^>]*>/i',
            'images' => '/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i',
            // The dot before the TLD must be literal.
            'emails' => '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
            // Mainland China mobile numbers; the look-arounds stop it from
            // matching 11 digits embedded in a longer digit run.
            'phones' => '/(?<!\d)1[3-9]\d{9}(?!\d)/',
            // Half-width and full-width yen signs as an alternation, because a
            // byte-wise character class would match single bytes of the
            // multibyte characters.
            'price' => '/(?:¥|￥)\s*(\d+(?:\.\d{1,2})?)/',
            'json_ld' => '/<script[^>]*type=["\']application\/ld\+json["\'][^>]*>(.*?)<\/script>/is'
        ];
    }
}
// Usage example
$parser = new RegexParser();
$html = file_get_contents('https://example.com/product.html');
if ($html === false) {
    // file_get_contents() returns false on failure; do not parse a boolean.
    throw new RuntimeException('Failed to fetch https://example.com/product.html');
}
// Extract the price
$price = $parser->extractOne($html, '/class="price"[^>]*>([\d.]+)/');
// Extract all lazy-loaded image URLs (the single quote is escaped because the
// pattern is a single-quoted PHP string).
$images = $parser->extractAll($html, '/data-src=["\']([^"\']+)["\']/i');
2.2 DOMDocument解析(结构化数据)
<?php
class DomParser {
    /** @var DOMDocument Parsed document. */
    private $dom;
    /** @var DOMXPath XPath evaluator bound to $dom. */
    private $xpath;

    /**
     * Parse an HTML string into a DOM tree.
     *
     * @param string $html HTML markup, treated as UTF-8.
     */
    public function __construct($html) {
        $this->dom = new DOMDocument();
        // Collect parser warnings internally instead of emitting them, and put
        // the process-wide libxml setting back afterwards so other code is not
        // affected by this constructor.
        $previous = libxml_use_internal_errors(true);
        try {
            // The XML declaration prefix makes the HTML parser read the input as UTF-8.
            $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Evaluate an XPath expression.
     *
     * @param string       $expression  XPath expression.
     * @param DOMNode|null $contextNode Node that relative expressions start from.
     * @return DOMNodeList|false Matching nodes, or false when the expression is invalid.
     */
    public function query($expression, $contextNode = null) {
        return $this->xpath->query($expression, $contextNode);
    }

    /**
     * Extract product cards (example for an e-commerce listing page).
     *
     * Classes are matched as whole tokens, so class="product-item hot" matches
     * but class="product-item-banner" does not.
     *
     * @return array List of arrays with keys title, price, image, link, rating;
     *               a value is null when the corresponding element is missing.
     */
    public function extractProducts() {
        $products = [];
        $nodes = $this->query('//div[' . $this->hasClass('product-item') . ']');
        if ($nodes === false) {
            return $products;
        }
        foreach ($nodes as $node) {
            $products[] = [
                'title' => $this->getText('.//h3[' . $this->hasClass('title') . ']', $node),
                'price' => $this->getText('.//span[' . $this->hasClass('price') . ']', $node),
                'image' => $this->getAttr('.//img', 'src', $node),
                'link' => $this->getAttr('.//a', 'href', $node),
                'rating' => $this->getText('.//div[' . $this->hasClass('rating') . ']', $node)
            ];
        }
        return $products;
    }

    /**
     * Build an XPath predicate testing that @class contains $name as a whole token.
     *
     * @param string $name Class name without spaces or quotes.
     * @return string XPath predicate body.
     */
    private function hasClass($name) {
        return 'contains(concat(" ", normalize-space(@class), " "), " ' . $name . ' ")';
    }

    /**
     * Trimmed text content of the first node matching $xpath, or null.
     */
    private function getText($xpath, $context = null) {
        $list = $this->query($xpath, $context);
        $node = $list === false ? null : $list->item(0);
        return $node ? trim($node->textContent) : null;
    }

    /**
     * Attribute value of the first element matching $xpath, or null when no
     * element matches.
     */
    private function getAttr($xpath, $attr, $context = null) {
        $list = $this->query($xpath, $context);
        $node = $list === false ? null : $list->item(0);
        // Only elements carry attributes; other node types have no getAttribute().
        return $node instanceof DOMElement ? $node->getAttribute($attr) : null;
    }
}
// Usage example: download a listing page, parse it and dump the product rows.
$listHtml = file_get_contents('https://shop.example.com/list.html');
$domParser = new DomParser($listHtml);
print_r($domParser->extractProducts());
三、实战案例:采集京东商品信息
<?php
class JDCrawler {
    /** @var HttpClient HTTP transport. */
    private $http;
    /**
     * @var string|false Temp file path reserved for cookies, removed in __destruct().
     *                   NOTE(review): no request in this class passes it to cURL — confirm intended use.
     */
    private $cookieJar;

    public function __construct() {
        $this->http = new HttpClient();
        $this->cookieJar = tempnam(sys_get_temp_dir(), 'jd_cookie_');
    }

    /**
     * Fetch and parse one product page.
     *
     * @param string|int $skuId Product SKU id.
     * @return array Product fields, see parseProduct().
     * @throws Exception When the page does not return HTTP 200 or cannot be decoded.
     */
    public function getProductDetail($skuId) {
        $url = $this->productUrl($skuId);
        $headers = [
            'Referer: https://www.jd.com/',
            // Advertise gzip only: ungzip() cannot decode deflate or brotli bodies.
            'Accept-Encoding: gzip'
        ];
        $response = $this->http->request($url, 'GET', [], $headers);
        if ($response['code'] !== 200) {
            throw new Exception("获取失败,HTTP状态码: {$response['code']}");
        }
        $html = $this->ungzip($response['body']);
        return $this->parseProduct($html, $skuId);
    }

    /**
     * Build the product page URL, encoding the SKU so it cannot alter the path.
     */
    private function productUrl($skuId) {
        return 'https://item.jd.com/' . rawurlencode((string) $skuId) . '.html';
    }

    /**
     * Extract product fields from the page HTML.
     *
     * @param string     $html  Decoded page HTML.
     * @param string|int $skuId Product SKU id.
     * @return array Keys: sku_id, name, price, comment_count, images, params, url, crawled_at.
     */
    private function parseProduct($html, $skuId) {
        $parser = new RegexParser();
        // Product name ('/' inside the pattern is escaped because it is the delimiter).
        $name = $parser->extractOne($html, '/<div class="sku-name">(.*?)<\/div>/s');
        // extractOne() returns null when the block is missing; strip_tags() needs a string.
        $name = strip_tags((string) $name);
        // Price comes from a separate endpoint.
        $price = $this->getPrice($skuId);
        // Review count
        $commentCount = $parser->extractOne($html, '/comment-count[^>]*>(\d+)/');
        // Product images
        $images = $parser->extractAll($html, '/data-origin="(https:\/\/img[^"]+)"/');
        // Specification table
        $params = $this->extractParams($html);
        return [
            'sku_id' => $skuId,
            'name' => $this->cleanText($name),
            'price' => $price,
            'comment_count' => $commentCount,
            'images' => $images,
            'params' => $params,
            'url' => $this->productUrl($skuId),
            'crawled_at' => date('Y-m-d H:i:s')
        ];
    }

    /**
     * Query the price endpoint for one SKU.
     *
     * @return string|null The 'p' field of the first entry, else its 'op' field,
     *                     else null (also null when the body is not a JSON array).
     */
    private function getPrice($skuId) {
        $url = 'https://p.3.cn/prices/mgets?skuIds=J_' . rawurlencode((string) $skuId);
        $response = $this->http->request($url);
        $data = json_decode($response['body'], true);
        if (!is_array($data)) {
            return null;
        }
        return $data[0]['p'] ?? $data[0]['op'] ?? null;
    }

    /**
     * Read the two-column specification table into a key => value map.
     *
     * @param string $html Page HTML, treated as UTF-8.
     * @return array<string, string> Empty when the table is absent.
     */
    private function extractParams($html) {
        $params = [];
        if (trim((string) $html) === '') {
            // DOMDocument::loadHTML() rejects empty input.
            return $params;
        }
        $dom = new DOMDocument();
        // Restore the global libxml error mode when done.
        $previous = libxml_use_internal_errors(true);
        try {
            // Without an encoding hint non-ASCII text is mis-decoded.
            $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        $xpath = new DOMXPath($dom);
        $rows = $xpath->query('//table[@class="parameter2"]//tr');
        foreach ($rows as $row) {
            $tds = $xpath->query('td', $row);
            if ($tds->length >= 2) {
                $key = trim($tds->item(0)->textContent);
                $value = trim($tds->item(1)->textContent);
                $params[$key] = $value;
            }
        }
        return $params;
    }

    /**
     * Decompress the body when it starts with the gzip magic bytes.
     *
     * @throws Exception When the data looks like gzip but cannot be decoded.
     */
    private function ungzip($data) {
        if (substr($data, 0, 2) !== "\x1f\x8b") {
            return $data;
        }
        $decoded = gzdecode($data);
        if ($decoded === false) {
            throw new Exception('Failed to decode gzip response body');
        }
        return $decoded;
    }

    /**
     * Trim and collapse every whitespace run into a single space.
     */
    private function cleanText($text) {
        return preg_replace('/\s+/', ' ', trim($text));
    }

    public function __destruct() {
        // tempnam() may have returned false; only unlink a real path.
        if (is_string($this->cookieJar) && file_exists($this->cookieJar)) {
            unlink($this->cookieJar);
        }
    }
}
// Usage example: crawl one SKU and print a short summary.
try {
    $jd = new JDCrawler();
    $product = $jd->getProductDetail('100012043978');
    echo "商品名称: {$product['name']}\n",
        "当前价格: ¥{$product['price']}\n",
        "评价数量: {$product['comment_count']}\n",
        "规格参数:\n";
    print_r($product['params']);
} catch (Exception $exception) {
    echo "错误: ", $exception->getMessage();
}
四、数据存储与导出
<?php
class DataStorage {
    /** @var PDO Database connection. */
    private $pdo;

    /**
     * @param string $dsn  PDO data source name.
     * @param string $user Database user.
     * @param string $pass Database password.
     */
    public function __construct($dsn, $user, $pass) {
        $this->pdo = new PDO($dsn, $user, $pass, [
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC
        ]);
    }

    /**
     * Insert one row, or update it when a unique key already exists (MySQL upsert).
     *
     * @param string $table Table name, optionally qualified as "db.table".
     * @param array  $data  Column name => value.
     * @return bool Result of PDOStatement::execute().
     * @throws InvalidArgumentException When $data is empty or an identifier is unsafe.
     */
    public function saveToMySQL($table, $data) {
        if (empty($data)) {
            throw new InvalidArgumentException('No data to save');
        }
        // Identifiers cannot be bound as parameters, so they are validated and
        // quoted before being placed into the SQL text.
        $tableSql = '`' . str_replace('.', '`.`', $this->assertIdentifier($table, true)) . '`';
        $columns = array_map([$this, 'assertIdentifier'], array_keys($data));

        $columnSql = [];
        $placeholders = [];
        $updates = [];
        foreach ($columns as $column) {
            $columnSql[] = "`{$column}`";
            $placeholders[] = ":{$column}";
            $updates[] = "`{$column}`=VALUES(`{$column}`)";
        }

        $sql = "INSERT INTO {$tableSql} (" . implode(', ', $columnSql) . ') VALUES ('
            . implode(', ', $placeholders) . ') ON DUPLICATE KEY UPDATE '
            . implode(', ', $updates);
        $stmt = $this->pdo->prepare($sql);
        return $stmt->execute($data);
    }

    /**
     * Write rows to a CSV file; the header is taken from the first row's keys.
     *
     * @param string $filename Destination path.
     * @param array  $data     List of rows (each an array); keys of $data itself are ignored.
     * @return string $filename
     * @throws RuntimeException When the file cannot be opened.
     */
    public function exportCSV($filename, array $data) {
        $fp = fopen($filename, 'w');
        if ($fp === false) {
            throw new RuntimeException("Cannot open file for writing: {$filename}");
        }
        try {
            if (!empty($data)) {
                // reset() instead of $data[0]: the list may not be 0-indexed.
                fputcsv($fp, array_keys(reset($data)), ',', '"', '\\');
            }
            foreach ($data as $row) {
                fputcsv($fp, $row, ',', '"', '\\');
            }
        } finally {
            fclose($fp);
        }
        return $filename;
    }

    /**
     * Write data to a pretty-printed JSON file with unescaped Unicode.
     *
     * @param string $filename Destination path.
     * @param mixed  $data     Any JSON-encodable value.
     * @return string $filename
     * @throws RuntimeException When encoding or writing fails.
     */
    public function exportJSON($filename, $data) {
        $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            throw new RuntimeException('JSON encoding failed: ' . json_last_error_msg());
        }
        if (file_put_contents($filename, $json) === false) {
            throw new RuntimeException("Cannot write file: {$filename}");
        }
        return $filename;
    }

    /**
     * Insert many rows using multi-row INSERT statements.
     *
     * Columns come from the first row; each row's values are bound by column
     * name, so rows may list their keys in any order. Batches are not wrapped
     * in a transaction.
     *
     * @param string $table     Table name, optionally qualified as "db.table".
     * @param array  $rows      List of column => value arrays.
     * @param int    $batchSize Maximum rows per statement (>= 1).
     * @return int Sum of PDOStatement::rowCount() over all batches.
     * @throws InvalidArgumentException On an unsafe identifier, a bad batch size
     *                                  or a row that lacks one of the columns.
     */
    public function batchInsert($table, array $rows, $batchSize = 100) {
        if (empty($rows)) return 0;
        if ((int) $batchSize < 1) {
            throw new InvalidArgumentException('Batch size must be at least 1');
        }
        // Validated only (not quoted) so the generated SQL text is unchanged
        // for every identifier that was already safe.
        $tableSql = $this->assertIdentifier($table, true);
        $columns = array_map([$this, 'assertIdentifier'], array_keys(reset($rows)));
        $columnStr = implode(', ', $columns);
        $placeholder = '(' . implode(', ', array_fill(0, count($columns), '?')) . ')';

        $total = 0;
        $statements = []; // prepared statement per batch length
        foreach (array_chunk($rows, (int) $batchSize) as $chunk) {
            $size = count($chunk);
            if (!isset($statements[$size])) {
                $placeholders = implode(', ', array_fill(0, $size, $placeholder));
                $sql = "INSERT INTO {$tableSql} ({$columnStr}) VALUES {$placeholders}";
                $statements[$size] = $this->pdo->prepare($sql);
            }
            $values = [];
            foreach ($chunk as $row) {
                foreach ($columns as $column) {
                    if (!array_key_exists($column, $row)) {
                        throw new InvalidArgumentException("Row is missing column: {$column}");
                    }
                    $values[] = $row[$column];
                }
            }
            $statements[$size]->execute($values);
            $total += $statements[$size]->rowCount();
        }
        return $total;
    }

    /**
     * Ensure a table/column name consists only of letters, digits and underscores.
     *
     * @param string $identifier     Name to check.
     * @param bool   $allowQualified Accept dot-separated parts (e.g. "db.table").
     * @return string The identifier, unchanged.
     * @throws InvalidArgumentException When the name contains anything else.
     */
    private function assertIdentifier($identifier, $allowQualified = false) {
        $identifier = (string) $identifier;
        $parts = $allowQualified ? explode('.', $identifier) : [$identifier];
        foreach ($parts as $part) {
            if (preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $part) !== 1) {
                throw new InvalidArgumentException("Invalid SQL identifier: {$identifier}");
            }
        }
        return $identifier;
    }
}
五、高级技巧与优化
5.1 代理池与IP轮换
<?php
class ProxyPool {
    /** @var array Proxies still in rotation (0-indexed list). */
    private $proxies = [];
    /** @var int Index of the proxy getNext() will return next. */
    private $current = 0;
    /** @var array Proxy => number of reported failures. */
    private $failures = [];
    /** @var int Failures after which a proxy is dropped. */
    private $maxFailures;

    /**
     * @param array $proxyList   Proxy URLs; they are shuffled once so rotation
     *                           does not always start from the same proxy.
     * @param int   $maxFailures Failures tolerated before a proxy is removed (minimum 1).
     */
    public function __construct(array $proxyList, $maxFailures = 3) {
        $this->proxies = array_values($proxyList);
        shuffle($this->proxies);
        $this->maxFailures = max(1, (int) $maxFailures);
    }

    /**
     * Return the next proxy in round-robin order.
     *
     * @return mixed The proxy entry as supplied to the constructor.
     * @throws RuntimeException When no proxies are left.
     */
    public function getNext() {
        $count = count($this->proxies);
        if ($count === 0) {
            // Fail clearly instead of an undefined index followed by modulo by zero.
            throw new RuntimeException('Proxy pool is empty');
        }
        $proxy = $this->proxies[$this->current % $count];
        $this->current = ($this->current + 1) % $count;
        return $proxy;
    }

    /**
     * Record a failure for $proxy and drop it once the threshold is reached.
     * Unknown proxies are ignored.
     *
     * @param string $proxy Proxy previously returned by getNext().
     */
    public function markFailed($proxy) {
        $index = array_search($proxy, $this->proxies, true);
        if ($index === false) {
            return;
        }
        $this->failures[$proxy] = ($this->failures[$proxy] ?? 0) + 1;
        if ($this->failures[$proxy] < $this->maxFailures) {
            return;
        }

        array_splice($this->proxies, $index, 1);
        unset($this->failures[$proxy]);

        // Keep the cursor on the same upcoming proxy after the list shrank.
        if ($index < $this->current) {
            $this->current--;
        }
        if ($this->current >= count($this->proxies)) {
            $this->current = 0;
        }
    }
}
// Route a cURL request through the next proxy in the pool.
$proxyPool = new ProxyPool([
    'http://user:pass@proxy1.com:8080',
    'http://proxy2.com:3128'
]);
$proxy = $proxyPool->getNext();
// The handle has to exist before a proxy can be set on it.
$ch = curl_init('https://example.com');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_PROXY, $proxy);
if (curl_exec($ch) === false) {
    // Report the failure so the pool can count it against this proxy.
    $proxyPool->markFailed($proxy);
}
curl_close($ch);
5.2 队列与异步处理
<?php
use React\Http\Browser;
use React\Promise\Promise;
class AsyncCrawler {
    /** @var Browser ReactPHP HTTP client. */
    private $browser;
    /** @var int Maximum number of requests in flight at once. */
    private $concurrency = 5;

    /**
     * @param int $concurrency Maximum simultaneous requests (minimum 1).
     */
    public function __construct($concurrency = 5) {
        $this->browser = new Browser();
        $this->concurrency = max(1, (int) $concurrency);
    }

    /**
     * Fetch all URLs with at most $concurrency requests in flight, then block
     * until every request has finished.
     *
     * Must be called from outside a running event loop, because it runs the
     * default loop itself.
     *
     * @param array    $urls      URLs to fetch.
     * @param callable $onSuccess function (string $url, string $body): called per successful response.
     * @param callable $onError   function (string $url, $error): called per failed request.
     */
    public function crawlConcurrent(array $urls, callable $onSuccess, callable $onError) {
        $pending = new \SplQueue();
        foreach ($urls as $url) {
            $pending->enqueue($url);
        }

        // Each finished request pulls the next URL from the queue, so the
        // number of in-flight requests never exceeds the worker count below.
        $startNext = function () use (&$startNext, $pending, $onSuccess, $onError) {
            if ($pending->isEmpty()) {
                return;
            }
            $url = $pending->dequeue();
            $this->browser->get($url)->then(
                function ($response) use ($onSuccess, $url) {
                    $onSuccess($url, (string)$response->getBody());
                },
                function ($error) use ($onError, $url) {
                    $onError($url, $error);
                }
            )->then($startNext, $startNext); // continue whether the callback succeeded or threw
        };

        $workers = min($this->concurrency, count($urls));
        for ($i = 0; $i < $workers; $i++) {
            $startNext();
        }

        // ReactPHP promises have no wait(); running the default loop blocks
        // until all scheduled requests have completed.
        \React\EventLoop\Loop::run();
    }
}
5.3 完整采集流程控制
<?php
class CrawlEngine {
    /** @var array Configuration passed to the constructor. */
    private $config;
    /** @var DataStorage Persistence layer. */
    private $storage;
    /** @var Monolog\Logger Logger on the 'crawler' channel. */
    private $logger;

    /**
     * @param array $config Must contain db.dsn, db.user and db.pass.
     */
    public function __construct(array $config) {
        $this->config = $config;
        $this->storage = new DataStorage(
            $config['db']['dsn'],
            $config['db']['user'],
            $config['db']['pass']
        );
        $this->logger = new Monolog\Logger('crawler');
    }

    /**
     * Process every task; a failing task is logged and handed to retryTask()
     * without stopping the remaining tasks.
     *
     * @param array $tasks List of task arrays (keys read here: url, parser, table, id).
     */
    public function run(array $tasks) {
        foreach ($tasks as $task) {
            try {
                $this->processTask($task);
            } catch (Exception $e) {
                $this->logger->error("任务失败", [
                    'task' => $task,
                    'error' => $e->getMessage()
                ]);
                // NOTE(review): retryTask() is not defined in the visible source — confirm where it lives.
                $this->retryTask($task);
            }
        }
    }

    /**
     * Pipeline for one task: download, parse, clean, store, mark completed.
     *
     * NOTE(review): parse(), clean() and markCompleted() are not defined in the
     * visible source — confirm they are provided elsewhere.
     */
    private function processTask($task) {
        // 1. Download the page
        $html = $this->download($task['url']);
        // 2. Parse the data
        $data = $this->parse($html, $task['parser']);
        // 3. Clean the data
        $cleaned = $this->clean($data);
        // 4. Store
        $this->storage->saveToMySQL($task['table'], $cleaned);
        // 5. Update the task status
        $this->markCompleted($task['id']);
    }

    /**
     * Download a URL, retrying with exponential back-off (1s, 2s, 4s, ...).
     *
     * Both a non-200 status and a transport exception count as a failed attempt.
     *
     * @param string $url     URL to fetch.
     * @param int    $retries Maximum number of attempts.
     * @return string Response body of the first HTTP 200 answer.
     * @throws Exception When every attempt fails; the last transport error, if
     *                   any, is attached as the previous exception.
     */
    private function download($url, $retries = 3) {
        $http = new HttpClient();
        $lastError = null;
        for ($i = 0; $i < $retries; $i++) {
            try {
                $response = $http->request($url);
                if ($response['code'] === 200) {
                    return $response['body'];
                }
            } catch (Exception $e) {
                // A timeout or connection error is retried like a bad status code.
                $lastError = $e;
            }
            // Back off only when another attempt follows.
            if ($i < $retries - 1) {
                sleep(pow(2, $i));
            }
        }
        throw new Exception("下载失败: $url", 0, $lastError);
    }
}
六、最佳实践与注意事项
下表汇总了数据采集时应遵循的最佳实践:
| 实践项 | 说明 |
|---|---|
| 遵守Robots协议 | 检查目标网站的robots.txt,尊重爬取限制 |
| 设置合理延迟 | 使用usleep(500000)(即每次请求间隔0.5秒)控制请求频率,避免被封 |
| User-Agent轮换 | 准备多个浏览器UA标识,定期更换 |
| 异常处理 | 网络超时、HTTP错误、解析失败均需捕获处理 |
| 数据验证 | 对采集结果进行格式校验,过滤无效数据 |
| 日志记录 | 记录请求URL、状态码、耗时,便于排查问题 |
| 增量更新 | 使用哈希或时间戳实现增量采集,减少重复工作 |