使用PHP采集数据的完整技术文章,涵盖多种场景和最佳实践

6 阅读4分钟

PHP数据采集技术实战指南

一、基础采集:file_get_contents与cURL

1.1 简单页面抓取

<?php
// Basic approach: file_get_contents (suitable for simple GET requests).
// It returns false on failure, so the result is checked before it is used.
$html = file_get_contents('https://example.com');
if ($html === false) {
    throw new RuntimeException('Failed to fetch https://example.com');
}
echo $html;

// With a stream context: send a browser-like User-Agent and bound the wait
// with an explicit timeout (in seconds) instead of the ini default.
$opts = [
    'http' => [
        'method' => 'GET',
        'header' => 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0',
        'timeout' => 30,
    ]
];
$context = stream_context_create($opts);
$html = file_get_contents('https://example.com', false, $context);
if ($html === false) {
    throw new RuntimeException('Failed to fetch https://example.com');
}

1.2 cURL高级采集(推荐)

<?php
class HttpClient {

    /** @var bool Whether TLS peer certificates and host names are verified. */
    private $verifySsl;

    /**
     * @param bool $verifySsl Verify TLS certificates (default). Pass false only
     *                        for hosts with known-broken certificates; disabling
     *                        verification allows man-in-the-middle tampering.
     */
    public function __construct($verifySsl = true) {
        $this->verifySsl = (bool)$verifySsl;
    }

    /**
     * Send a single HTTP request.
     *
     * @param string       $url     Target URL.
     * @param string       $method  HTTP method, case-insensitive.
     * @param array|string $data    Request data: appended to the query string for GET,
     *                              sent as the request body for every other method.
     * @param string[]     $headers Extra "Name: value" header lines, appended after the defaults.
     * @return array{code:int, body:string} HTTP status code and response body.
     * @throws Exception When cURL reports a transport-level error.
     */
    public function request($url, $method = 'GET', $data = [], $headers = []) {
        $method = strtoupper($method);
        $hasData = $data !== [] && $data !== '' && $data !== null;
        $payload = is_array($data) ? http_build_query($data) : $data;

        // A GET request has no body, so its data travels in the query string.
        if ($method === 'GET' && $hasData) {
            $url .= (strpos($url, '?') === false ? '?' : '&') . $payload;
        }

        $ch = $this->createHandle($url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // Browser-like default headers.
        $defaultHeaders = [
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8'
        ];
        curl_setopt($ch, CURLOPT_HTTPHEADER, array_merge($defaultHeaders, $headers));

        if ($method === 'POST') {
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        } elseif ($method === 'HEAD') {
            // HEAD responses carry no body; NOBODY stops cURL from waiting for one.
            curl_setopt($ch, CURLOPT_NOBODY, true);
        } elseif ($method !== 'GET') {
            // PUT, PATCH, DELETE, ...: previously these were silently sent as GET.
            curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
            if ($hasData) {
                curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
            }
        }

        // Read everything needed from the handle before it is closed.
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($response === false || $error !== '') {
            throw new Exception("cURL Error: $error");
        }

        return [
            'code' => $httpCode,
            'body' => $response
        ];
    }

    /**
     * Fetch several URLs in parallel with plain GET requests.
     *
     * @param array $urls URLs keyed by any identifier; keys are preserved in the result.
     * @return array Response bodies keyed like $urls, as returned by curl_multi_getcontent().
     * @throws Exception When a cURL handle cannot be created.
     */
    public function multiRequest(array $urls) {
        $mh = curl_multi_init();
        $handles = [];
        $results = [];

        foreach ($urls as $key => $url) {
            $ch = $this->createHandle($url);
            curl_multi_add_handle($mh, $ch);
            $handles[$key] = $ch;
        }

        // Drive all transfers until none is still running or the multi stack fails.
        $running = 0;
        do {
            $status = curl_multi_exec($mh, $running);
            if ($status === CURLM_OK && $running > 0 && curl_multi_select($mh, 1.0) === -1) {
                // select() can return -1 immediately; pause briefly instead of spinning.
                usleep(1000);
            }
        } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

        // Collect the bodies and release every handle.
        foreach ($handles as $key => $ch) {
            $results[$key] = curl_multi_getcontent($ch);
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        curl_multi_close($mh);
        return $results;
    }

    /**
     * Create a cURL handle with the options shared by single and parallel requests.
     *
     * @param string $url Target URL.
     * @return resource|\CurlHandle
     * @throws Exception When the handle cannot be initialised.
     */
    private function createHandle($url) {
        $ch = curl_init();
        if ($ch === false) {
            throw new Exception('cURL Error: unable to initialise handle');
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $this->verifySsl);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $this->verifySsl ? 2 : 0);

        return $ch;
    }
}

二、数据解析:正则与DOM操作

2.1 正则表达式提取

<?php
class RegexParser {

    /**
     * Collect every match of $pattern in $html.
     *
     * @param string $html    Text to search.
     * @param string $pattern PCRE pattern including delimiters.
     * @return array Values of the first capture group, or the full matches when
     *               the pattern has no group; empty array when matching fails.
     */
    public function extractAll($html, $pattern) {
        preg_match_all($pattern, $html, $matches);
        return $matches[1] ?? $matches[0] ?? [];
    }

    /**
     * Return the first match of $pattern in $html.
     *
     * @param string $html    Text to search.
     * @param string $pattern PCRE pattern including delimiters.
     * @return string|null First capture group, or the full match when the pattern
     *                     has no group; null when nothing matches.
     */
    public function extractOne($html, $pattern) {
        preg_match($pattern, $html, $match);
        return $match[1] ?? $match[0] ?? null;
    }

    /**
     * Ready-made patterns for common scraping targets.
     *
     * Inside the patterns, the "/" delimiter, quote characters and literal
     * "." / "+" are escaped so each entry is a valid PHP string and a valid regex.
     *
     * @return array<string, string> Pattern name => PCRE pattern.
     */
    public function commonPatterns() {
        return [
            'title' => '/<title>(.*?)<\/title>/is',
            'links' => '/<a[^>]+href=["\']([^"\']+)["\'][^>]*>/i',
            'images' => '/<img[^>]+src=["\']([^"\']+)["\'][^>]*>/i',
            'emails' => '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/',
            'phones' => '/1[3-9]\d{9}/',
            // Alternation, not a character class: without the "u" modifier a class
            // would match single bytes of the multi-byte yen signs.
            'price' => '/(?:¥|¥)\s*(\d+(?:\.\d{1,2})?)/',
            'json_ld' => '/<script type="application\/ld\+json">(.*?)<\/script>/is'
        ];
    }
}

// Usage example
$parser = new RegexParser();
$html = file_get_contents('https://example.com/product.html');
if ($html === false) {
    throw new RuntimeException('Failed to fetch https://example.com/product.html');
}

// Extract the price
$price = $parser->extractOne($html, '/class="price"[^>]*>([\d.]+)/');
// Extract every data-src attribute value (quotes are escaped inside the single-quoted pattern)
$images = $parser->extractAll($html, '/data-src=["\']([^"\']+)["\']/i');

2.2 DOMDocument解析(结构化数据)

<?php
class DomParser {

    private $dom;
    private $xpath;

    /**
     * Parse an HTML string and prepare it for XPath queries.
     *
     * @param string $html HTML markup; it is prefixed with an XML encoding
     *                     declaration so libxml reads it as UTF-8.
     */
    public function __construct($html) {
        $this->dom = new DOMDocument();

        // Collect parser warnings (e.g. for HTML5 tags) internally instead of emitting
        // them, then put the process-wide libxml setting back the way it was found.
        $previous = libxml_use_internal_errors(true);
        try {
            $this->dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Run an XPath query.
     *
     * @param string       $expression  XPath expression.
     * @param DOMNode|null $contextNode Node that relative expressions are evaluated against.
     * @return DOMNodeList|false Matching nodes, or false when the expression is invalid.
     */
    public function query($expression, $contextNode = null) {
        return $this->xpath->query($expression, $contextNode);
    }

    /**
     * Extract a product list (example for an e-commerce listing page).
     *
     * @return array[] One array per "product-item" element with the keys
     *                 title, price, image, link and rating; a value is null
     *                 when the corresponding element is absent.
     */
    public function extractProducts() {
        $products = [];
        $nodes = $this->query('//div[contains(@class, "product-item")]');
        if ($nodes === false) {
            return $products;
        }

        foreach ($nodes as $node) {
            $products[] = [
                'title' => $this->getText('.//h3[@class="title"]', $node),
                'price' => $this->getText('.//span[@class="price"]', $node),
                'image' => $this->getAttr('.//img', 'src', $node),
                'link' => $this->getAttr('.//a', 'href', $node),
                'rating' => $this->getText('.//div[@class="rating"]', $node)
            ];
        }

        return $products;
    }

    /**
     * Trimmed text content of the first node matching $xpath, or null.
     */
    private function getText($xpath, $context = null) {
        $node = $this->firstNode($xpath, $context);
        return $node ? trim($node->textContent) : null;
    }

    /**
     * Value of attribute $attr on the first node matching $xpath, or null.
     */
    private function getAttr($xpath, $attr, $context = null) {
        $node = $this->firstNode($xpath, $context);
        return $node ? $node->getAttribute($attr) : null;
    }

    /**
     * First node matching $xpath, or null when nothing matches or the query fails.
     */
    private function firstNode($xpath, $context = null) {
        $nodes = $this->query($xpath, $context);
        // query() yields false for an invalid expression; calling item() on it would be fatal.
        return $nodes === false ? null : $nodes->item(0);
    }
}

// Usage example
$html = file_get_contents('https://shop.example.com/list.html');
if ($html === false) {
    // file_get_contents() returns false on failure; do not parse a failed download.
    throw new RuntimeException('Failed to fetch https://shop.example.com/list.html');
}
$parser = new DomParser($html);
$products = $parser->extractProducts();
print_r($products);

三、实战案例:采集京东商品信息

<?php
class JDCrawler {

    private $http;
    private $cookieJar;

    public function __construct() {
        $this->http = new HttpClient();
        // NOTE(review): this temp file is created and removed again in __destruct(),
        // but nothing in this class hands it to HttpClient, so no cookies are stored in it.
        $this->cookieJar = tempnam(sys_get_temp_dir(), 'jd_cookie_');
    }

    /**
     * Fetch and parse one product detail page.
     *
     * @param string $skuId Product SKU id used in the item URL.
     * @return array Parsed product data, see parseProduct().
     * @throws Exception On transport errors or a non-200 response.
     */
    public function getProductDetail($skuId) {
        $url = "https://item.jd.com/{$skuId}.html";

        $headers = [
            'Referer: https://www.jd.com/',
            // Only advertise gzip: ungzip() below cannot decode deflate or brotli bodies.
            'Accept-Encoding: gzip'
        ];

        $response = $this->http->request($url, 'GET', [], $headers);

        if ($response['code'] !== 200) {
            throw new Exception("获取失败,HTTP状态码: {$response['code']}");
        }

        $html = $this->ungzip($response['body']);

        return $this->parseProduct($html, $skuId);
    }

    /**
     * Build the product record from the page HTML.
     *
     * @param string $html  Decoded page markup.
     * @param string $skuId Product SKU id.
     * @return array Keys: sku_id, name, price, comment_count, images, params, url, crawled_at.
     */
    private function parseProduct($html, $skuId) {
        $parser = new RegexParser();

        // Product name. The "/" of the closing tag is escaped because "/" is the delimiter.
        $name = $parser->extractOne($html, '/<div class="sku-name">(.*?)<\/div>/s');
        // extractOne() returns null when nothing matches; strip_tags() needs a string.
        $name = strip_tags((string)$name);

        // Price comes from the separate price API.
        $price = $this->getPrice($skuId);

        // Number of comments.
        $commentCount = $parser->extractOne($html, '/comment-count[^>]*>(\d+)/');

        // Product images; "#" is the delimiter so the "//" in the URL needs no escaping.
        $images = $parser->extractAll($html, '#data-origin="(https://img[^"]+)"#');

        // Specification table.
        $params = $this->extractParams($html);

        return [
            'sku_id' => $skuId,
            'name' => $this->cleanText($name),
            'price' => $price,
            'comment_count' => $commentCount,
            'images' => $images,
            'params' => $params,
            'url' => "https://item.jd.com/{$skuId}.html",
            'crawled_at' => date('Y-m-d H:i:s')
        ];
    }

    /**
     * Query the price API for the current price.
     *
     * @param string $skuId Product SKU id.
     * @return string|null The "p" field of the first entry, else its "op" field, else null.
     */
    private function getPrice($skuId) {
        $url = "https://p.3.cn/prices/mgets?skuIds=J_{$skuId}";
        $response = $this->http->request($url);
        $data = json_decode($response['body'], true);

        // "??" also covers a failed decode (null) or an unexpected response shape.
        return $data[0]['p'] ?? $data[0]['op'] ?? null;
    }

    /**
     * Read the specification table into a key => value map.
     *
     * @param string $html Page markup.
     * @return array<string, string> First-cell text => second-cell text for each
     *                               row of the table with class "parameter2".
     */
    private function extractParams($html) {
        $params = [];

        // loadHTML() rejects an empty string, so there is nothing to parse in that case.
        if ((string)$html === '') {
            return $params;
        }

        $dom = new DOMDocument();
        // Collect parser warnings internally, then restore the process-wide libxml setting.
        $previous = libxml_use_internal_errors(true);
        try {
            $dom->loadHTML($html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $xpath = new DOMXPath($dom);
        $rows = $xpath->query('//table[@class="parameter2"]//tr');

        foreach ($rows as $row) {
            $tds = $xpath->query('td', $row);
            if ($tds->length >= 2) {
                $key = trim($tds->item(0)->textContent);
                $value = trim($tds->item(1)->textContent);
                $params[$key] = $value;
            }
        }

        return $params;
    }

    /**
     * Decompress a gzip-encoded body; other data is returned unchanged.
     *
     * @param string $data Raw response body.
     * @return string Decoded body, or $data itself when it is not valid gzip.
     */
    private function ungzip($data) {
        // 0x1f 0x8b is the gzip magic number.
        if (substr($data, 0, 2) === "\x1f\x8b") {
            $decoded = gzdecode($data);
            // gzdecode() returns false on corrupt input; never return false as the page.
            return $decoded === false ? $data : $decoded;
        }
        return $data;
    }

    /**
     * Trim the text and collapse every whitespace run into a single space.
     */
    private function cleanText($text) {
        return preg_replace('/\s+/', ' ', trim((string)$text));
    }

    public function __destruct() {
        // tempnam() may have returned false, so make sure there is a path to remove.
        if (is_string($this->cookieJar) && file_exists($this->cookieJar)) {
            unlink($this->cookieJar);
        }
    }
}

// Usage example: fetch one product and print a short summary of it.
try {
    $crawler = new JDCrawler();
    $product = $crawler->getProductDetail('100012043978');

    // echo with several arguments evaluates and prints them one after another.
    echo "商品名称: {$product['name']}\n",
         "当前价格: ¥{$product['price']}\n",
         "评价数量: {$product['comment_count']}\n",
         "规格参数:\n";
    print_r($product['params']);
} catch (Exception $failure) {
    echo "错误: " . $failure->getMessage();
}

四、数据存储与导出

<?php
class DataStorage {

    private $pdo;

    /**
     * Open the database connection.
     *
     * @param string      $dsn  PDO data source name.
     * @param string|null $user Database user.
     * @param string|null $pass Database password.
     */
    public function __construct($dsn, $user, $pass) {
        $this->pdo = new PDO($dsn, $user, $pass, [
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC
        ]);
    }

    /**
     * Insert one row into MySQL, updating the existing row on a duplicate key.
     *
     * @param string $table Table name, optionally qualified as "schema.table".
     * @param array  $data  Column name => value.
     * @return bool Result of PDOStatement::execute().
     * @throws InvalidArgumentException When $data is empty or a name is not a plain identifier.
     */
    public function saveToMySQL($table, $data) {
        if (empty($data)) {
            throw new InvalidArgumentException('saveToMySQL() requires at least one column');
        }
        $this->assertIdentifier($table, true);

        $columns = [];
        $placeholders = [];
        $assignments = [];
        $bindings = [];

        foreach (array_keys($data) as $index => $column) {
            $column = (string)$column;
            $this->assertIdentifier($column);

            // Generated placeholder names keep binding independent of the column
            // spelling (PDO named parameters only allow ASCII word characters).
            $placeholder = ":p{$index}";
            $columns[] = $column;
            $placeholders[] = $placeholder;
            $assignments[] = "{$column}=VALUES({$column})";
            $bindings[$placeholder] = $data[$column];
        }

        $sql = "INSERT INTO {$table} (" . implode(', ', $columns) . ") VALUES ("
            . implode(', ', $placeholders) . ") ON DUPLICATE KEY UPDATE "
            . implode(', ', $assignments);

        $stmt = $this->pdo->prepare($sql);
        return $stmt->execute($bindings);
    }

    /**
     * Write rows to a CSV file, using the keys of the first row as the header line.
     *
     * @param string  $filename Destination path.
     * @param array[] $data     Rows to write.
     * @return string $filename.
     * @throws RuntimeException When the file cannot be opened for writing.
     */
    public function exportCSV($filename, array $data) {
        $fp = fopen($filename, 'w');
        if ($fp === false) {
            throw new RuntimeException("Unable to open {$filename} for writing");
        }

        try {
            // Header line; reset() finds the first row even if the list is not keyed from 0.
            if (!empty($data)) {
                fputcsv($fp, array_keys(reset($data)));
            }

            foreach ($data as $row) {
                fputcsv($fp, $row);
            }
        } finally {
            // Release the file handle even if a row cannot be written.
            fclose($fp);
        }

        return $filename;
    }

    /**
     * Write data to a file as pretty-printed JSON with unescaped Unicode.
     *
     * @param string $filename Destination path.
     * @param mixed  $data     Any JSON-encodable value.
     * @return string $filename.
     * @throws JsonException    When $data cannot be encoded.
     * @throws RuntimeException When the file cannot be written.
     */
    public function exportJSON($filename, $data) {
        $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);

        if (file_put_contents($filename, $json) === false) {
            throw new RuntimeException("Unable to write {$filename}");
        }

        return $filename;
    }

    /**
     * Insert many rows using multi-row INSERT statements.
     *
     * The column list is taken from the first row; every row must provide all
     * of those columns (in any order).
     *
     * @param string  $table     Table name, optionally qualified as "schema.table".
     * @param array[] $rows      Rows as column name => value.
     * @param int     $batchSize Maximum number of rows per statement.
     * @return int Sum of the affected-row counts reported for each statement.
     * @throws InvalidArgumentException On an invalid identifier or a row missing a column.
     */
    public function batchInsert($table, array $rows, $batchSize = 100) {
        if (empty($rows)) return 0;

        $this->assertIdentifier($table, true);

        $columns = array_map('strval', array_keys(reset($rows)));
        foreach ($columns as $column) {
            $this->assertIdentifier($column);
        }

        $columnStr = implode(', ', $columns);
        $placeholder = '(' . implode(', ', array_fill(0, count($columns), '?')) . ')';

        $total = 0;
        $chunks = array_chunk($rows, max(1, (int)$batchSize));

        foreach ($chunks as $chunk) {
            $placeholders = implode(', ', array_fill(0, count($chunk), $placeholder));
            $sql = "INSERT INTO {$table} ({$columnStr}) VALUES {$placeholders}";

            $values = [];
            foreach ($chunk as $row) {
                // Pick values by column name so a row whose keys are in a different
                // order still lands in the right columns.
                foreach ($columns as $column) {
                    if (!array_key_exists($column, $row)) {
                        throw new InvalidArgumentException("Row is missing column: {$column}");
                    }
                    $values[] = $row[$column];
                }
            }

            $stmt = $this->pdo->prepare($sql);
            $stmt->execute($values);
            $total += $stmt->rowCount();
        }

        return $total;
    }

    /**
     * Reject names that are not plain SQL identifiers.
     *
     * Table and column names are interpolated into the SQL text (they cannot be
     * bound as parameters), so anything beyond identifier characters is refused.
     *
     * @param string $name         Identifier to check.
     * @param bool   $allowDotted  Accept a single "schema.table" qualification.
     * @throws InvalidArgumentException When the name is not a valid identifier.
     */
    private function assertIdentifier($name, $allowDotted = false) {
        $parts = $allowDotted ? explode('.', (string)$name, 2) : [(string)$name];

        foreach ($parts as $part) {
            // ASCII letters, digits, "_", "$" and non-ASCII letters; not digits only.
            if (preg_match('/^(?!\d+$)[A-Za-z0-9_$\x{80}-\x{FFFF}]+$/u', $part) !== 1) {
                throw new InvalidArgumentException("Invalid SQL identifier: {$name}");
            }
        }
    }
}

五、高级技巧与优化

5.1 代理池与IP轮换

<?php
class ProxyPool {

    private $proxies = [];
    private $current = 0;
    /** @var array<string, int> Failure count per proxy still in the pool. */
    private $failures = [];
    /** @var int Number of reported failures after which a proxy is dropped. */
    private $maxFailures;

    /**
     * @param string[] $proxyList   Proxy URLs; they are shuffled once on construction.
     * @param int      $maxFailures Failures tolerated before a proxy is removed (minimum 1).
     */
    public function __construct(array $proxyList, $maxFailures = 3) {
        $this->proxies = $proxyList;
        shuffle($this->proxies);
        $this->maxFailures = max(1, (int)$maxFailures);
    }

    /**
     * Return the next proxy in round-robin order.
     *
     * @return string
     * @throws RuntimeException When the pool holds no proxies.
     */
    public function getNext() {
        // Guard first: indexing an empty list and "% 0" would both fail below.
        if (empty($this->proxies)) {
            throw new RuntimeException('Proxy pool is empty');
        }

        $proxy = $this->proxies[$this->current];
        $this->current = ($this->current + 1) % count($this->proxies);
        return $proxy;
    }

    /**
     * Record a failure for $proxy and remove it once the threshold is reached.
     *
     * Unknown proxies are ignored.
     *
     * @param string $proxy Proxy URL as returned by getNext().
     */
    public function markFailed($proxy) {
        $index = array_search($proxy, $this->proxies, true);
        if ($index === false) {
            return;
        }

        $this->failures[$proxy] = ($this->failures[$proxy] ?? 0) + 1;
        if ($this->failures[$proxy] < $this->maxFailures) {
            return;
        }

        array_splice($this->proxies, $index, 1);
        unset($this->failures[$proxy]);

        // Keep the cursor on the same upcoming proxy after the list shrank.
        if ($index < $this->current) {
            $this->current--;
        }
        if ($this->current >= count($this->proxies)) {
            $this->current = 0;
        }
    }
}

// Using a proxy
$proxyPool = new ProxyPool([
    'http://user:pass@proxy1.com:8080',
    'http://proxy2.com:3128'
]);

$proxy = $proxyPool->getNext();
// The handle has to exist before options can be set on it.
$ch = curl_init('https://example.com');
curl_setopt($ch, CURLOPT_PROXY, $proxy);

5.2 队列与异步处理

<?php
use React\Http\Browser;
use React\Promise\Promise;

class AsyncCrawler {

    private $browser;
    private $concurrency = 5;

    public function __construct() {
        $this->browser = new Browser();
    }

    /**
     * Fetch the given URLs with at most $this->concurrency requests in flight,
     * and return once every URL has been handled.
     *
     * ReactPHP promises have no wait() method, so completion is reached by
     * running the event loop until no work is left.
     * NOTE(review): assumes the global loop facade \React\EventLoop\Loop
     * (react/event-loop >= 1.2) is installed with react/http, and that this
     * method is not called from inside an already running loop — confirm.
     *
     * @param string[] $urls      URLs to fetch.
     * @param callable $onSuccess Called as $onSuccess($url, $body) for each successful response.
     * @param callable $onError   Called as $onError($url, $error) for each failed request.
     */
    public function crawlConcurrent(array $urls, callable $onSuccess, callable $onError) {
        $pending = new \SplQueue();
        foreach ($urls as $url) {
            $pending->enqueue($url);
        }

        // Start the next queued URL; each finished request pulls the following one,
        // which keeps the number of simultaneous requests bounded.
        $startNext = function () use (&$startNext, $pending, $onSuccess, $onError) {
            if ($pending->isEmpty()) {
                return;
            }

            $url = $pending->dequeue();
            $this->browser->get($url)->then(
                function ($response) use ($onSuccess, $url) {
                    $onSuccess($url, (string)$response->getBody());
                },
                function ($error) use ($onError, $url) {
                    $onError($url, $error);
                }
            )->then($startNext, $startNext);
        };

        for ($slot = 0; $slot < $this->concurrency && !$pending->isEmpty(); $slot++) {
            $startNext();
        }

        \React\EventLoop\Loop::run();

        // Break the closure's reference to itself so it can be garbage-collected.
        $startNext = null;
    }
}

5.3 完整采集流程控制

<?php
// NOTE(review): run() and processTask() call retryTask(), parse(), clean() and
// markCompleted(), none of which are defined in this class as shown; they must
// be supplied before the engine can run.
class CrawlEngine {

    private $config;
    private $storage;
    private $logger;

    /**
     * @param array $config Must contain $config['db'] with the keys dsn, user and pass.
     */
    public function __construct(array $config) {
        $this->config = $config;
        $this->storage = new DataStorage(
            $config['db']['dsn'],
            $config['db']['user'],
            $config['db']['pass']
        );
        $this->logger = new Monolog\Logger('crawler');
    }

    /**
     * Process every task; a failing task is logged and handed to retryTask().
     *
     * @param array[] $tasks Task definitions, see processTask() for the keys that are read.
     */
    public function run(array $tasks) {
        foreach ($tasks as $task) {
            try {
                $this->processTask($task);
            } catch (Exception $e) {
                $this->logger->error("任务失败", [
                    'task' => $task,
                    'error' => $e->getMessage()
                ]);
                $this->retryTask($task);
            }
        }
    }

    /**
     * Run one task through download, parse, clean, store and mark-completed.
     *
     * @param array $task Reads the keys url, parser, table and id.
     */
    private function processTask($task) {
        // 1. Download the page
        $html = $this->download($task['url']);

        // 2. Parse the data
        $data = $this->parse($html, $task['parser']);

        // 3. Clean the data
        $cleaned = $this->clean($data);

        // 4. Store it
        $this->storage->saveToMySQL($task['table'], $cleaned);

        // 5. Update the task status
        $this->markCompleted($task['id']);
    }

    /**
     * Download a URL, retrying with exponential backoff.
     *
     * Both non-200 responses and transport exceptions count as a failed attempt.
     *
     * @param string $url     URL to fetch.
     * @param int    $retries Maximum number of attempts.
     * @return string Response body of the first attempt that returned HTTP 200.
     * @throws Exception When every attempt failed; the last transport exception,
     *                   if there was one, is attached as the previous exception.
     */
    private function download($url, $retries = 3) {
        $http = new HttpClient();
        $lastException = null;

        for ($attempt = 0; $attempt < $retries; $attempt++) {
            // Back off before each retry (1s, 2s, 4s, ...), never after the final attempt.
            if ($attempt > 0) {
                sleep(2 ** ($attempt - 1));
            }

            try {
                $response = $http->request($url);

                if ($response['code'] === 200) {
                    return $response['body'];
                }
            } catch (Exception $e) {
                // request() throws on transport errors; treat it as one failed attempt.
                $lastException = $e;
            }
        }

        throw new Exception("下载失败: $url", 0, $lastException);
    }
}

六、最佳实践与注意事项

表格

| 实践项 | 说明 |
| --- | --- |
| 遵守Robots协议 | 检查目标网站的robots.txt,尊重爬取限制 |
| 设置合理延迟 | 使用usleep(500000)控制请求频率,避免被封 |
| User-Agent轮换 | 准备多个浏览器UA标识,定期更换 |
| 异常处理 | 网络超时、HTTP错误、解析失败均需捕获处理 |
| 数据验证 | 对采集结果进行格式校验,过滤无效数据 |
| 日志记录 | 记录请求URL、状态码、耗时,便于排查问题 |
| 增量更新 | 使用哈希或时间戳实现增量采集,减少重复工作 |