前言
本文面向正在构建亚马逊选品数据系统的开发者,深入探讨选品所需的数据架构原理、各数据层的采集技术挑战,以及如何基于 Pangolinfo API 快速建立企业级数据采集体系。
TL;DR
- 亚马逊选品涉及五大数据层:趋势、需求、竞争、财务、用户洞察
- 不同数据层的时效性要求差异极大(小时级 vs. 月级),需要分层调度
- 月销量估算误差率高达30-50%,不能用于精确财务建模
- Pangolinfo API 提供实时采集、98% SP 广告位覆盖率、Customer Says 字段支持
- 架构推荐:Pangolinfo API + PostgreSQL 时序存储 + 异步任务调度
一、数据层架构原理
五大数据层
Layer 1: 趋势层 (Trend)
├── BSR Best Sellers → 每日采集
├── BSR New Releases → 每日采集
├── Movers & Shakers → 每小时采集(最高价值趋势信号)
└── 新品上架速度 → 每周统计
Layer 2: 需求层 (Demand)
├── 关键词搜索量趋势 → 每月采集(可接受延迟)
├── 价位段销量分布 → 每周统计
└── ASIN 销量估算 → 每周(仅作方向参考,误差30-50%)
Layer 3: 竞争层 (Competition) ← 最复杂,数据价值最高
├── ASIN 多维度快照 → 每日采集
├── 价格历史趋势 → 每日积累
├── 评论数量热度曲线 → 每周采集
├── SP 广告位分布(98%) → 每日采集
└── Listing 完整度评分 → 每周
Layer 4: 财务层 (Financial)
├── 竞品价格 & 历史价格 → 每日采集
├── Coupon & 促销数据 → 每日采集
└── FBA 费率(动态) → 每月更新
Layer 5: 用户洞察层 (User Insight)
├── 评论完整文本 → 每月批量采集
├── Q&A 数据 → 每月采集
└── Customer Says → 每周(新 ASIN 优先)
二、Pangolinfo API 快速集成
2.1 基础 ASIN 数据采集
import requests
from typing import Optional, List
class AmazonDataCollector:
    """
    Amazon data collector built on the Pangolinfo Scrape API.

    Docs: https://docs.pangolinfo.com/
    """

    BASE_URL = "https://api.pangolinfo.com/v1"
    # Hard cap (seconds) for every HTTP call; `requests` has no default
    # timeout and would otherwise block forever on a hung connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key: str):
        """Create a session with bearer-token auth shared by all calls."""
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        })

    def _post(self, path: str, payload: dict) -> dict:
        """POST *payload* to BASE_URL + *path* and return the decoded JSON.

        Raises:
            requests.HTTPError: on any 4xx/5xx response.
        """
        resp = self.session.post(
            f"{self.BASE_URL}{path}", json=payload, timeout=self.REQUEST_TIMEOUT
        )
        resp.raise_for_status()
        return resp.json()

    def get_product(
        self,
        asin: str,
        marketplace: str = "amazon.com",
        zip_code: Optional[str] = None
    ) -> dict:
        """
        Fetch product details for an ASIN.

        Important: includes the customer_says field (AI review summary,
        unsupported by most tools).
        Important: supports zip_code targeting (affects price and
        delivery information).
        """
        payload = {
            "asin": asin,
            "marketplace": marketplace,
            "fields": [
                "title", "price", "list_price", "coupon",
                "rating", "review_count", "reviews_breakdown",
                "bsr_rank", "bsr_category", "category_path",
                "main_image", "images", "bullet_points",
                "description", "aplus_content", "variants",
                "seller_info", "fulfillment_type",
                "customer_says",  # exclusive: AI-generated review summary
                "product_attributes"
            ]
        }
        if zip_code:
            payload["zip_code"] = zip_code
        return self._post("/amazon/product", payload)

    def get_keyword_search(
        self,
        keyword: str,
        marketplace: str = "amazon.com",
        page: int = 1,
        include_ads: bool = True
    ) -> dict:
        """
        Collect keyword search results.

        Key advantage: 98% SP ad-slot capture rate (industry-leading).
        Most tools only cover 30-60% of ad slots, which badly
        underestimates competitive pressure.
        """
        payload = {
            "keyword": keyword,
            "marketplace": marketplace,
            "page": page,
            "include_ads": include_ads,
            "ad_types": [
                "sp_top",           # exclusive top-of-page ad slot
                "sp_inline",        # remaining sponsored slots
                "sponsored_brand",  # brand ads
            ]
        }
        return self._post("/amazon/search", payload)

    def get_bestsellers(
        self,
        category_url: str,
        list_type: str = "movers_shakers",  # most sensitive trend signal
        marketplace: str = "amazon.com"
    ) -> dict:
        """
        Collect a BSR list.

        list_type options:
            - "best_sellers"   -> current top sellers (hourly refresh)
            - "new_releases"   -> new-release chart (hourly refresh)
            - "movers_shakers" -> fastest rank movers (hourly refresh,
              highest trend value)
        """
        payload = {
            "category_url": category_url,
            "list_type": list_type,
            "marketplace": marketplace,
            "max_items": 100
        }
        return self._post("/amazon/bestsellers", payload)

    def get_reviews(
        self,
        asin: str,
        marketplace: str = "amazon.com",
        star_filter: Optional[int] = None,  # 1-5; None means all stars
        max_reviews: int = 100
    ) -> dict:
        """
        Bulk review collection (Reviews Scraper API).

        Suited for competitor review semantics and pain-point mining.
        """
        payload = {
            "asin": asin,
            "marketplace": marketplace,
            "max_reviews": max_reviews,
            "include_body": True,
            "include_metadata": True  # timestamps, helpful-vote counts, etc.
        }
        # Only send the star filter when actually set; previously a JSON
        # null was sent, which the API could treat as an explicit filter.
        if star_filter is not None:
            payload["star_rating"] = star_filter
        return self._post("/amazon/reviews", payload)
2.2 批量异步采集(生产级别)
import asyncio
import aiohttp
from dataclasses import dataclass
from typing import Callable, Any
@dataclass
class CollectionTask:
    """One unit of batch work: a single Pangolinfo API POST request."""

    task_id: str  # caller-chosen identifier, echoed back in the result dict
    endpoint: str  # API path appended to the base URL, e.g. "/amazon/product"
    payload: dict  # JSON body sent with the POST
    callback: Optional[Callable] = None  # invoked as callback(task_id, result) on success
class AsyncBatchCollector:
    """
    Production-grade asynchronous batch collector.

    Intended for daily full-scale competitor monitoring. A semaphore caps
    the number of in-flight requests at *concurrency*.
    """

    def __init__(self, api_key: str, concurrency: int = 20):
        self.api_key = api_key
        self.semaphore = asyncio.Semaphore(concurrency)
        self.base_url = "https://api.pangolinfo.com/v1"
        # Headers are identical for every task; build them once instead of
        # once per request.
        self._headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    async def _execute_task(
        self,
        session: aiohttp.ClientSession,
        task: CollectionTask
    ) -> dict:
        """Run one task. Never raises: failures come back as result dicts
        with success=False so one bad task cannot abort the whole batch."""
        async with self.semaphore:
            url = f"{self.base_url}{task.endpoint}"
            try:
                async with session.post(
                    url, headers=self._headers, json=task.payload,
                    timeout=aiohttp.ClientTimeout(total=30)
                ) as resp:
                    # Bug fix: 4xx/5xx responses used to be reported as
                    # success. Raise so they land in the except branch.
                    resp.raise_for_status()
                    result = await resp.json()
                    if task.callback:
                        task.callback(task.task_id, result)
                    return {"task_id": task.task_id, "data": result, "success": True}
            except Exception as e:
                return {"task_id": task.task_id, "error": str(e), "success": False}

    async def run_batch(self, tasks: List[CollectionTask]) -> List[dict]:
        """Execute all *tasks* concurrently; return one result dict per task."""
        async with aiohttp.ClientSession() as session:
            coroutines = [self._execute_task(session, task) for task in tasks]
            return await asyncio.gather(*coroutines)
# Usage example: daily batch monitoring of competitor listings.
async def daily_competitor_sweep(competitor_asins: List[str]):
    """Snapshot price/rating/BSR/coupon for every competitor ASIN in one
    concurrent batch; print a success/failure tally and return the
    successful results."""
    watch_fields = ["price", "rating", "review_count", "bsr_rank", "coupon"]

    batch = []
    for asin in competitor_asins:
        batch.append(
            CollectionTask(
                task_id=f"asin_{asin}",
                endpoint="/amazon/product",
                payload={
                    "asin": asin,
                    "marketplace": "amazon.com",
                    "fields": watch_fields
                }
            )
        )

    runner = AsyncBatchCollector(api_key="your_key", concurrency=20)
    results = await runner.run_batch(batch)

    successful, failed = [], []
    for outcome in results:
        (successful if outcome["success"] else failed).append(outcome)

    print(f"采集完成: {len(successful)} 成功 / {len(failed)} 失败")
    return successful
2.3 评论 NLP 痛点分析
from collections import Counter
import re
from typing import Tuple
class ReviewPainPointExtractor:
    """
    Mines competitor pain points from raw review text.

    Core tool for differentiation analysis during product selection.
    """

    # English stop words (use a complete stop-word list in production).
    STOP_WORDS = {
        "this", "that", "with", "have", "would", "could", "when", "they",
        "very", "just", "from", "been", "were", "what", "than", "then",
        "them", "these", "those", "into", "your", "more", "only", "also"
    }

    def extract_pain_points(
        self,
        reviews: List[dict],
        min_star: int = 1,
        max_star: int = 2,
        top_n: int = 20
    ) -> dict:
        """
        Extract high-frequency pain-point terms from negative reviews.

        Args:
            reviews: list of review dicts (expects "rating", "body" and
                "helpful_votes" keys — TODO confirm against the Reviews API).
            min_star / max_star: star range treated as negative, default 1-2.
            top_n: number of top terms to return.

        Returns:
            dict with keys (bug fix: the annotation previously claimed a
            Tuple, but a dict has always been returned):
                total_negative_reviews: count of reviews in the star range
                top_pain_points_unigram: [(word, count), ...] top words
                top_pain_points_bigram: [(phrase, count), ...] top word pairs
                most_helpful_negative_reviews: up to 5 excerpts (200 chars)
                    ordered by helpful votes
        """
        # Reviews missing a rating default to 5 so they are never counted
        # as negative.
        negative_reviews = [
            r for r in reviews
            if min_star <= r.get("rating", 5) <= max_star
        ]
        # Concatenate all negative bodies into one lowercase corpus.
        all_text = " ".join([
            r.get("body", "") for r in negative_reviews
        ]).lower()
        # Words of 3+ letters; 2-grams capture more meaningful pain points.
        words = re.findall(r'\b[a-z]{3,}\b', all_text)
        filtered = [w for w in words if w not in self.STOP_WORDS]
        # 2-gram extraction
        bigrams = [
            f"{filtered[i]} {filtered[i+1]}"
            for i in range(len(filtered) - 1)
        ]
        keyword_freq = Counter(filtered).most_common(top_n)
        bigram_freq = Counter(bigrams).most_common(top_n)
        # Representative complaints: the negative reviews other shoppers
        # found most helpful.
        rep_reviews = sorted(
            negative_reviews,
            key=lambda r: r.get("helpful_votes", 0),
            reverse=True
        )[:5]
        return {
            "total_negative_reviews": len(negative_reviews),
            "top_pain_points_unigram": keyword_freq,
            "top_pain_points_bigram": bigram_freq,
            "most_helpful_negative_reviews": [
                r.get("body", "")[:200] for r in rep_reviews
            ]
        }
# End-to-end workflow example
async def product_differentiation_analysis(target_asin: str, api_key: str):
    """
    Competitor pain-point analysis -> differentiation-opportunity discovery.
    """
    collector = AmazonDataCollector(api_key)

    # Step 1: product overview snapshot.
    product = collector.get_product(target_asin)
    print(f"竞品: {product.get('title', '')[:50]}")
    print(f"评分: {product.get('rating')} ({product.get('review_count')} 评论)")
    print(f"Customer Says: {product.get('customer_says', 'N/A')}")

    # Step 2: pull negative reviews.
    # NOTE(review): star_filter=2 appears to request 2-star reviews only,
    # although the original comment said 1-2 stars — confirm the API's
    # star_rating semantics.
    reviews = collector.get_reviews(
        asin=target_asin,
        star_filter=2,
        max_reviews=200
    )

    # Step 3: pain-point extraction.
    analysis = ReviewPainPointExtractor().extract_pain_points(
        reviews.get("reviews", [])
    )

    print(f"\n差评总数: {analysis['total_negative_reviews']}")
    print("\n高频痛点词(选品改进方向):")
    for term, hits in analysis["top_pain_points_unigram"][:10]:
        print(f" '{term}': {hits}次")
    print("\n高频痛点词组:")
    for phrase, hits in analysis["top_pain_points_bigram"][:5]:
        print(f" '{phrase}': {hits}次")

    return analysis
三、广告位竞争分析
SP 广告位的采集完整率,直接决定了你对竞争强度的判断是否准确。
def analyze_keyword_competition(
    keyword: str,
    api_key: str,
    marketplace: str = "amazon.com"
) -> dict:
    """
    Gauge how competitive a keyword's search results page is.

    Includes the ad-slot breakdown (Pangolinfo SP capture rate: 98%).
    """
    search_data = AmazonDataCollector(api_key).get_keyword_search(
        keyword=keyword,
        marketplace=marketplace,
        include_ads=True
    )

    organic_results = search_data.get("organic_results", [])
    sponsored = search_data.get("sponsored_results", [])

    # Split sponsored slots by placement.
    sp_top, sp_inline = [], []
    for slot in sponsored:
        placement = slot.get("placement")
        if placement == "sp_top":
            sp_top.append(slot)
        elif placement == "sp_inline":
            sp_inline.append(slot)

    total_listings = len(organic_results) + len(sponsored)
    ad_ratio = len(sponsored) / max(total_listings, 1)

    # Coarse HIGH / MEDIUM / LOW bucket from the ad density.
    if ad_ratio > 0.4:
        competition_score = "HIGH"
    elif ad_ratio > 0.2:
        competition_score = "MEDIUM"
    else:
        competition_score = "LOW"

    return {
        "keyword": keyword,
        "organic_count": len(organic_results),
        "sponsored_count": len(sponsored),
        "sp_top_count": len(sp_top),
        "sp_top_asins": [s.get("asin") for s in sp_top],
        "ad_ratio": round(ad_ratio, 3),
        "competition_level": competition_score,
        "assessment": f"关键词 '{keyword}' 竞争强度:{competition_score} "
        f"(广告占比 {ad_ratio:.0%},首位广告商 {len(sp_top)}个)"
    }
四、最佳实践总结
| 场景 | 推荐方案 | 说明 |
|---|---|---|
| 每日全量竞品价格监控 | 异步批量 + PostgreSQL 时序存储 | 支持历史趋势查询 |
| BSR 趋势信号捕捉 | Movers & Shakers 每小时采集 | 最敏感的趋势指标 |
| 评论痛点分析 | Reviews Scraper API + NLP | 用于产品差异化决策 |
| 广告竞争强度评估 | 关键词搜索 + 广告位分析 | 98% 广告位覆盖保障准确性 |
| AI Agent 选品自动化 | Amazon Scraper Skill(MCP 接口) | 无需重新开发数据采集层 |