在电商数据分析领域,实时获取亚马逊商品详情页数据是企业制定营销策略、监控竞品动态的核心需求。本教程将深入讲解如何通过亚马逊 SP-API(Selling Partner API)实现商品详情页数据的自动化采集,包含完整的接入流程、代码实现及最佳实践。
一、SP-API 基础认知
1. API 架构概述
亚马逊 SP-API 是基于 RESTful 架构的新一代卖家接口,相比旧版 MWS 具有以下优势:
- OAuth2.0 认证:更安全的第三方授权机制
- 模块化接口:按业务领域划分(Products、Orders、Pricing 等)
- 全球端点支持:多区域部署优化响应速度
- 标准化响应:统一的 JSON 格式与错误码体系
2. 接入权限申请
- 账户注册:完成亚马逊卖家账户注册,并在 Seller Central 中注册为开发者
- API 权限申请:提交产品权限请求,通过安全审核
- LWA 应用配置:创建 Login with Amazon 应用获取 Client ID/Secret
二、开发环境搭建
1. 核心依赖安装
# 推荐使用Python 3.8+环境
pip install requests python-dotenv pycryptodome pandas sqlalchemy psycopg2-binary ratelimit
2. 项目结构设计
amazon-sp-api/
├── config/
│ ├── config.ini # 配置文件
│ └── logging.conf # 日志配置
├── src/
│ ├── auth.py # 认证模块
│ ├── product_api.py # 商品API模块
│ ├── utils.py # 工具函数
│ └── main.py # 主程序
├── .env # 环境变量
└── requirements.txt # 依赖清单
三、认证流程实现
1. OAuth2.0 授权流程
import os
import requests
import json
from dotenv import load_dotenv
class SPAPIAuth:
    """Exchange a Login with Amazon (LWA) refresh token for access tokens.

    Credentials are read from the ``CLIENT_ID``, ``CLIENT_SECRET`` and
    ``REFRESH_TOKEN`` environment variables (after loading a ``.env`` file).
    The most recent access token is cached on the instance and reused until
    shortly before it expires, so batch jobs do not hit the token endpoint
    once per request.
    """

    # Seconds shaved off the reported lifetime so a token is renewed slightly
    # before the server would reject it.
    _EXPIRY_MARGIN = 60
    # Lifetime (seconds) assumed when the response carries no ``expires_in``.
    _DEFAULT_LIFETIME = 3600
    # Upper bound (seconds) on waiting for the token endpoint; without it
    # ``requests`` may block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    # Class-level cache defaults: "no token yet".
    _access_token = None
    _token_expires_at = 0.0

    def __init__(self):
        load_dotenv()
        self.client_id = os.environ.get('CLIENT_ID')
        self.client_secret = os.environ.get('CLIENT_SECRET')
        self.refresh_token = os.environ.get('REFRESH_TOKEN')
        self.lwa_endpoint = 'https://api.amazon.com/auth/o2/token'

    def get_access_token(self):
        """Return a valid access token, requesting a new one only when needed.

        Returns:
            The access token string.

        Raises:
            Exception: If the token endpoint answers with a non-200 status.
        """
        import time  # local import: this snippet's import list has no ``time``

        now = time.monotonic()
        if self._access_token is not None and now < self._token_expires_at:
            return self._access_token

        payload = {
            'grant_type': 'refresh_token',
            'refresh_token': self.refresh_token,
            'client_id': self.client_id,
            'client_secret': self.client_secret,
        }
        response = requests.post(
            self.lwa_endpoint, data=payload, timeout=self._REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            raise Exception(f"获取访问令牌失败: {response.text}")

        body = response.json()
        lifetime = float(body.get('expires_in', self._DEFAULT_LIFETIME))
        self._access_token = body['access_token']
        self._token_expires_at = now + max(0.0, lifetime - self._EXPIRY_MARGIN)
        return self._access_token
2. AWS 签名生成
import hmac
import hashlib
import time
import datetime
import urllib.parse
class AWSSigner:
    """Compute AWS Signature Version 4 signatures for ``execute-api`` requests."""

    def __init__(self, access_key, secret_key, region):
        self.access_key = access_key
        self.secret_key = secret_key
        self.region = region
        self.algorithm = 'AWS4-HMAC-SHA256'
        self.service = 'execute-api'

    def generate_signature(self, method, path, host, query_params, headers, payload,
                           signed_header_names=('host', 'x-amz-date')):
        """Build an AWS4-HMAC-SHA256 signature for one request.

        Only the headers named in ``signed_header_names`` are signed. The
        caller must send exactly that list (sorted, ``;``-joined) as
        ``SignedHeaders`` and must send the returned ``amz_date`` as the
        ``x-amz-date`` header. ``headers`` is not modified.

        Args:
            method: HTTP verb, e.g. ``'GET'``.
            path: Request path; an empty path is signed as ``'/'``.
            host: Host name, used when ``headers`` has no non-empty host entry.
            query_params: Mapping of query parameters (may be empty or None).
            headers: Mapping of request headers; names are case-insensitive.
            payload: Request body as ``str`` or ``bytes``.
            signed_header_names: Header names to include in the signature.

        Returns:
            Tuple ``(amz_date, credential_scope, signature)``.

        Raises:
            ValueError: If a header listed in ``signed_header_names`` is absent.
        """
        # 1. Timestamp (timezone-aware; formatted exactly like the old utcnow()).
        now = datetime.datetime.now(datetime.timezone.utc)
        amz_date = now.strftime('%Y%m%dT%H%M%SZ')
        date_stamp = now.strftime('%Y%m%d')

        # 2. Canonical request.
        canonical_uri = urllib.parse.quote(path or '/', safe='/~')
        # SigV4 wants RFC 3986 encoding (space -> %20, not '+') and the
        # parameters sorted by their encoded name.
        encoded_pairs = sorted(
            (self._uri_encode(key), self._uri_encode(value))
            for key, value in (query_params or {}).items()
        )
        canonical_querystring = '&'.join(f"{k}={v}" for k, v in encoded_pairs)

        # Lower-case names, collapse runs of whitespace in values.
        normalized = {
            str(name).lower(): ' '.join(str(value).split())
            for name, value in headers.items()
        }
        # The timestamp is generated here, so any placeholder the caller put
        # in x-amz-date is replaced by the value that will actually be sent.
        normalized['x-amz-date'] = amz_date
        if not normalized.get('host'):
            normalized['host'] = host

        names = sorted({str(name).lower() for name in signed_header_names})
        missing = [name for name in names if name not in normalized]
        if missing:
            raise ValueError(f"headers to sign are missing: {', '.join(missing)}")
        canonical_headers = ''.join(f"{name}:{normalized[name]}\n" for name in names)
        signed_headers = ';'.join(names)

        body = payload if isinstance(payload, bytes) else payload.encode('utf-8')
        payload_hash = hashlib.sha256(body).hexdigest()
        # canonical_headers already ends with '\n'; the extra '\n' below is the
        # blank line the format requires before the signed-header list.
        canonical_request = (
            f"{method.upper()}\n{canonical_uri}\n{canonical_querystring}\n"
            f"{canonical_headers}\n{signed_headers}\n{payload_hash}"
        )

        # 3. String to sign.
        credential_scope = f"{date_stamp}/{self.region}/{self.service}/aws4_request"
        string_to_sign = (
            f"{self.algorithm}\n{amz_date}\n{credential_scope}\n"
            f"{hashlib.sha256(canonical_request.encode('utf-8')).hexdigest()}"
        )

        # 4. Derive the signing key: date -> region -> service -> terminator.
        k_date = self._sign(('AWS4' + self.secret_key).encode('utf-8'), date_stamp)
        k_region = self._sign(k_date, self.region)
        k_service = self._sign(k_region, self.service)
        k_signing = self._sign(k_service, 'aws4_request')

        # 5. Final signature.
        signature = hmac.new(k_signing, string_to_sign.encode('utf-8'),
                             hashlib.sha256).hexdigest()
        return amz_date, credential_scope, signature

    @staticmethod
    def _uri_encode(value):
        """Percent-encode everything except the RFC 3986 unreserved characters."""
        return urllib.parse.quote(str(value), safe='-_.~')

    def _sign(self, key, msg):
        """Return the raw HMAC-SHA256 digest of ``msg`` under ``key``."""
        return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()
四、商品详情数据采集实现
1. 商品 API 客户端
class ProductAPIClient:
    """Small client that issues signed GET requests for product data."""

    def __init__(self, auth, aws_signer, endpoint, timeout=30):
        """Store collaborators and derive the host name from ``endpoint``.

        Args:
            auth: Object exposing ``get_access_token()``.
            aws_signer: Object exposing ``generate_signature(...)`` and
                ``access_key``.
            endpoint: Base URL including scheme, e.g. ``https://host``.
            timeout: Seconds to wait for the HTTP response.

        Raises:
            ValueError: If ``endpoint`` carries no host part.
        """
        parts = urllib.parse.urlsplit(endpoint)
        if not parts.netloc:
            raise ValueError(f"endpoint must include a scheme and host: {endpoint!r}")
        self.auth = auth
        self.aws_signer = aws_signer
        # Drop a trailing slash so joining with the path never yields '//'.
        self.endpoint = endpoint.rstrip('/')
        # netloc is the bare host even when the endpoint has a path or slash.
        self.host = parts.netloc
        self.timeout = timeout

    def get_product_details(self, asin, marketplace_id='ATVPDKIKX0DER'):
        """Fetch product details for ``asin`` in the given marketplace.

        Returns:
            The decoded JSON body of a 200 response.

        Raises:
            Exception: If the service answers with a non-200 status.
        """
        access_token = self.auth.get_access_token()
        method = 'GET'
        # NOTE(review): verify this path against the current SP-API reference.
        path = '/products/2020-09-01/products'
        query_params = {
            'MarketplaceIds': marketplace_id,
            'Asins': asin,
        }
        headers = {
            'host': self.host,
            'x-amz-date': '',      # filled in after signing
            'Authorization': '',   # filled in after signing
            'x-amz-access-token': access_token,
            'Content-Type': 'application/json',
        }

        # Sign the request.
        amz_date, credential_scope, signature = self.aws_signer.generate_signature(
            method, path, self.host, query_params, headers, ''
        )

        # Attach the signing results to the outgoing headers.
        headers['x-amz-date'] = amz_date
        headers['Authorization'] = (
            f"AWS4-HMAC-SHA256 Credential={self.aws_signer.access_key}/{credential_scope}, "
            f"SignedHeaders=host;x-amz-date, Signature={signature}"
        )

        # urlencode applies quote_plus to every name and value.
        query_string = urllib.parse.urlencode(query_params)
        url = f"{self.endpoint}{path}?{query_string}"

        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=self.timeout)
        if response.status_code == 200:
            return response.json()
        raise Exception(f"请求失败: {response.status_code} - {response.text}")
2. 批量采集实现
def batch_fetch_products(asin_list, max_retries=3, delay=1, client=None):
    """Fetch product data for several ASINs sequentially, with retries.

    Args:
        asin_list: Iterable of ASIN strings.
        max_retries: Maximum number of attempts per ASIN.
        delay: Base pause in seconds, used between ASINs and as the unit of
            the exponential back-off between attempts.
        client: Optional object exposing ``get_product_details(asin)``. When
            omitted, a ``ProductAPIClient`` for the North America endpoint is
            built from the ``AWS_ACCESS_KEY`` / ``AWS_SECRET_KEY`` environment
            variables.

    Returns:
        Dict mapping every ASIN to its response, or to ``None`` when no
        attempt succeeded (including ``max_retries <= 0``).
    """
    if client is None:
        client = ProductAPIClient(
            auth=SPAPIAuth(),
            aws_signer=AWSSigner(
                access_key=os.environ.get('AWS_ACCESS_KEY'),
                secret_key=os.environ.get('AWS_SECRET_KEY'),
                region='us-east-1',
            ),
            endpoint='https://sellingpartnerapi-na.amazon.com',
        )

    results = {}
    for index, asin in enumerate(asin_list):
        # Pace requests to stay under the rate limit; pausing before every
        # ASIN but the first avoids a useless sleep after the last one.
        if index:
            time.sleep(delay)

        # Default outcome, so the ASIN is reported even if no attempt runs.
        results[asin] = None
        for attempt in range(1, max_retries + 1):
            try:
                data = client.get_product_details(asin)
            except Exception as e:
                print(f"获取ASIN: {asin} 失败: {str(e)}")
                if attempt < max_retries:
                    print(f"重试 ({attempt}/{max_retries})")
                    time.sleep(delay * (2 ** attempt))  # exponential back-off
            else:
                results[asin] = data
                print(f"成功获取ASIN: {asin} 的数据")
                break
    return results
五、数据解析与持久化
1. 数据解析示例
def _dict_section(container, key):
    """Return ``container[key]`` when it is a dict, otherwise an empty dict."""
    value = container.get(key)
    return value if isinstance(value, dict) else {}


def parse_product_data(raw_data):
    """Extract the key fields of the first product in an API response.

    Args:
        raw_data: Decoded response; expected to hold a ``payload`` list.

    Returns:
        A dict with ``asin``, ``title``, ``brand``, ``model``, ``color``,
        ``size`` and the nested ``price``, ``availability`` and ``reviews``
        dicts, with ``None`` for anything absent. Returns ``None`` when the
        response is empty, has no ``payload`` or the payload is empty.
    """
    if not raw_data or 'payload' not in raw_data:
        return None
    payload = raw_data['payload']
    if not payload:
        # An empty payload has no first product to read.
        return None
    product = payload[0]

    # Sections that are missing, None or not a dict are treated as empty.
    attributes = _dict_section(product, 'attributes')
    summary = _dict_section(product, 'summary')
    lowest_new = _dict_section(summary, 'lowestPrice')
    lowest_used = _dict_section(summary, 'lowestUsedPrice')
    availability = _dict_section(product, 'fulfillmentAvailability')
    reviews = _dict_section(product, 'customerReviews')

    return {
        # Basic information
        'asin': product.get('asin'),
        'title': attributes.get('title'),
        'brand': attributes.get('brand'),
        'model': attributes.get('model'),
        'color': attributes.get('color'),
        'size': attributes.get('size'),
        # Price information
        'price': {
            'lowest_new': lowest_new.get('value'),
            'lowest_used': lowest_used.get('value'),
            'currency': lowest_new.get('currency'),
            'condition': summary.get('condition'),
        },
        # Stock information
        'availability': {
            'quantity': availability.get('quantity'),
            'fulfillment_channel': availability.get('fulfillmentChannel'),
            'is_supply_limited': availability.get('isSupplyLimited'),
        },
        # Rating information
        'reviews': {
            'average_rating': reviews.get('averageRating'),
            'total_reviews': reviews.get('totalReviews'),
            'rating_histogram': reviews.get('ratingHistogram'),
        },
    }
2. 数据存储方案
import pandas as pd
from sqlalchemy import create_engine
def save_to_database(products_data, db_config):
    """Append parsed product rows to the ``amazon_products`` PostgreSQL table.

    Args:
        products_data: Dict mapping ASIN to a raw API response (or ``None``).
        db_config: Dict with ``user``, ``password``, ``host``, ``port`` and
            ``database`` entries.

    Responses that are empty or cannot be parsed are skipped. When no row
    remains, nothing is written to the database.
    """
    # Local imports: the column types were referenced but never imported.
    from urllib.parse import quote_plus

    from sqlalchemy import JSON, String, Text

    # Parse each response and drop those that yield no row.
    parsed = (parse_product_data(p) for p in products_data.values() if p)
    df = pd.DataFrame([row for row in parsed if row is not None])

    if not df.empty:
        # Credentials are URL-escaped so characters such as '@' or '/' in a
        # password cannot corrupt the connection URL.
        engine = create_engine(
            f"postgresql://{quote_plus(str(db_config['user']))}:"
            f"{quote_plus(str(db_config['password']))}@"
            f"{db_config['host']}:{db_config['port']}/{db_config['database']}"
        )
        try:
            df.to_sql(
                name='amazon_products',
                con=engine,
                if_exists='append',
                index=False,
                dtype={
                    'asin': String(10),
                    'title': Text,
                    'brand': String(100),
                    'model': String(100),
                    'color': String(50),
                    'size': String(50),
                    'price': JSON,
                    'availability': JSON,
                    'reviews': JSON,
                },
            )
        finally:
            # Release pooled connections; the engine is created per call.
            engine.dispose()

    print(f"成功存储 {len(df)} 条商品数据")
六、性能优化与错误处理
1. 并发请求实现
import asyncio
from concurrent.futures import ThreadPoolExecutor
async def fetch_product_async(client, asin, executor=None):
    """Fetch one product by running the blocking client call in a thread.

    Args:
        client: Object exposing a blocking ``get_product_details(asin)``.
        asin: ASIN to fetch.
        executor: Optional executor to run the call in; ``None`` selects the
            event loop's default executor.

    Returns:
        Tuple ``(asin, data)``; ``data`` is ``None`` when the call raised.
    """
    loop = asyncio.get_running_loop()
    try:
        data = await loop.run_in_executor(executor, client.get_product_details, asin)
    except Exception as e:
        print(f"异步获取ASIN: {asin} 失败: {str(e)}")
        return asin, None
    return asin, data


async def batch_fetch_async(asin_list, max_workers=10, client=None):
    """Fetch several products concurrently on a bounded thread pool.

    Args:
        asin_list: Iterable of ASIN strings.
        max_workers: Size of the thread pool, i.e. the maximum number of
            requests in flight at once.
        client: Optional object exposing ``get_product_details(asin)``. When
            omitted, a ``ProductAPIClient`` for the North America endpoint is
            built from the ``AWS_ACCESS_KEY`` / ``AWS_SECRET_KEY`` environment
            variables.

    Returns:
        Dict mapping each ASIN to its response, or to ``None`` on failure.
    """
    if client is None:
        client = ProductAPIClient(
            auth=SPAPIAuth(),
            aws_signer=AWSSigner(
                access_key=os.environ.get('AWS_ACCESS_KEY'),
                secret_key=os.environ.get('AWS_SECRET_KEY'),
                region='us-east-1',
            ),
            endpoint='https://sellingpartnerapi-na.amazon.com',
        )

    # The pool is handed to every task so that max_workers actually limits
    # concurrency instead of the loop's default executor being used.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tasks = [fetch_product_async(client, asin, executor) for asin in asin_list]
        results = await asyncio.gather(*tasks)
    return dict(results)
2. 限流与重试机制
from ratelimit import limits, sleep_and_retry
class RateLimitedProductAPIClient(ProductAPIClient):
    """Product API client whose detail lookups are throttled."""

    # Throttle budget: CALLS invocations are allowed per PERIOD seconds.
    CALLS = 10   # at most 10 calls per window
    PERIOD = 60  # window length in seconds

    @sleep_and_retry
    @limits(calls=CALLS, period=PERIOD)
    def get_product_details(self, asin, marketplace_id='ATVPDKIKX0DER'):
        """Delegate to the parent lookup, subject to the rate-limit decorators."""
        details = super().get_product_details(asin, marketplace_id)
        return details
七、部署与监控
1. 环境配置示例
# config.ini
[api]
region = us-east-1
endpoint = https://sellingpartnerapi-na.amazon.com
[database]
host = localhost
port = 5432
database = amazon_data
user = postgres
password = your_password
[logging]
config_file = logging.conf
2. 监控指标建议
- API 请求成功率
- 平均响应时间
- 数据更新频率
- 限流触发次数
- 数据库写入性能
八、合规与安全注意事项
- 数据使用限制
  - 仅用于内部业务分析,禁止公开数据
  - 遵守亚马逊 API 服务条款
  - 不得进行数据爬取或过度请求
- 安全最佳实践
  - 使用环境变量存储敏感信息
  - 定期轮换 API 密钥
  - 实施最小权限原则
  - 启用数据传输加密
- 防封禁策略
  - 严格遵守 API 调用频率限制
  - 实现智能限流和请求调度
  - 监控异常请求模式并及时调整
通过本教程,你可以完整实现亚马逊 SP-API 的接入与商品详情页数据采集系统。在实际应用中,建议根据业务需求扩展功能,如添加定时任务、数据可视化或机器学习分析模块。
(注:实际开发中请替换示例中的环境变量和配置参数,并遵守亚马逊 API 使用条款。)