下载地址:www.pan38.com/dow/share.p… 提取密码:1928
该代码实现了淘宝店铺搜索、旺旺信息采集和 CSV 存储功能。
关键技术点包括:使用 fake_useragent 实现动态 UA、随机延迟防止封禁、多线程加速采集、BeautifulSoup 解析 HTML、完善的异常处理机制。如需采集商品 SKU 数据,可扩展以下功能模块:商品详情页解析、SKU 组合信息提取、价格销量监控、图片下载功能。
import requests import re import time import random from bs4 import BeautifulSoup from fake_useragent import UserAgent from urllib.parse import urlencode import os import csv from concurrent.futures import ThreadPoolExecutor
class TaobaoSpider: def init(self): self.ua = UserAgent() self.session = requests.Session() self.headers = { 'User-Agent': self.ua.random, 'Accept': 'text/html,application/xhtml+xml', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Referer': 'www.taobao.com/' } self.proxies = None self.max_retries = 3 self.timeout = 10 self.output_dir = 'taobao_data' os.makedirs(self.output_dir, exist_ok=True)
def get_random_delay(self):
    """Return a random pause length in seconds, uniform over [1, 3]."""
    low, high = 1, 3
    return random.uniform(low, high)
def make_request(self, url, params=None):
    """GET *url* through the shared session with retry.

    Sleeps a random delay before each attempt, retries up to
    ``self.max_retries`` times on any error (waiting 2s between
    retries), and returns the Response on success or None once every
    attempt has failed.
    """
    for attempt in range(1, self.max_retries + 1):
        time.sleep(self.get_random_delay())
        try:
            resp = self.session.get(
                url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
                proxies=self.proxies,
            )
            resp.raise_for_status()
        except Exception as exc:
            print(f"请求失败: {exc}, 重试 {attempt}/{self.max_retries}")
            time.sleep(2)
        else:
            return resp
    return None
def parse_shop_info(self, html):
    """Parse a shop homepage and return a dict of shop attributes.

    Returned keys: 'name' (str, placeholder when absent), 'wangwang'
    (str or None), plus one 'rating_<label>' entry per rating row.
    """
    soup = BeautifulSoup(html, 'html.parser')
    shop_info = {}
    # Shop display name; fall back to a placeholder when the tag is missing.
    shop_name_tag = soup.find('div', class_='shop-name')
    shop_info['name'] = shop_name_tag.text.strip() if shop_name_tag else '未知店铺'
    # WangWang (Taobao IM) contact id, stored in a data attribute.
    wangwang_tag = soup.find('a', class_='ww-light')
    shop_info['wangwang'] = wangwang_tag.get('data-nick') if wangwang_tag else None
    # Rating rows: each should contain a 'label' span and a 'value' span.
    for item in soup.find_all('div', class_='shop-rate-item'):
        label_tag = item.find('span', class_='label')
        value_tag = item.find('span', class_='value')
        # FIX: the original dereferenced .text without a None check and
        # raised AttributeError on any malformed rating row; skip instead.
        if label_tag and value_tag:
            shop_info[f'rating_{label_tag.text.strip()}'] = value_tag.text.strip()
    return shop_info
def search_shops(self, keyword, page=1):
    """Search Taobao's shop tab for *keyword* (page is 1-based).

    Returns a list of {'id', 'name', 'url'} dicts; returns [] when the
    request fails or the page contains no parsable shop items.
    """
    base_url = 'https://s.taobao.com/search'
    params = {
        'q': keyword,
        'tab': 'shop',
        'page': page,
        'app': 'shopsearch',
    }
    response = self.make_request(base_url, params)
    if not response:
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    shop_list = []
    for item in soup.find_all('div', class_='shop-item'):
        link_tag = item.find('a', class_='shop-link')
        name_tag = item.find('div', class_='shop-name')
        # FIX: the original assumed every item had a link, a shop_id
        # match and a name; one malformed item raised (AttributeError /
        # TypeError) and aborted the whole page. Skip bad items instead.
        if not (link_tag and link_tag.get('href') and name_tag):
            continue
        shop_link = link_tag['href']
        id_match = re.search(r'shop_id=(\d+)', shop_link)
        if not id_match:
            continue
        # FIX: only prepend the scheme for protocol-relative hrefs;
        # the original produced 'https:https://...' for absolute URLs.
        url = f'https:{shop_link}' if shop_link.startswith('//') else shop_link
        shop_list.append({
            'id': id_match.group(1),
            'name': name_tag.text.strip(),
            'url': url,
        })
    return shop_list
def get_shop_details(self, shop_id):
    """Fetch the shop homepage for *shop_id* and return its parsed
    info dict, or None when the page could not be retrieved."""
    response = self.make_request(f'https://shop{shop_id}.taobao.com')
    if response is None:
        return None
    return self.parse_shop_info(response.text)
def save_to_csv(self, data, filename):
    """Write *data* (a list of row dicts) to <output_dir>/<filename> as CSV.

    FIX: the original passed ``data.keys()`` where *data* is a list
    (crawl_shops_by_keyword passes ``all_shops``), which raised
    AttributeError on every call. Fieldnames are now the union of keys
    across all rows (the 'rating_*' keys vary per shop, so using only
    one row's keys would make DictWriter raise ValueError), and an
    empty list is a no-op instead of a crash.
    """
    if not data:
        return
    # Union of all row keys, preserving first-seen order.
    fieldnames = list(dict.fromkeys(k for row in data for k in row))
    filepath = os.path.join(self.output_dir, filename)
    # utf-8-sig so Excel detects the encoding and renders Chinese text.
    with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(data)
def crawl_shops_by_keyword(self, keyword, max_pages=5):
    """Crawl up to *max_pages* shop-search result pages for *keyword*.

    Each page's shops are fetched concurrently (5 worker threads); all
    collected shop-info dicts are written to one CSV and returned.
    """
    all_shops = []
    for page in range(1, max_pages + 1):
        print(f"正在采集第 {page} 页...")
        shops = self.search_shops(keyword, page)
        shop_ids = [shop['id'] for shop in shops]
        with ThreadPoolExecutor(max_workers=5) as executor:
            details = executor.map(self.get_shop_details, shop_ids)
            all_shops.extend(info for info in details if info)
        print(f"已采集 {len(all_shops)} 家店铺信息")
    self.save_to_csv(all_shops, f'shops_{keyword}.csv')
    return all_shops
if name == 'main': spider = TaobaoSpider() keyword = input("请输入要搜索的店铺关键词: ") spider.crawl_shops_by_keyword(keyword)