Download link: www.pan38.com/dow/share.p… Extraction code: 2383
This code implements Taobao shop search, shop-information collection, and Wangwang (AliWangwang) contact extraction, with random delays, exception handling, and CSV storage. When using it, make sure to comply with the platform's rules, and set a reasonable collection interval.
import requests
import re
import time
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urlencode
import csv
import json
class TaobaoSpider:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.taobao.com/',
            'Upgrade-Insecure-Requests': '1'
        }
        self.cookies = {}
        self.proxies = {}
        self.timeout = 10
        self.retry_times = 3
        self.delay = random.uniform(1, 3)  # base delay between requests, in seconds
        self.output_file = 'taobao_shops.csv'
    def get_html(self, url):
        """Fetch a page, retrying up to self.retry_times times with a delay before each request."""
        for _ in range(self.retry_times):
            try:
                time.sleep(self.delay)
                response = self.session.get(
                    url,
                    headers=self.headers,
                    cookies=self.cookies,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    return response.text
                else:
                    print(f"Request failed, status code: {response.status_code}")
            except Exception as e:
                print(f"Request error: {str(e)}")
        return None
    def parse_shop_info(self, html):
        shop_info = {}
        soup = BeautifulSoup(html, 'html.parser')
        # Shop name
        shop_name_tag = soup.find('div', class_='shop-name')
        if shop_name_tag:
            shop_info['name'] = shop_name_tag.get_text(strip=True)
        # Wangwang contact nick
        wangwang_tag = soup.find('a', class_='ww-light')
        if wangwang_tag:
            shop_info['wangwang'] = wangwang_tag.get('data-nick')
        # Shop ratings: description / service / logistics
        score_tags = soup.find_all('span', class_='score')
        if score_tags and len(score_tags) >= 3:
            shop_info['desc_score'] = score_tags[0].get_text(strip=True)
            shop_info['service_score'] = score_tags[1].get_text(strip=True)
            shop_info['logistics_score'] = score_tags[2].get_text(strip=True)
        # Shop location
        location_tag = soup.find('span', class_='shop-location')
        if location_tag:
            shop_info['location'] = location_tag.get_text(strip=True)
        return shop_info
    def search_shops(self, keyword, page=1):
        base_url = 'https://s.taobao.com/search'
        params = {
            'q': keyword,
            's': (page - 1) * 44,  # Taobao search paginates in steps of 44 items
            'ie': 'utf8'
        }
        url = f"{base_url}?{urlencode(params)}"
        html = self.get_html(url)
        if html:
            # Extract shop IDs from links such as //shop123456.taobao.com
            shop_links = re.findall(r'//shop(\d+)\.taobao\.com', html)
            return list(set(shop_links))
        return []
    def save_to_csv(self, data):
        # A fixed column order keeps rows aligned even when some fields are missing
        fieldnames = ['name', 'wangwang', 'desc_score', 'service_score',
                      'logistics_score', 'location', 'url']
        with open(self.output_file, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval='', extrasaction='ignore')
            if f.tell() == 0:  # empty file, so write the header row first
                writer.writeheader()
            writer.writerow(data)
    def run(self, keyword, max_page=1):
        all_shops = []
        for page in range(1, max_page + 1):
            print(f"Collecting page {page}...")
            shop_ids = self.search_shops(keyword, page)
            for shop_id in shop_ids:
                shop_url = f"https://shop{shop_id}.taobao.com"
                html = self.get_html(shop_url)
                if html:
                    shop_info = self.parse_shop_info(html)
                    shop_info['url'] = shop_url
                    all_shops.append(shop_info)
                    self.save_to_csv(shop_info)
                    print(f"Collected shop: {shop_info.get('name', 'unknown')}")
        return all_shops
if __name__ == '__main__':
    spider = TaobaoSpider()
    keyword = input("Enter a search keyword: ")
    pages = int(input("Enter the number of pages to collect: "))
    result = spider.run(keyword, pages)
    print(f"Done. Collected {len(result)} shop records.")
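For unattended runs you can skip the interactive prompts and call the spider directly. A minimal sketch, using only the attributes defined above; the keyword and page count are example values, and widening `delay` is one way to keep the collection interval reasonable, as recommended earlier:

# Minimal programmatic usage (keyword and page count are example values)
spider = TaobaoSpider()
spider.delay = random.uniform(3, 6)  # slow down requests for a gentler collection interval
shops = spider.run('女装', max_page=2)  # example search keyword
print(f"{len(shops)} shops saved to {spider.output_file}")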