淘宝阿里旺旺采集工具,卖家数据采集提取脚本,采集淘宝商家卖家软件【python】

阅读量 61 · 阅读时长约 2 分钟

下载地址:www.pan38.com/dow/share.p… 提取密码:1928

该代码实现了淘宝店铺搜索、旺旺信息采集和 CSV 存储功能。关键技术点包括:

使用 fake_useragent 实现动态 UA;随机延迟防止封禁;多线程加速采集;BeautifulSoup 解析 HTML;完善的异常处理机制。如需采集商品 SKU 数据,可扩展以下功能模块:商品详情页解析、SKU 组合信息提取、价格销量监控、图片下载功能。

import requests import re import time import random from bs4 import BeautifulSoup from fake_useragent import UserAgent from urllib.parse import urlencode import os import csv from concurrent.futures import ThreadPoolExecutor

class TaobaoSpider: def init(self): self.ua = UserAgent() self.session = requests.Session() self.headers = { 'User-Agent': self.ua.random, 'Accept': 'text/html,application/xhtml+xml', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Referer': 'www.taobao.com/' } self.proxies = None self.max_retries = 3 self.timeout = 10 self.output_dir = 'taobao_data' os.makedirs(self.output_dir, exist_ok=True)

def get_random_delay(self):
    """Return a random pause in seconds (1–3) used to throttle requests."""
    low, high = 1, 3
    return random.uniform(low, high)

def make_request(self, url, params=None):
    """GET *url* via the shared session with throttling and retries.

    Sleeps a random delay before each attempt, retries up to
    self.max_retries times on any failure (printing progress and
    pausing 2s between attempts), and returns the Response on
    success or None once all attempts are exhausted.
    """
    for attempt in range(1, self.max_retries + 1):
        try:
            time.sleep(self.get_random_delay())
            resp = self.session.get(
                url,
                headers=self.headers,
                params=params,
                timeout=self.timeout,
                proxies=self.proxies,
            )
            resp.raise_for_status()
        except Exception as exc:
            # Broad catch is deliberate: any network/HTTP error triggers a retry.
            print(f"请求失败: {exc}, 重试 {attempt}/{self.max_retries}")
            time.sleep(2)
        else:
            return resp
    return None

def parse_shop_info(self, html):
    """Parse a shop homepage and return a dict of shop details.

    Keys: 'name' (shop name or '未知店铺'), 'wangwang' (seller nick or
    None), plus one 'rating_<label>' entry per rating item found.

    Fix: the original called .text on the result of item.find(...) for
    the rating label/value spans without a None check, raising
    AttributeError on any rating item missing either span; such items
    are now skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    shop_info = {}

    # Shop display name.
    shop_name_tag = soup.find('div', class_='shop-name')
    shop_info['name'] = shop_name_tag.text.strip() if shop_name_tag else '未知店铺'

    # Wangwang (seller IM) nick, read from the data-nick attribute.
    wangwang_tag = soup.find('a', class_='ww-light')
    shop_info['wangwang'] = wangwang_tag.get('data-nick') if wangwang_tag else None

    # Shop rating entries; ignore malformed items.
    for item in soup.find_all('div', class_='shop-rate-item'):
        label_tag = item.find('span', class_='label')
        value_tag = item.find('span', class_='value')
        if label_tag is None or value_tag is None:
            continue
        shop_info[f'rating_{label_tag.text.strip()}'] = value_tag.text.strip()

    return shop_info

def search_shops(self, keyword, page=1):
    """Search Taobao shop results for *keyword* on the given page.

    Returns a list of dicts with 'id', 'name', and 'url' for each shop
    found; returns [] when the request fails.

    Fix: the original indexed item.find('a', ...)['href'] and called
    re.search(...).group(1) without None checks, so a single malformed
    result item crashed the whole page; such items are now skipped.
    """
    base_url = 'https://s.taobao.com/search'
    params = {
        'q': keyword,
        'tab': 'shop',
        'page': page,
        'app': 'shopsearch'
    }
    response = self.make_request(base_url, params)
    if not response:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    shop_list = []
    for item in soup.find_all('div', class_='shop-item'):
        link_tag = item.find('a', class_='shop-link')
        name_tag = item.find('div', class_='shop-name')
        if link_tag is None or name_tag is None:
            continue  # result card missing expected markup
        shop_link = link_tag.get('href')
        if not shop_link:
            continue
        id_match = re.search(r'shop_id=(\d+)', shop_link)
        if not id_match:
            continue  # link without a numeric shop_id
        shop_list.append({
            'id': id_match.group(1),
            'name': name_tag.text.strip(),
            'url': f'https:{shop_link}'
        })

    return shop_list

def get_shop_details(self, shop_id):
    """Fetch the homepage of shop *shop_id* and return its parsed info.

    Returns the dict from parse_shop_info, or None if the request failed.
    """
    shop_url = f'https://shop{shop_id}.taobao.com'
    response = self.make_request(shop_url)
    return self.parse_shop_info(response.text) if response else None

def save_to_csv(self, data, filename):
    """Write *data* (a list of dicts) to a CSV file under self.output_dir.

    Uses utf-8-sig so Excel opens the Chinese text correctly. Columns
    are the union of keys across all rows in first-seen order; rows
    missing a column get an empty cell.

    Fix: the original passed fieldnames=data.keys(), but *data* is a
    list (see crawl_shops_by_keyword), so this always raised
    AttributeError; it also crashed on heterogeneous rows (rating
    labels vary per shop) and on empty input.
    """
    filepath = os.path.join(self.output_dir, filename)
    if not data:
        # Nothing to write: still create an empty file so callers can
        # see the run produced output.
        with open(filepath, 'w', newline='', encoding='utf-8-sig'):
            pass
        return

    # Union of keys across all rows, preserving first-seen order.
    fieldnames = []
    for row in data:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)

    with open(filepath, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def crawl_shops_by_keyword(self, keyword, max_pages=5):
    """Crawl up to *max_pages* of shop-search results for *keyword*.

    For every page, fetches each shop's details concurrently (5 worker
    threads), collects the successful results, writes them all to
    'shops_<keyword>.csv' in the output directory, and returns the
    collected list of shop-info dicts.
    """
    collected = []
    for page_no in range(1, max_pages + 1):
        print(f"正在采集第 {page_no} 页...")
        page_shops = self.search_shops(keyword, page_no)
        shop_ids = [shop['id'] for shop in page_shops]
        with ThreadPoolExecutor(max_workers=5) as pool:
            # Drop None results (failed detail fetches).
            collected.extend(
                details for details in pool.map(self.get_shop_details, shop_ids)
                if details
            )
        print(f"已采集 {len(collected)} 家店铺信息")

    self.save_to_csv(collected, f'shops_{keyword}.csv')
    return collected

if name == 'main': spider = TaobaoSpider() keyword = input("请输入要搜索的店铺关键词: ") spider.crawl_shops_by_keyword(keyword)