Taobao seller collection and extraction tool, Ali Wangwang seller collector, Taobao/Wangwang scraping script [Python]


Download: www.pan38.com/dow/share.p… Extraction code: 2383

The code below implements Taobao shop search, shop information collection, and Wangwang account extraction, with random delays, exception handling, and CSV storage. Use it in compliance with the platform's rules and keep the collection interval reasonable.
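Besides the standard library, the script relies on the third-party packages requests, beautifulsoup4 (imported as bs4), and fake-useragent, all installable with pip.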

import requests
import re
import time
import random
import csv
import json

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urlencode

class TaobaoSpider:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.taobao.com/',
            'Upgrade-Insecure-Requests': '1'
        }
        self.cookies = {}           # fill with a logged-in cookie if needed
        self.proxies = {}           # optional proxy configuration
        self.timeout = 10
        self.retry_times = 3
        self.delay_range = (1, 3)   # random delay (seconds) between requests
        self.output_file = 'taobao_shops.csv'

    def get_html(self, url):
        for _ in range(self.retry_times):
            try:
                # random pause between requests to reduce the risk of being blocked
                time.sleep(random.uniform(*self.delay_range))
                response = self.session.get(
                    url,
                    headers=self.headers,
                    cookies=self.cookies,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    return response.text
                else:
                    print(f"Request failed, status code: {response.status_code}")
            except Exception as e:
                print(f"Request error: {str(e)}")
        return None

    def parse_shop_info(self, html):
        shop_info = {}
        soup = BeautifulSoup(html, 'html.parser')

        # Basic shop information
        shop_name_tag = soup.find('div', class_='shop-name')
        if shop_name_tag:
            shop_info['name'] = shop_name_tag.get_text(strip=True)

        # Wangwang account
        wangwang_tag = soup.find('a', class_='ww-light')
        if wangwang_tag:
            shop_info['wangwang'] = wangwang_tag.get('data-nick')

        # Shop ratings (description / service / logistics)
        score_tags = soup.find_all('span', class_='score')
        if score_tags and len(score_tags) >= 3:
            shop_info['desc_score'] = score_tags[0].get_text(strip=True)
            shop_info['service_score'] = score_tags[1].get_text(strip=True)
            shop_info['logistics_score'] = score_tags[2].get_text(strip=True)

        # Shop location
        location_tag = soup.find('span', class_='shop-location')
        if location_tag:
            shop_info['location'] = location_tag.get_text(strip=True)

        return shop_info

    def search_shops(self, keyword, page=1):
        base_url = 'https://s.taobao.com/search'
        params = {
            'q': keyword,
            's': (page - 1) * 44,   # 44 results per search page
            'ie': 'utf8'
        }
        url = f"{base_url}?{urlencode(params)}"
        html = self.get_html(url)
        if html:
            # extract shop ids from links such as //shop123456.taobao.com
            shop_links = re.findall(r'//shop(\d+)\.taobao\.com', html)
            return list(set(shop_links))
        return []

    def save_to_csv(self, data):
        # append one row; write the header only when the file is still empty
        with open(self.output_file, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=data.keys())
            if f.tell() == 0:
                writer.writeheader()
            writer.writerow(data)

    def run(self, keyword, max_page=1):
        all_shops = []
        for page in range(1, max_page + 1):
            print(f"Collecting page {page}...")
            shop_ids = self.search_shops(keyword, page)
            for shop_id in shop_ids:
                shop_url = f"https://shop{shop_id}.taobao.com"
                html = self.get_html(shop_url)
                if html:
                    shop_info = self.parse_shop_info(html)
                    shop_info['url'] = shop_url
                    all_shops.append(shop_info)
                    self.save_to_csv(shop_info)
                    print(f"Collected shop: {shop_info.get('name', 'unknown')}")
        return all_shops

if __name__ == '__main__':
    spider = TaobaoSpider()
    keyword = input("Enter a search keyword: ")
    pages = int(input("Enter the number of pages to collect: "))
    result = spider.run(keyword, pages)
    print(f"Done. Collected {len(result)} shop records.")
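In practice, Taobao search and shop pages usually return a login or verification page to anonymous requests, so the cookies and proxies attributes the class exposes are worth filling in before calling run(). Below is a minimal usage sketch under that assumption; the cookie keys, proxy address, and search keyword are placeholders, not values the script ships with.

# Minimal usage sketch; cookie keys and proxy address are placeholders.
spider = TaobaoSpider()
spider.cookies = {'cookie2': '...', 'unb': '...'}   # copied from a logged-in browser session
spider.proxies = {
    'http': 'http://127.0.0.1:7890',                # optional local proxy
    'https': 'http://127.0.0.1:7890'
}
result = spider.run('your keyword', max_page=2)
print(f"{len(result)} shops collected")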