Download link: www.pan38.com/share.php?c… Extraction code: 7789
This tool can scrape Taobao product data, but it is for learning purposes only. If you want to study how it works, read through the code below; it is provided strictly for reference. The tool itself is implemented in Python, so you will need some Python background to follow the code. For everyone else, I recommend just using the finished tool.
import requests
from bs4 import BeautifulSoup
import re

def get_taobao_product(keyword):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    url = f'https://s.taobao.com/search?q={keyword}'
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []
        for item in soup.select('.item.J_MouserOnverReq'):
            # "N人付款" is the "N people paid" sales counter on the page,
            # so the regex has to stay in Chinese
            sales_match = re.search(r'(\d+)人付款', item.text)
            product = {
                'title': item.select_one('.title').get_text().strip(),
                'price': item.select_one('.price strong').get_text(),
                'sales': sales_match.group(1) if sales_match else '0',
                'shop': item.select_one('.shopname').get_text().strip() if item.select_one('.shopname') else 'Unknown shop'
            }
            products.append(product)
        return products
    except Exception as e:
        print(f"Scraping error: {e}")
        return []

if __name__ == '__main__':
    keyword = input("Enter the product keyword to search for: ")
    results = get_taobao_product(keyword)
    for i, product in enumerate(results, 1):
        print(f"{i}. {product['title']} - price: {product['price']} - sales: {product['sales']} - shop: {product['shop']}")
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import csv
from datetime import datetime
import logging
class TaobaoSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.taobao.com/'
        }
        self.session = requests.Session()
        self.logger = self.setup_logger()

    def setup_logger(self):
        # Log to a file so failed pages and parse errors can be reviewed later
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename='taobao_spider.log'
        )
        return logging.getLogger(__name__)

    def get_page(self, keyword, page=1):
        # Taobao search paginates 44 items per page via the `s` offset parameter
        url = f'https://s.taobao.com/search?q={keyword}&s={(page-1)*44}'
        try:
            # Random delay to reduce the chance of triggering anti-bot checks
            time.sleep(random.uniform(1, 3))
            response = self.session.get(url, headers=self.headers)
            response.raise_for_status()
            return response.text
        except Exception as e:
            self.logger.error(f"Failed to fetch page: {e}")
            return None

    def parse_products(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        products = []
        for item in soup.select('.item.J_MouserOnverReq'):
            try:
                # "N人付款" is the "N people paid" sales counter on the page
                sales_match = re.search(r'(\d+)人付款', item.text)
                product = {
                    'title': item.select_one('.title').get_text().strip(),
                    'price': item.select_one('.price strong').get_text(),
                    'sales': sales_match.group(1) if sales_match else '0',
                    'shop': item.select_one('.shopname').get_text().strip() if item.select_one('.shopname') else 'Unknown shop',
                    'location': item.select_one('.location').get_text() if item.select_one('.location') else 'Unknown location',
                    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }
                products.append(product)
            except Exception as e:
                self.logger.warning(f"Failed to parse item: {e}")
                continue
        return products

    def save_to_csv(self, products, filename):
        try:
            # utf-8-sig keeps Chinese text readable when the CSV is opened in Excel
            with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=products[0].keys())
                if f.tell() == 0:  # write the header only for a brand-new file
                    writer.writeheader()
                writer.writerows(products)
            self.logger.info(f"Saved {len(products)} rows to {filename}")
        except Exception as e:
            self.logger.error(f"Failed to save data: {e}")

    def run(self, keyword, pages=1, output_file='taobao_products.csv'):
        all_products = []
        for page in range(1, pages + 1):
            self.logger.info(f"Scraping page {page}...")
            html = self.get_page(keyword, page)
            if html:
                products = self.parse_products(html)
                all_products.extend(products)
            time.sleep(random.uniform(2, 5))
        if all_products:
            self.save_to_csv(all_products, output_file)
            return True
        return False
if __name__ == '__main__':
    spider = TaobaoSpider()
    keyword = input("Enter the product keyword to search for: ")
    pages = int(input("Enter the number of pages to scrape (44 items per page): "))
    if spider.run(keyword, pages):
        print("Scraping finished!")
    else:
        print("Scraping failed; check the log file.")