Download: www.pan38.com/dow/share.p… Access code: 1133
Key features of the tool:
- Dual Selenium + BeautifulSoup parsing to handle dynamically rendered pages
- Multithreaded collection for higher throughput
- Automatic UserAgent and proxy IP rotation to get past anti-scraping checks
- Export to Excel for easy downstream processing
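Everything the script imports is on PyPI; assuming a standard pip setup, one install line covers it (lxml because BeautifulSoup is created with the lxml parser, openpyxl because pandas needs it to write .xlsx):

pip install requests beautifulsoup4 selenium webdriver-manager pandas fake-useragent lxml openpyxl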
import requests
import threading
import time
import random

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
class MeituanScraper:
    def __init__(self, keywords=None, cities=None):
        self.keywords = keywords or ["餐饮", "外卖"]
        self.cities = cities or ["北京", "上海"]
        self.proxies = self._get_proxies()
        self.ua = UserAgent()
        self.lock = threading.Lock()
        self.data = []
def _get_proxies(self):
        # Proxy IP pool (stub: supply real proxies yourself; see ProxyManager below)
return ["http://proxy1.example.com:8080",
"http://proxy2.example.com:8080"]
def _get_driver(self):
options = webdriver.ChromeOptions()
options.add_argument(f'user-agent={self.ua.random}')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
service = Service(ChromeDriverManager().install())
return webdriver.Chrome(service=service, options=options)
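One gap worth noting: _get_driver never reads self.proxies, so the proxy rotation promised in the feature list only applies if you wire it in yourself. Below is a minimal sketch of a drop-in replacement that routes Chrome through a random pool entry and strips two common automation fingerprints (--proxy-server and the flags shown are standard Chromium/Selenium switches, nothing Meituan-specific):

    def _get_driver(self):
        options = webdriver.ChromeOptions()
        options.add_argument(f'user-agent={self.ua.random}')
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        # Route this browser instance through a random proxy from the pool
        if self.proxies:
            options.add_argument(f'--proxy-server={random.choice(self.proxies)}')
        # Hide the most obvious "controlled by automation" signals
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)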
def _parse_page(self, html):
soup = BeautifulSoup(html, 'lxml')
shops = soup.find_all('div', class_='shop-list-item')
for shop in shops:
            try:
                name = shop.find('h4').text.strip()
                phone = shop.find('div', class_='phone-number').text.strip()
                address = shop.find('span', class_='address').text.strip()
            except Exception as e:
                # A malformed card skips only this shop
                print(f"Parse failed: {e}")
                continue
            # Hold the lock only while mutating the shared list, so a
            # parse error can never release a lock that was never acquired
            with self.lock:
                self.data.append({
                    "店名": name,
                    "电话": phone,
                    "地址": address
                })
def _scrape_city(self, city, keyword):
driver = self._get_driver()
try:
url = f"https://www.meituan.com/meishi/{city}/"
driver.get(url)
time.sleep(random.uniform(2,5))
            # Simulate typing the keyword into the search box
            search_box = driver.find_element(By.NAME, 'keyword')
search_box.send_keys(keyword)
search_box.submit()
time.sleep(random.uniform(3,6))
            # Scroll a few times to trigger lazy loading of more results
for _ in range(3):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(random.uniform(1,3))
self._parse_page(driver.page_source)
finally:
driver.quit()
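The fixed time.sleep calls both slow every run down and can still lose the race on a slow page. Here is a sketch of the explicit-wait alternative, keyed to the same shop-list-item class the parser queries (the 15-second timeout is an arbitrary choice):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def _wait_for_results(self, driver, timeout=15):
        # Blocks until at least one shop card is in the DOM,
        # or raises TimeoutException after `timeout` seconds
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'shop-list-item'))
        )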
def run(self):
threads = []
for city in self.cities:
for keyword in self.keywords:
t = threading.Thread(
target=self._scrape_city,
args=(city, keyword)
)
threads.append(t)
t.start()
for t in threads:
t.join()
df = pd.DataFrame(self.data)
df.to_excel('美团商家数据.xlsx', index=False)
print(f"采集完成,共获取{len(self.data)}条数据")
if name == "main": scraper = MeituanScraper( keywords=["火锅", "烧烤"], cities=["广州", "深圳"] ) scraper.run() import requests from datetime import datetime, timedelta
class ProxyManager: def init(self): self.proxies = [] self.last_update = None
def update_proxies(self):
if self.last_update and (datetime.now() - self.last_update) < timedelta(hours=1):
return
        try:
            resp = requests.get("https://api.proxy-service.com/v1/proxies",
                                timeout=10)
            resp.raise_for_status()
            self.proxies = [f"http://{p['ip']}:{p['port']}" for p in resp.json()]
            self.last_update = datetime.now()
        except Exception as e:
            print(f"Proxy refresh failed: {e}")
def get_random_proxy(self):
self.update_proxies()
return random.choice(self.proxies) if self.proxies else None
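A usage sketch tying the manager into a plain requests call (the proxy API above is a placeholder endpoint, so nothing comes back until a real provider is substituted; UserAgent is the fake_useragent class imported in the scraper section):

from fake_useragent import UserAgent

manager = ProxyManager()
proxy = manager.get_random_proxy()
resp = requests.get(
    "https://www.meituan.com/meishi/",
    headers={"User-Agent": UserAgent().random},
    # requests accepts proxies=None, so a missing pool degrades gracefully
    proxies={"http": proxy, "https": proxy} if proxy else None,
    timeout=10,
)
print(resp.status_code, len(resp.text))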