Download link: www.pan38.com/dow/share.p… Extraction code: 1133
This code implements a complete Meituan merchant data collection tool. Its main features include:
- Automatically fetching the city list
- Searching merchants by keyword
- Fetching merchant details (including phone number, rating, etc.)
- Saving the data to a CSV file
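The script relies on the third-party packages requests, beautifulsoup4, and fake-useragent (names inferred from its imports below); install them with pip before running. A short usage sketch follows the full listing.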
```python
import requests
import re
import json
import time
import random
import csv
import os
from urllib.parse import urlencode

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class MeituanSpider:
    def __init__(self):
        self.base_url = "https://www.meituan.com"
        self.search_url = "https://www.meituan.com/meishi/"
        self.headers = {
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        self.session = requests.Session()
        self.data_list = []
        self.proxies = None
        self.timeout = 10
        self.max_retry = 3
        self.delay = random.uniform(1, 3)
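    # Register an HTTP/HTTPS proxy that will be applied to every request made through this session.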
    def set_proxy(self, proxy):
        self.proxies = {
            'http': proxy,
            'https': proxy
        }
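    # Fetch a URL and return its HTML text, retrying up to max_retry times with a randomized delay; returns None when all attempts fail.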
    def get_html(self, url):
        retry = 0
        while retry < self.max_retry:
            try:
                response = self.session.get(
                    url,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    return response.text
                else:
                    print(f"请求失败,状态码: {response.status_code}")
                    retry += 1
                    time.sleep(self.delay)
            except Exception as e:
                print(f"请求异常: {str(e)}")
                retry += 1
                time.sleep(self.delay)
        return None
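    # Scrape the city-selection page and return a list of {'name': ..., 'url': ...} dicts, one per city link.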
    def parse_city_list(self):
        url = f"{self.base_url}/changecity/"
        html = self.get_html(url)
        if not html:
            return []
        soup = BeautifulSoup(html, 'html.parser')
        city_links = soup.select('div.city-area a')
        cities = []
        for link in city_links:
            city_name = link.get_text().strip()
            city_url = link['href']
            cities.append({
                'name': city_name,
                'url': city_url
            })
        return cities
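    # Request one page of search results for the given keyword on a city page and parse each shop's summary fields.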
    def search_shops(self, city_url, keyword="", page=1):
        params = {
            'keyword': keyword,
            'page': page
        }
        url = f"{city_url}?{urlencode(params)}"
        html = self.get_html(url)
        if not html:
            return []
        soup = BeautifulSoup(html, 'html.parser')
        shop_list = soup.select('div[data-poiid]')
        shops = []
        for shop in shop_list:
            try:
                shop_id = shop['data-poiid']
                name = shop.select_one('div.title').get_text().strip()
                address = shop.select_one('div.address').get_text().strip()
                avg_price = shop.select_one('div.avg-price').get_text().strip() if shop.select_one('div.avg-price') else '未知'
                score = shop.select_one('div.star > span').get_text().strip() if shop.select_one('div.star > span') else '0'
                review_count = shop.select_one('div.comment').get_text().strip() if shop.select_one('div.comment') else '0'
                shop_url = shop.select_one('a[href^="/meishi/"]')['href']
                shops.append({
                    'id': shop_id,
                    'name': name,
                    'address': address,
                    'avg_price': avg_price,
                    'score': score,
                    'review_count': review_count,
                    'url': f"{self.base_url}{shop_url}"
                })
            except Exception as e:
                print(f"解析店铺信息出错: {str(e)}")
                continue
        return shops
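    # Load an individual shop page and extract detail fields: name, address, score, average price, phone, business hours, and any extra labelled info items.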
    def get_shop_detail(self, shop_url):
        html = self.get_html(shop_url)
        if not html:
            return None
        soup = BeautifulSoup(html, 'html.parser')
        detail = {}
        try:
            # Basic information
            detail['name'] = soup.select_one('h1.shop-name').get_text().strip()
            detail['address'] = soup.select_one('div.address').get_text().strip()
            detail['score'] = soup.select_one('span.star-score').get_text().strip()
            detail['avg_price'] = soup.select_one('span.avg-price').get_text().strip()
            # Contact number (embedded in an inline script block)
            phone_script = soup.find('script', text=re.compile('phone'))
            if phone_script:
                phone_match = re.search(r'"phone":"([^"]+)"', phone_script.text)
                if phone_match:
                    detail['phone'] = phone_match.group(1)
            # Business hours
            hours = soup.select_one('div.business-hours')
            if hours:
                detail['business_hours'] = hours.get_text().strip()
            # Other labelled info items
            info_items = soup.select('div.shop-info-item')
            for item in info_items:
                key = item.select_one('span.label').get_text().strip()
                value = item.select_one('span.value').get_text().strip()
                detail[key] = value
            return detail
        except Exception as e:
            print(f"解析店铺详情出错: {str(e)}")
            return None
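    # Append the collected records to a CSV file, writing the header row only when the file does not exist yet.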
    def save_to_csv(self, filename, data):
        if not data:
            return
        file_exists = os.path.isfile(filename)
        # Use the union of all keys so rows with extra detail fields do not raise ValueError in DictWriter.
        fieldnames = list(dict.fromkeys(k for row in data for k in row.keys()))
        with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
            if not file_exists:
                writer.writeheader()
            writer.writerows(data)
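    # Entry point: resolve the target city, page through search results, enrich each shop with detail data, and save everything to a per-city CSV file.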
    def run(self, city_name=None, keyword="", max_pages=5):
        cities = self.parse_city_list()
        if not cities:
            print("无法获取城市列表")
            return
        target_cities = []
        if city_name:
            target_cities = [city for city in cities if city_name in city['name']]
        else:
            target_cities = cities[:1]  # By default, only crawl the first city
        for city in target_cities:
            print(f"开始爬取城市: {city['name']}")
            for page in range(1, max_pages + 1):
                print(f"正在爬取第 {page} 页...")
                shops = self.search_shops(city['url'], keyword, page)
                if not shops:
                    break
                for shop in shops:
                    print(f"获取店铺详情: {shop['name']}")
                    detail = self.get_shop_detail(shop['url'])
                    if detail:
                        shop.update(detail)
                    self.data_list.append(shop)
                    time.sleep(random.uniform(0.5, 2))
                time.sleep(random.uniform(1, 3))
            # Save the data collected for this city
            if self.data_list:
                filename = f"meituan_{city['name']}_{keyword}.csv"
                self.save_to_csv(filename, self.data_list)
                print(f"数据已保存到 {filename}")
                self.data_list = []
if __name__ == "__main__":
    spider = MeituanSpider()
    # Optionally set a proxy
    # spider.set_proxy("http://127.0.0.1:8888")
    # Run the spider
    spider.run(city_name="北京", keyword="火锅", max_pages=3)
```
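For reference, here is a minimal sketch of driving the class through a proxy; the proxy address, city, and keyword below are illustrative placeholders rather than values from the original listing:

```python
# Illustrative usage only: proxy address, city, and keyword are placeholder assumptions.
spider = MeituanSpider()
spider.set_proxy("http://127.0.0.1:7890")  # optional: route all requests through a local proxy
spider.run(city_name="上海", keyword="烧烤", max_pages=2)
```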