Meituan Merchant Extraction Tool: Scrape Merchant Phone Numbers, Mobile Numbers, and Ratings (Free Open-Source Python Code)


Download link: www.pan38.com/dow/share.p… Extraction password: 1133

This code implements a complete Meituan merchant data collection tool. Its main features are:

  1. Automatically fetch the list of cities
  2. Search for merchants by keyword
  3. Fetch merchant details (including phone number, rating, etc.); a small sketch of the phone-number extraction follows this list
  4. Save the collected data to a CSV file
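Item 3 works by locating the inline `<script>` block on a shop's detail page that contains a `"phone"` field and pulling the number out with a regular expression (see `get_shop_detail` below). A minimal, self-contained sketch of just that step, using a made-up script snippet purely for illustration:

```python
import re

# Hypothetical inline-script content for illustration only; the real page markup may differ.
sample_script = '{"poiInfo":{"name":"示例火锅店","phone":"010-12345678","avgScore":4.6}}'

match = re.search(r'"phone":"([^"]+)"', sample_script)
if match:
    print("phone:", match.group(1))  # -> 010-12345678
else:
    print("no phone field found in the script block")
```

The full implementation follows.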

```python
import requests
import re
import json
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import csv
import os
from fake_useragent import UserAgent
```

```python
class MeituanSpider:
    def __init__(self):
        self.base_url = "https://www.meituan.com"
        self.search_url = "https://www.meituan.com/meishi/"
        self.headers = {
            'User-Agent': UserAgent().random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        self.session = requests.Session()
        self.data_list = []
        self.proxies = None
        self.timeout = 10
        self.max_retry = 3
        self.delay = random.uniform(1, 3)

    def set_proxy(self, proxy):
        self.proxies = {
            'http': proxy,
            'https': proxy
        }

    def get_html(self, url):
        retry = 0
        while retry < self.max_retry:
            try:
                response = self.session.get(
                    url,
                    headers=self.headers,
                    proxies=self.proxies,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    return response.text
                else:
                    print(f"Request failed, status code: {response.status_code}")
                    retry += 1
                    time.sleep(self.delay)
            except Exception as e:
                print(f"Request error: {str(e)}")
                retry += 1
                time.sleep(self.delay)
        return None

    def parse_city_list(self):
        url = f"{self.base_url}/changecity/"
        html = self.get_html(url)
        if not html:
            return []

        soup = BeautifulSoup(html, 'html.parser')
        city_links = soup.select('div.city-area a')
        cities = []
        for link in city_links:
            city_name = link.get_text().strip()
            city_url = link['href']
            cities.append({
                'name': city_name,
                'url': city_url
            })
        return cities

    def search_shops(self, city_url, keyword="", page=1):
        params = {
            'keyword': keyword,
            'page': page
        }
        url = f"{city_url}?{urlencode(params)}"
        html = self.get_html(url)
        if not html:
            return []

        soup = BeautifulSoup(html, 'html.parser')
        shop_list = soup.select('div[data-poiid]')
        shops = []

        for shop in shop_list:
            try:
                shop_id = shop['data-poiid']
                name = shop.select_one('div.title').get_text().strip()
                address = shop.select_one('div.address').get_text().strip()
                avg_price = shop.select_one('div.avg-price').get_text().strip() if shop.select_one('div.avg-price') else 'unknown'
                score = shop.select_one('div.star > span').get_text().strip() if shop.select_one('div.star > span') else '0'
                review_count = shop.select_one('div.comment').get_text().strip() if shop.select_one('div.comment') else '0'
                shop_url = shop.select_one('a[href^="/meishi/"]')['href']

                shops.append({
                    'id': shop_id,
                    'name': name,
                    'address': address,
                    'avg_price': avg_price,
                    'score': score,
                    'review_count': review_count,
                    'url': f"{self.base_url}{shop_url}"
                })
            except Exception as e:
                print(f"Error parsing shop info: {str(e)}")
                continue

        return shops

    def get_shop_detail(self, shop_url):
        html = self.get_html(shop_url)
        if not html:
            return None

        soup = BeautifulSoup(html, 'html.parser')
        detail = {}

        try:
            # Basic info
            detail['name'] = soup.select_one('h1.shop-name').get_text().strip()
            detail['address'] = soup.select_one('div.address').get_text().strip()
            detail['score'] = soup.select_one('span.star-score').get_text().strip()
            detail['avg_price'] = soup.select_one('span.avg-price').get_text().strip()

            # Contact info: the phone number sits in an inline script block
            phone_script = soup.find('script', text=re.compile('phone'))
            if phone_script:
                phone_match = re.search(r'"phone":"([^"]+)"', phone_script.text)
                if phone_match:
                    detail['phone'] = phone_match.group(1)

            # Business hours
            hours = soup.select_one('div.business-hours')
            if hours:
                detail['business_hours'] = hours.get_text().strip()

            # Other info
            info_items = soup.select('div.shop-info-item')
            for item in info_items:
                key = item.select_one('span.label').get_text().strip()
                value = item.select_one('span.value').get_text().strip()
                detail[key] = value

            return detail
        except Exception as e:
            print(f"Error parsing shop detail: {str(e)}")
            return None

    def save_to_csv(self, filename, data):
        if not data:
            return

        # Collect the union of keys across all rows so that records with
        # extra fields (e.g. phone, business_hours) don't break DictWriter
        fieldnames = []
        for row in data:
            for key in row.keys():
                if key not in fieldnames:
                    fieldnames.append(key)

        file_exists = os.path.isfile(filename)
        with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()
            writer.writerows(data)

    def run(self, city_name=None, keyword="", max_pages=5):
        cities = self.parse_city_list()
        if not cities:
            print("Failed to fetch the city list")
            return

        target_cities = []
        if city_name:
            target_cities = [city for city in cities if city_name in city['name']]
        else:
            target_cities = cities[:1]  # By default, only crawl the first city

        for city in target_cities:
            print(f"Start crawling city: {city['name']}")
            for page in range(1, max_pages + 1):
                print(f"Crawling page {page}...")
                shops = self.search_shops(city['url'], keyword, page)
                if not shops:
                    break

                for shop in shops:
                    print(f"Fetching shop detail: {shop['name']}")
                    detail = self.get_shop_detail(shop['url'])
                    if detail:
                        shop.update(detail)
                        self.data_list.append(shop)
                    time.sleep(random.uniform(0.5, 2))

                time.sleep(random.uniform(1, 3))

            # Save the data
            if self.data_list:
                filename = f"meituan_{city['name']}_{keyword}.csv"
                self.save_to_csv(filename, self.data_list)
                print(f"Data saved to {filename}")
                self.data_list = []

if name == "main": spider = MeituanSpider() # 可以设置代理 # spider.set_proxy("http://127.0.0.1:8888")

# 运行爬虫
spider.run(city_name="北京", keyword="火锅", max_pages=3)
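
Once a crawl finishes, the exported file can be spot-checked with the standard `csv` module. A small sketch, assuming the default filename pattern `meituan_<city>_<keyword>.csv` produced by `run()` above:

```python
import csv

# The filename follows the pattern used in run(); adjust it to match your actual output file.
with open("meituan_北京_火锅.csv", newline="", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        # phone may be missing for shops where no inline script was found
        print(row.get("name"), row.get("phone", ""), row.get("score"))
        if i >= 4:  # preview only the first few rows
            break
```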