Scrapy 爬取旅游景点相关数据(三)

129 阅读4分钟
  • 这一期我们将之前爬取到的景点数据进行解析,并且保存为excel,便于后续使用,本节包含 (1) 景点数据解析 (2)数据保存到excel

1 编写爬虫

这次继续改进第二节的爬虫,新建一个爬虫文件叫 spiders/qiongyou_3.py

因为从这一节开始,我们直接解析 Selenium 拿到的页面源码(page_source),所以不需要再保存 html 了。

编写一个自定义程序去解析页面的源码提取我们需要的信息

        while True:
            # Only crawl the first 3 pages for now (testing)
            if page_number > 3:
                break

            # Parse the page currently rendered by the driver and emit each
            # item that parse_page extracts from it
            page_source = self.driver.page_source
            for item in self.parse_page(page_source):
                yield item

parse_page方法这样写

    # Parse one rendered listing page
    def parse_page(self, page_source):
        """Yield a TourItem for every sight found in the rendered page.

        Args:
            page_source: HTML text of the listing page.

        Yields:
            TourItem with ``title`` and ``title_en`` set. A name that is
            missing from the page becomes an empty string.
        """
        response = scrapy.Selector(text=page_source)
        print(response)
        sights = response.xpath('//ul[@id="poiLists"]/li')
        print('sights=', sights)
        for sight in sights:
            item = TourItem()
            # .get() returns None when the node is absent; fall back to ''
            # before stripping so one odd <li> cannot abort the whole crawl.
            item['title'] = (sight.xpath('.//h3[@class="title fontYaHei"]/a/text()').get() or '').strip()
            item['title_en'] = (sight.xpath('.//h3[@class="title fontYaHei"]/a/span/text()').get() or '').strip()

            # Green console output, handy while wiring the flow together
            print(f"\033[92m{item['title']}\033[0m")
            print(f"\033[92m{item['title_en']}\033[0m")

            yield item

先打通流程,所以只提取2个字段,景点中文名和英文名

这边title_en这么写的原因是景点的外文名可能是不存在的,如果不加处理地提取会报错导致程序直接中断。

2 修改 items

修改items.py ,定义我们自己的数据结构

# Data structure for one scraped sight
class TourItem(scrapy.Item):
    """Item carrying the fields extracted for a single sight."""

    title = scrapy.Field()  # Chinese name of the sight
    title_en = scrapy.Field()  # foreign-language name; '' when the page has none

3 修改 Pipelines

利用pipelines 来对数据进行保存。

import pandas as pd

class TourPipeline:
    """Item pipeline that buffers every item and dumps them to Excel at the end."""

    def __init__(self):
        # One plain dict per scraped item, in arrival order.
        self.data = list()

    def process_item(self, item, spider):
        """Record a dict copy of *item* and pass the item on unchanged."""
        row = dict(item)
        self.data.append(row)
        return item

    def close_spider(self, spider):
        """Write all buffered rows to ``tokyo_sights.xlsx`` and log that it was saved."""
        # Save the Tokyo sights to an Excel file with pandas
        frame = pd.DataFrame(self.data)
        frame.to_excel('tokyo_sights.xlsx', index=False)
        spider.log('Saved data to tokyo_sights.xlsx')

需要安装 pandas;另外 pandas 写入 .xlsx 文件还依赖 openpyxl,如果保存时报缺少 openpyxl 的错误,请一并安装(pip install openpyxl)。

pip install pandas

还需要在settings.py 中打开配置。

# Register the pipeline so Scrapy passes every scraped item through it.
# The integer is the pipeline's order value: pipelines with lower values
# run first (values are conventionally kept in the 0-1000 range).
ITEM_PIPELINES = {
   'tutorial2.pipelines.TourPipeline': 300,
}

4 执行代码

scrapy crawl qys3 

在测试的时候发现,第一次调试时驱动通常启动得比较慢(原因我们下一期再说),之后再调试就很快了,并且数据可以正常保存到 Excel。

5 完整qiongyou_3.py代码 & items.py 代码

后续继续解析其他字段,这边就直接贴出代码。

# qiongyou_3.py
import re
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.remote.remote_connection import LOGGER
import logging
import time

from tutorial2.items import TourItem

class QiongyouSpider(scrapy.Spider):
    """Crawl the Qyer Tokyo sight listing through a headless Chrome driver.

    Scrapy schedules the start URL, while Selenium renders the listing and
    clicks through its pagination. Every rendered page is parsed into
    ``TourItem`` objects.
    """

    name = 'qys3'
    allowed_domains = ['qyer.com']
    # Must be a bare URL: with Markdown-style <...> wrapping Scrapy rejects it
    # because the string no longer starts with a scheme.
    start_urls = ['https://place.qyer.com/tokyo/sight/']

    # Upper bound on the number of listing pages to visit. Can be overridden
    # per run with ``scrapy crawl qys3 -a max_pages=3``; spider arguments
    # arrive as strings, hence the int() conversion in parse().
    max_pages = 40

    _LISTING_XPATH = '//ul[@id="poiLists"]'
    _REVIEW_COUNT_RE = re.compile(r'(\d+)人点评')
    _RANK_RE = re.compile(r'第(\d+)位')

    def __init__(self, *args, **kwargs):
        """Start the headless Chrome session used to render the listing pages."""
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        # Keep Selenium's remote-connection logger from flooding the output.
        LOGGER.setLevel(logging.WARNING)

        # No driver path is passed here, so chromedriver has to be
        # discoverable by Selenium itself (e.g. on PATH).
        self.driver = webdriver.Chrome(options=options)

    def start_requests(self):
        """Schedule one request per start URL, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Render the listing in Chrome and yield items from each page.

        Visits at most ``max_pages`` pages, following the "next page" link
        until it is missing or the next page fails to load.

        Args:
            response: Scrapy response for the listing URL; only its URL is
                used, the HTML that gets parsed comes from the driver.

        Yields:
            TourItem objects produced by ``parse_page``.
        """
        self.driver.get(response.url)
        self._wait_for_listing()

        max_pages = int(self.max_pages)
        for page_number in range(1, max_pages + 1):
            yield from self.parse_page(self.driver.page_source)

            # Do not click "next" after the last page we intend to read.
            if page_number == max_pages:
                break
            if not self._go_to_next_page():
                break

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Doing the cleanup here instead of at the end of ``parse`` releases
        Chrome even when ``parse`` raises or is never called, and keeps the
        driver usable if more than one start URL is configured.

        Args:
            reason: Close reason supplied by Scrapy (unused).
        """
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is not None:
            driver.quit()

    def _wait_for_listing(self):
        """Block (up to 10 s) until the sight list element is in the DOM."""
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, self._LISTING_XPATH))
        )

    def _go_to_next_page(self):
        """Click the "next page" link and wait for the list; return False on failure."""
        try:
            next_button = self.driver.find_element(By.XPATH, '//a[@title="下一页"]')
            # The click is issued through JavaScript.
            self.driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)  # fixed pause to let the next page load
            self._wait_for_listing()
        except Exception as e:
            # Best effort: a missing link or a timeout both end the pagination.
            self.log(f"No more pages or failed to load next page: {e}")
            return False
        return True

    @staticmethod
    def _text(node, xpath):
        """Return the stripped first match of *xpath* under *node*, or ''."""
        return (node.xpath(xpath).get() or '').strip()

    @staticmethod
    def _first_int(pattern, text):
        """Return group 1 of the first *pattern* match in *text* as int, or 0."""
        match = pattern.search(text)
        return int(match.group(1)) if match else 0

    # Parse one rendered listing page
    def parse_page(self, page_source):
        """Yield a fully populated TourItem for every sight in the page.

        Args:
            page_source: HTML text of the rendered listing page.

        Yields:
            TourItem; text fields missing from the page become '' and the
            numeric fields ``comment`` and ``rank`` become 0.
        """
        response = scrapy.Selector(text=page_source)
        print(response)
        sights = response.xpath(self._LISTING_XPATH + '/li')
        print('sights=', sights)
        for sight in sights:
            item = TourItem()
            # NOTE: the assignment order below is kept deliberately; it is the
            # key order of dict(item) and therefore the Excel column order.
            item['title'] = self._text(sight, './/h3[@class="title fontYaHei"]/a/text()')
            item['title_en'] = self._text(sight, './/h3[@class="title fontYaHei"]/a/span/text()')
            item['img'] = self._text(sight, './/p[@class="pics"]/a/img/@src')
            item['score'] = self._text(sight, './/div[@class="info"]/span[@class="grade"]/text()')
            comment = self._text(sight, './/span[@class="dping"]/a/text()')

            # Only protocol-relative links ("//host/...") need the scheme
            # added; a missing link stays '' instead of becoming "https:".
            href = self._text(sight, './/span[@class="dping"]/a/@href')
            item['comment_url'] = ('https:' + href) if href.startswith('//') else href

            item['rank_title'] = self._text(sight, './/div[@class="info"]//span[@class="infoSide"]/text()')
            item['select_user'] = self._text(sight, './/p[@class="user"]/a/img/@src')
            item['select_comment'] = self._text(
                sight, './/div[@class="txt"]/text() | .//p[@class="txt"]/text()'
            )

            rank = self._text(sight, './/div[@class="info"]//em[@class="rank orange"]/text()')

            # "N人点评" -> review count, "第N位" -> rank position; 0 when absent.
            item['comment'] = self._first_int(self._REVIEW_COUNT_RE, comment)
            item['rank'] = self._first_int(self._RANK_RE, rank)

            # Green console dump of every field, in a fixed order.
            for field in (
                'title', 'title_en', 'img', 'score', 'comment_url',
                'comment', 'rank_title', 'rank',
                'select_user', 'select_comment',
            ):
                print(f"\033[92m{item[field]}\033[0m")

            yield item

# items.py

import scrapy

# Data structure for one scraped sight
class TourItem(scrapy.Item):
    """Item carrying every field the spider extracts for a single sight."""

    title = scrapy.Field()  # Chinese name of the sight
    title_en = scrapy.Field()  # foreign-language name; '' when the page has none
    img = scrapy.Field()  # src of the sight's picture
    score = scrapy.Field()  # text of the "grade" element, kept as a string
    comment = scrapy.Field()  # review count parsed from "N人点评"; 0 when absent
    comment_url = scrapy.Field()  # link taken from the review-count anchor
    rank_title = scrapy.Field()  # text of the "infoSide" element (presumably the ranking caption)
    rank = scrapy.Field()  # position parsed from "第N位"; 0 when absent
    select_user = scrapy.Field()  # src of the image in the "user" block (presumably a reviewer avatar)
    select_comment = scrapy.Field()  # text of the "txt" block (presumably a featured review)

最后爬取到的excel 效果:

(效果截图:image.png)

下一期我们就把数据存储到 MySQL 中。