"""Scrapy spider for a JavaScript-paginated listing site, driven by Selenium.

(Originally adapted from a blog post: "Scrapy爬取分页网站(结合Selenium)".)
"""
import re
import time
from urllib.parse import urljoin

import scrapy
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

from Dadan.Tools.dbSelectHandle import DbSelectHandle
from Dadan.items import DadanItem


class zycg_pljcSpider(scrapy.Spider):
    """Spider for procurement announcements on www.zycg.gov.cn.

    The listing page is rendered and paginated with JavaScript, so a
    Selenium-driven Chrome instance navigates the list while Scrapy
    fetches and parses the individual detail pages.
    """

    name = "zycg_pljc"
    allowed_domains = ["www.zycg.gov.cn"]
    start_urls = ["https://www.zycg.gov.cn/freecms/site/zygjjgzfcgzx/cggg/index.html"]

    # Detail-page links embedded in the listing's rendered HTML.
    # Compiled once instead of passing a raw pattern to re.findall per page.
    HREF_PATTERN = re.compile(r'href="(/freecms/site/zygjjgzfcgzx/ggxx/info[^"]+)"')

    # Number of listing pages to scrape (the original hard-coded range(1, 3)).
    PAGE_COUNT = 2

    def __init__(self, *args, **kwargs):
        super(zycg_pljcSpider, self).__init__(*args, **kwargs)
        self.browser_options = webdriver.ChromeOptions()
        # NOTE: 'detach' keeps the Chrome window open after the driver script
        # exits — it is NOT a headless option (the original comment claiming
        # "headless" was misleading). TODO(review): confirm the window is
        # intentionally left open; otherwise add a closed() hook calling
        # self.browser.quit() to avoid leaking the browser process.
        self.browser_options.add_experimental_option('detach', True)
        # Reduce the chance of the site detecting Selenium automation.
        self.browser_options.add_argument('--disable-blink-features=AutomationControlled')
        self.browser = webdriver.Chrome(options=self.browser_options)

    def parse(self, response):
        """Drive the JS-paginated listing via Selenium and yield one
        scrapy.Request per detail link found on each page."""
        self.browser.get(self.start_urls[0])
        time.sleep(3)  # crude wait for initial render; TODO: use WebDriverWait

        # Select the third entry of the category dropdown. move_to_element
        # targets the element directly; the original computed the centre from
        # .location/.size and used move_by_offset, which is relative to the
        # CURRENT pointer position (not the page origin) and can mis-click.
        category = self.browser.find_element(
            By.XPATH,
            '//div[@class="listLeftT"]//ul[@class="dropdown-menu1"]/li[3]',
        )
        ActionChains(self.browser).move_to_element(category).click().perform()

        for page in range(1, self.PAGE_COUNT + 1):
            time.sleep(2)  # wait for the AJAX-rendered list to settle
            # Scrape the CURRENT page before paginating: the original clicked
            # "next" first, which silently skipped page 1 of the results.
            for href in self.HREF_PATTERN.findall(self.browser.page_source):
                yield scrapy.Request(
                    url=urljoin(response.url, href),
                    callback=self.parse_detail,
                )
            if page < self.PAGE_COUNT:
                self.browser.find_element(
                    By.XPATH, '//button[@class="turnPage next-page"]'
                ).click()

    def parse_detail(self, response):
        """Parse one announcement detail page into a DadanItem.

        extract_first() returns None when the XPath matches nothing, so both
        stripped fields are guarded against AttributeError on missing markup.
        """
        title_raw = response.xpath('//h4[@class="info-title"]/text()').extract_first()
        title_val = (title_raw or "").strip()
        publish_raw = response.xpath('//b/text()').extract_first()

        dadan_item = DadanItem()
        # NOTE(review): original comment said not using ItemLoader "causes
        # async errors" — presumably about shared-item mutation across
        # concurrent callbacks; a fresh item per call avoids that. Verify.
        dadan_item['title'] = title_val
        dadan_item['content'] = response.xpath('//div[@class="info-text"]').extract_first()
        dadan_item['area_id'] = DbSelectHandle.areaPool(title_val)
        dadan_item['type_id'] = 42
        dadan_item['source_url'] = response.url
        dadan_item['publish_time'] = (publish_raw or "").strip()
        dadan_item['gather_time'] = DbSelectHandle.getGatherTime()
        yield dadan_item