import re
import time
from urllib.parse import urljoin
import scrapy
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from Dadan.Tools.dbSelectHandle import DbSelectHandle
from Dadan.items import DadanItem
class zycg_pljcSpider(scrapy.Spider):
    """Spider for procurement announcements on www.zycg.gov.cn.

    The listing page is JavaScript-rendered, so a Selenium-driven Chrome
    instance loads it, selects the third category tab, and pages through
    the results; detail pages are then fetched normally through Scrapy.
    """

    name = "zycg_pljc"
    allowed_domains = ["www.zycg.gov.cn"]
    start_urls = ["https://www.zycg.gov.cn/freecms/site/zygjjgzfcgzx/cggg/index.html"]

    # Detail-page link pattern; compiled once instead of per parse() call.
    DETAIL_HREF_RE = re.compile(r'href="(/freecms/site/zygjjgzfcgzx/ggxx/info[^"]+)"')

    # Number of listing pages to harvest (the initial page plus next-page clicks).
    PAGES_TO_SCRAPE = 3

    def __init__(self, *args, **kwargs):
        # Forward crawler-supplied arguments (e.g. `scrapy crawl -a key=val`)
        # to the base class instead of discarding them.
        super().__init__(*args, **kwargs)
        self.browser_options = webdriver.ChromeOptions()
        self.browser_options.add_experimental_option('detach', True)
        # Hide the navigator.webdriver automation flag to reduce bot detection.
        self.browser_options.add_argument('--disable-blink-features=AutomationControlled')
        self.browser = webdriver.Chrome(options=self.browser_options)

    def closed(self, reason):
        """Scrapy close callback: release the chromedriver/browser process.

        Without this the Chrome instance started in __init__ leaks on every
        crawl (the 'detach' option keeps it alive past driver shutdown).
        """
        try:
            self.browser.quit()
        except Exception:
            # Best-effort cleanup; the browser may already be gone.
            pass

    def parse(self, response):
        """Render the listing in Chrome, page through it, and yield detail requests.

        Yields:
            scrapy.Request for each unique announcement detail URL found on
            every visited listing page.
        """
        self.browser.get(self.start_urls[0])
        time.sleep(3)  # crude wait for the JS-rendered listing; TODO: use WebDriverWait

        # Click the centre of the third category tab.  move_by_offset is
        # used (rather than element.click()) presumably to look less like
        # automation — kept as in the original.
        tab = self.browser.find_element(
            By.XPATH, '//div[@class="listLeftT"]//ul[@class="dropdown-menu1"]/li[3]')
        click_x = tab.location['x'] + tab.size['width'] / 2
        click_y = tab.location['y'] + tab.size['height'] / 2
        ActionChains(self.browser).move_by_offset(click_x, click_y).click().perform()

        # Harvest EVERY visited page.  The original read page_source only
        # after the paging loop, so pages before the last were skipped.
        seen = set()
        for page in range(self.PAGES_TO_SCRAPE):
            time.sleep(2)  # let the page (re)render after the click
            for href in self.DETAIL_HREF_RE.findall(self.browser.page_source):
                if href not in seen:
                    seen.add(href)
                    yield scrapy.Request(url=urljoin(response.url, href),
                                         callback=self.parse_detail)
            if page < self.PAGES_TO_SCRAPE - 1:
                self.browser.find_element(
                    By.XPATH, '//button[@class="turnPage next-page"]').click()

    def parse_detail(self, response):
        """Extract one announcement detail page into a DadanItem.

        Missing XPath matches yield '' rather than raising AttributeError
        on `.strip()` of None (the original crashed on layout changes).
        """
        dadan_item = DadanItem()
        title_val = (response.xpath('//h4[@class="info-title"]/text()').extract_first() or '').strip()
        dadan_item['title'] = title_val
        dadan_item['content'] = response.xpath('//div[@class="info-text"]').extract_first()
        dadan_item['area_id'] = DbSelectHandle.areaPool(title_val)
        dadan_item['type_id'] = 42
        dadan_item['source_url'] = response.url
        dadan_item['publish_time'] = (response.xpath('//b/text()').extract_first() or '').strip()
        dadan_item['gather_time'] = DbSelectHandle.getGatherTime()
        yield dadan_item