Python-查询-循环列表-跳转-下载文件

137 阅读2分钟
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Path to the ChromeDriver executable.
# Use a raw string: the original 'C:\chromedriver-win64\...' contains the
# invalid escape sequence "\c", which raises a SyntaxWarning on modern
# CPython and breaks silently if the path ever starts with a real escape
# (e.g. "\t", "\n").
service = Service(r'C:\chromedriver-win64\chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Open the target page (placeholder URL — replace with the real one) and
# give the initial page time to load.
driver.get('https://xxx/xxxx/xxxx/xxxx')
time.sleep(50)

# Locate the search box by element id (placeholder — adjust to the real
# page), type the query, and submit it with Enter.
search_box = driver.find_element(By.ID, "xxxxxx")
search_box.send_keys("xxxxx")
search_box.send_keys(Keys.RETURN)

# Wait for the results to render instead of a blind time.sleep(30):
# block until the submit icon is present (up to 30 s). This is both
# faster when the page loads quickly and more reliable when it is slow.
# The XPath is a placeholder; adjust it to the actual page structure.
submit_icon = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, "//div[@class='xxxxxx']"))
)
submit_icon.click()

# NOTE(review): a disabled single-result variant (a triple-quoted string
# that resolved one div's onclick path with urljoin and navigated to it)
# was removed here as dead code — the loop below supersedes it for every
# result element, and a module-level string literal is not a comment: it
# was being built and discarded at runtime.

# Collect the relative URL embedded in each result element's onclick
# attribute. The attribute value is expected to contain the path as its
# first single-quoted segment, e.g. "open('/some/path')".
results = driver.find_elements(By.CSS_SELECTOR, 'div.xxxx.clickable')
relative_paths = []
for result in results:
    onclick_attr = result.get_attribute('xxxxxx')
    # get_attribute() returns None when the attribute is absent, and the
    # original split("'")[1] raised IndexError on values without quotes;
    # skip such elements instead of crashing mid-scrape.
    if not onclick_attr or onclick_attr.count("'") < 2:
        continue
    relative_path = onclick_attr.split("'")[1]
    relative_paths.append(relative_path)
    print(relative_path)

# Build the absolute URL for each result and open it, then click through
# to the linked page. The base URL and class name are placeholders.
base_url = 'https://xxxxx.xxxx.xxxx.xx/'
for relative_path in relative_paths:
    full_url = urljoin(base_url, relative_path)
    print(full_url)
    driver.get(full_url)
    # Wait (up to 30 s) for the link element to appear instead of
    # sleeping a fixed 30 s on every single page.
    result_link = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "xxxxxx"))
    )
    result_link.click()

# Allow time for the last action above to finish (e.g. a triggered
# download). NOTE(review): a fixed sleep is fragile — consider polling
# the download directory for the finished file instead.
time.sleep(30)


#result_link = driver.find_element(By.CLASS_NAME, "xxxxxx")
#result_link.click()

# Assume the file download links live inside elements with class
# "download-link"; the selector must be adjusted to the actual structure
# of the target page.
#download_links = driver.find_elements(By.CLASS_NAME, "xxxxx clickable")

# Loop over every download link and click it.
#for link in download_links:
    #link.click()
    # Wait for the download to complete; time.sleep is used here for
    # simplicity — real applications may need more sophisticated logic.
    #time.sleep(5)  # wait 5 seconds, depending on network speed and file size
#result_link = driver.find_element(By.XPATH, "/html/body/div[4]/div[1]/div/div[1]/div[1]/div[2]")
#result_link.click()

time.sleep(30)

#base_url = "https://xxxx"


#result_link = driver.find_element(By.CLASS_NAME, "xxxxx")
#result_link.click()

#time.sleep(30)
# Remember to close the browser when finished.
driver.quit()