Import dependencies
import requests
from bs4 import BeautifulSoup
import time
import concurrent.futures
from collections import OrderedDict
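requests and BeautifulSoup (the bs4 package) are third-party libraries; if they are not installed yet, a typical command is pip install requests beautifulsoup4.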
Set up global variables
url = 'https://www.bsuc.edu.cn/xwzx/xxyw1.htm'  # entry page of the news list
countList = 0  # number of articles successfully written
contentURLs = []  # detail-page URLs collected from every list page
Fetch each detail page and append it to a txt file
def getContent(url, pageIndex):
    global countList
    print('Item ' + str(pageIndex))
    contentRES = requests.get(url)
    contentRES.encoding = 'utf-8'
    if contentRES.status_code != 200:
        # '-=' on a shared counter is not strictly atomic across threads, but good enough for a rough count
        countList -= 1
        print('Detail page request error: ' + str(contentRES.status_code) + '|' + url)
        return
    contentList = BeautifulSoup(contentRES.text, 'html.parser')
    # attrs must be a dict ({'class': 'title'}), not a set; the container may also
    # be missing, so guard before looking for the <h3> inside it
    titleBox = contentList.find(attrs={'class': 'title'})
    titleElement = titleBox.find('h3') if titleBox is not None else None
    title = titleElement.get_text() if titleElement is not None else ''
    infoElement = contentList.find(attrs={'class': 'news-msg text-center'})
    info = infoElement.get_text() if infoElement is not None else ''
    contentElement = contentList.find(attrs={'class': 'v_news_content'})
    content = contentElement.get_text() if contentElement is not None else ''
    # append one record per article; each worker thread opens the file briefly
    with open('去重.txt', 'a', encoding='utf-8') as file:
        file.write('title:' + title)
        file.write('\ninfo:' + info.replace('\n', ''))
        file.write('\ncontent:' + content)
        file.write('######\n')
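To sanity-check the parsing on a single article before launching the full crawl, the function can be called on its own, e.g. from a REPL. A minimal sketch; test_url is a hypothetical placeholder, substitute one real detail-page URL taken from the news list:

test_url = 'https://www.bsuc.edu.cn/...'  # hypothetical placeholder, not a real article URL
getContent(test_url, 0)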
Collect the detail-page URLs from one list page
def getContentUrl(PageUrl):
    res = requests.get(PageUrl)
    res.encoding = 'utf-8'
    if res.status_code != 200:
        print('List page request error: ' + str(res.status_code) + '|' + PageUrl)
        return
    soup = BeautifulSoup(res.text, 'html.parser')
    urls = soup.select('.box-txt a')
    for j in urls:
        # the hrefs are relative ('../...'), so rebuild them against the site root
        contentURL = 'https://www.bsuc.edu.cn/' + j['href'].replace('../', '')
        contentURLs.append(contentURL)
Get the number of list pages
pageNumbers = requests.get(url)
pageNumbers.encoding = 'utf-8'
if pageNumbers.status_code != 200:
    print('Page-count request error: ' + str(pageNumbers.status_code) + '|' + url)
    pageNumber = 0
else:
    pageSoup = BeautifulSoup(pageNumbers.text, 'html.parser')
    # the last pagination link ('.p_no') carries the total page count
    pageNumberText = pageSoup.select('.p_no')[-1]
    pageNumber = int(pageNumberText.getText())
print('Total: ' + str(pageNumber) + ' pages')
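On this site the newest list page is xxyw1.htm itself, while the older pages appear to live at xxyw1/<n>.htm with <n> counting down, which is exactly the URL the loop below builds for every i > 0.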
Start crawling
contentURLs = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures1 = []
    for i in range(pageNumber):
        if i == 0:
            PageUrl = url  # the newest list page is the entry URL itself
        else:
            PageUrl = url.replace('.htm', '/' + str(pageNumber - i) + '.htm')
        future = executor.submit(getContentUrl, PageUrl)
        futures1.append(future)
# the executor's context manager waits until every list page has been fetched
print('Finished collecting links')
unique_list = list(OrderedDict.fromkeys(contentURLs))
countList = len(unique_list)
print('Collected ' + str(len(contentURLs)) + ' links, ' + str(len(unique_list)) + ' left after de-duplication')
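OrderedDict.fromkeys keeps only the first occurrence of each URL, in its original position, so converting back to a list drops duplicates without reshuffling; for example, list(OrderedDict.fromkeys(['a', 'b', 'a'])) gives ['a', 'b']. On Python 3.7+ a plain dict.fromkeys(contentURLs) behaves the same way.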
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for i, contentURL in enumerate(unique_list):
        future = executor.submit(getContent, contentURL, i)
        futures.append(future)
print('Crawl finished, total: ' + str(countList) + ' articles')
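One possible refinement, not part of the original script: exceptions raised inside getContent (timeouts, parse errors) currently disappear inside the worker threads. A minimal sketch, assuming the futures list built above, that surfaces them once the pool has finished:

for future in concurrent.futures.as_completed(futures):
    try:
        future.result()  # re-raises any exception that occurred inside getContent
    except Exception as exc:
        print('Worker raised: ' + str(exc))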