BeautifulSoup Crawler Example


Import dependencies

import requests
from bs4 import BeautifulSoup
import time
import concurrent.futures
from collections import OrderedDict  # used for de-duplication

Set global variables

url = 'https://www.bsuc.edu.cn/xwzx/xxyw1.htm'
countList = 0  # number of items crawled
contentURLs = []  # detail-page URLs

Define a function to fetch the detail page and write it to a txt file

# Fetch the detail page and write it to the txt file
def getContent(url, pageIndex):
    global countList
    print('Item ' + str(pageIndex))
    # contentURL = 'https://www.bsuc.edu.cn/' + url['href'].replace('../', '')
    # Request the detail page
    contentRES = requests.get(url)
    contentRES.encoding = 'utf-8'
    if contentRES.status_code != 200:
        countList -= 1
        print('Detail-page request failed: ' + str(contentRES.status_code) + '|' + url)
        return
    contentList = BeautifulSoup(contentRES.text, 'html.parser')  # parse with html.parser
    # Extract the fields
    #  title
    titleBox = contentList.find(attrs={'class': 'title'})
    titleElement = titleBox.find('h3') if titleBox is not None else None
    title = titleElement.get_text() if titleElement is not None else ''
    #  meta information (date, source, ...)
    infoElement = contentList.find(attrs={'class': 'news-msg text-center'})
    info = infoElement.get_text() if infoElement is not None else ''
    #  body content
    contentElement = contentList.find(attrs={'class': 'v_news_content'})
    content = contentElement.get_text() if contentElement is not None else ''
    # Append the record to the txt file
    with open('去重.txt', 'a', encoding='utf-8') as file:
        file.write('title:' + title)
        file.write('\ninfo:' + info.replace('\n', ''))
        file.write('\ncontent:' + content)
        file.write('######\n')
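
getContent is submitted to a thread pool below, so several threads may append to 去重.txt at the same time. Individual write() calls are serialized by CPython, but the four writes that make up one record can still interleave across threads. A minimal sketch, assuming a module-level threading.Lock (write_lock and writeRecord are not in the original code), that keeps each record atomic:

import threading

write_lock = threading.Lock()  # hypothetical module-level lock

def writeRecord(title, info, content):
    # Hold the lock so one record's lines are never interleaved with another thread's output
    with write_lock:
        with open('去重.txt', 'a', encoding='utf-8') as file:
            file.write('title:' + title)
            file.write('\ninfo:' + info.replace('\n', ''))
            file.write('\ncontent:' + content)
            file.write('######\n')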

Define a function to collect the detail-page URLs

# Collect the detail-page URLs from one listing page
def getContentUrl(PageUrl):
    res = requests.get(PageUrl)
    res.encoding = 'utf-8'
    if res.status_code != 200:
        print('Listing-page request failed: ' + str(res.status_code) + '|' + PageUrl)
        return
    soup = BeautifulSoup(res.text, 'html.parser')  # parse with html.parser
    urls = soup.select('.box-txt a')
    for j in urls:
        contentURL = 'https://www.bsuc.edu.cn/' + j['href'].replace('../', '')
        contentURLs.append(contentURL)
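
The absolute URL is built here by stripping '../' from the relative href and prefixing the site root. A more general alternative, sketched below with urllib.parse.urljoin and a made-up href for illustration, resolves any relative path against the listing-page URL and gives the same result for this site:

from urllib.parse import urljoin

def buildContentUrl(pageUrl, href):
    # Resolve a relative href against the listing-page URL instead of string-replacing '../'
    return urljoin(pageUrl, href)

# Hypothetical example:
# buildContentUrl('https://www.bsuc.edu.cn/xwzx/xxyw1.htm', '../info/1041/12345.htm')
# -> 'https://www.bsuc.edu.cn/info/1041/12345.htm'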

Get the page count

# Get the largest page number
pageNumbers = requests.get(url)
pageNumbers.encoding = 'utf-8'
if pageNumbers.status_code != 200:
    print('Page-count request failed: ' + str(pageNumbers.status_code) + '|' + url)
    pageNumber = 0
else:
    pageSoup = BeautifulSoup(pageNumbers.text, 'html.parser')  # parse with html.parser
    pageNumberText = pageSoup.select('.p_no')[-1]
    pageNumber = int(pageNumberText.getText())
print(str(pageNumber) + ' pages in total')
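
For reference, the paging scheme the code below assumes: page 1 is the entry URL itself, and older pages live at xxyw1/<n>.htm with n counting down from pageNumber - 1. A minimal sketch of the full URL list (equivalent to the loop in the next section, assuming pageNumber > 0; pageUrls is not used by the original script):

pageUrls = [url] + [url.replace('.htm', '/' + str(pageNumber - i) + '.htm')
                    for i in range(1, pageNumber)]
# e.g. with pageNumber = 3:
# ['https://www.bsuc.edu.cn/xwzx/xxyw1.htm',
#  'https://www.bsuc.edu.cn/xwzx/xxyw1/2.htm',
#  'https://www.bsuc.edu.cn/xwzx/xxyw1/1.htm']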

Start crawling

contentURLs = []
# Create a thread pool for the listing pages
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Collect the submitted tasks
    futures1 = []
    for i in range(pageNumber):
        if i == 0:
            PageUrl = url
        else:
            PageUrl = url.replace('.htm', '/' + str(pageNumber - i) + '.htm')
        future = executor.submit(getContentUrl, PageUrl)
        futures1.append(future)
print('Finished collecting links')
# De-duplicate while keeping the original order
unique_list = list(OrderedDict.fromkeys(contentURLs))
countList = len(unique_list)
print(str(len(contentURLs)) + ' links collected, ' + str(len(unique_list)) + ' left after de-duplication')
# Create a thread pool for the detail pages
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Collect the submitted tasks
    futures = []
    for i, url in enumerate(unique_list):
        # Submit one task per detail page
        future = executor.submit(getContent, url, i)
        futures.append(future)

print('Crawl finished, ' + str(countList) + ' items in total')
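
Exceptions raised inside getContent are stored on the futures and silently dropped unless result() is called on them. An optional sketch (not part of the original script) that surfaces such errors once the pool has finished:

# Report any exception raised inside the worker threads
for future in concurrent.futures.as_completed(futures):
    try:
        future.result()
    except Exception as exc:
        print('Worker raised an exception: ' + str(exc))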