实战(六)-小说章节爬取

153 阅读1分钟
# coding=utf-8

import pymongo
from lxml import etree
import requests
from fake_useragent import UserAgent


class Scrape:
    """Scrape the chapter list of a novel and store it in MongoDB.

    Usage: Scrape(book_url).get() then close().
    """

    def __init__(self, book_url):
        # Connection to a local MongoDB instance; released via close().
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.client['book']['random']  # target collection
        self.book_url = book_url
        # Random User-Agent header to reduce the chance of being blocked.
        # NOTE: fixed the non-standard "user-Agent" capitalization.
        self.headers = {"User-Agent": UserAgent().random}

    def get(self):
        """Fetch the book page, extract title + chapter names, save each to MongoDB.

        Best-effort: any error is printed and swallowed so the caller can
        still close() the connection afterwards.
        """
        try:
            res = requests.get(self.book_url, headers=self.headers, timeout=5)
            # Fail fast on HTTP errors instead of parsing an error page.
            res.raise_for_status()
            html = etree.HTML(res.text)
            title = html.xpath("//h1/text()")[0]  # novel title
            chapters = html.xpath("//div[@id='list']/dl/dd")  # chapter entries
            for chapter in chapters:
                chapter_name = chapter.xpath("./a/text()")
                if not chapter_name:
                    # Skip <dd> entries without link text (separators/ads)
                    # instead of crashing on chapter_name[0].
                    continue
                # insert_one() replaces insert(), which was removed in pymongo 4.
                self.collection.insert_one(
                    {'title': title, 'chapter_name': chapter_name[0]})
                print(f"{chapter_name[0]}")
        except Exception as e:
            # Deliberate best-effort: log and continue so close() still runs.
            print(e)

    def close(self):
        """Close the MongoDB client connection."""
        self.client.close()


if __name__ == '__main__':
    # Entry point: scrape the chapter list of the target book, then
    # release the MongoDB connection. get() swallows its own errors,
    # so close() is always reached.
    target_url = "http://www.xs5200.com/2_2156/"
    scraper = Scrape(target_url)
    scraper.get()
    scraper.close()

保存结果部分截图如下:

image.png