# coding=utf-8
import pymongo
from lxml import etree
import requests
from fake_useragent import UserAgent
class Scrape:
    """Scrape a novel's chapter index page and persist each chapter name to MongoDB."""

    def __init__(self, book_url):
        """Open a MongoDB connection and prepare the request headers.

        :param book_url: URL of the novel's chapter-index page to scrape
        """
        self.client = pymongo.MongoClient(host='localhost', port=27017)  # MongoDB connection object
        self.collection = self.client['book']['random']  # target collection: db 'book', coll 'random'
        self.book_url = book_url
        self.headers = {"user-Agent": UserAgent().random}  # random User-Agent to avoid trivial blocking

    def get(self):
        """Download the index page, extract the title and chapter names, store them.

        Each chapter becomes one document: {'title': ..., 'chapter_name': ...}.
        Any failure (network, parse, DB) is reported and swallowed — this is a
        best-effort scrape, not a transactional import.
        """
        try:
            res = requests.get(self.book_url, headers=self.headers, timeout=5)
            html = etree.HTML(res.text)
            title = html.xpath("//h1/text()")[0]  # novel title from the page heading
            chapters = html.xpath("//div[@id='list']/dl/dd")  # one <dd> per chapter link
            for chapter in chapters:
                chapter_name = chapter.xpath("./a/text()")
                if not chapter_name:
                    # Some <dd> entries carry no <a> text; skip instead of IndexError.
                    continue
                # insert_one(): Collection.insert() was removed in pymongo 4.x.
                self.collection.insert_one({'title': title, 'chapter_name': chapter_name[0]})
                print(chapter_name[0])
        except Exception as e:  # broad on purpose: report and keep the script alive
            print(e)

    def close(self):
        """Release the MongoDB client connection."""
        self.client.close()
if __name__ == '__main__':
    # Entry point: scrape one novel's chapter index into MongoDB.
    url = "http://www.xs5200.com/2_2156/"
    scraper = Scrape(url)
    try:
        scraper.get()
    finally:
        # Guarantee the MongoDB connection is released even if get() raises.
        scraper.close()
# 保存结果部分截图如下 (screenshot of the saved results follows — article text, not code)