Python 下载小说

336 阅读1分钟

安装库

  • requests：发送 HTTP 请求
  • lxml：解析 HTML 内容

目标站点

  • 笔趣阁

完成效果

将小说下载到本地txt文件中

实现

  • 小说章节列表页解析

        url = "https://www.biquge7.top/50043"
        baseUrl = 'https://www.biquge7.top'
        payload = {}
        response = requests.request("GET", url, headers=request_headers, data=payload)
        html = response.text
        htmlTree = etree.HTML(html)
        # 取出小说baseInfo
        base_info_node = htmlTree.xpath("//div[@class='tits']")[0]
        novel = Novel()
        novel.name = get_one_value(base_info_node.xpath(".//h1/strong/text()"))
    
  • 小说章节内容解析

        response = requests.request("GET", url, headers=request_headers, data={})
        html = response.text.encode("utf-8")
        html_tree = etree.HTML(html)
        content = html_tree.xpath("//div[@class='text']")[0].xpath("string(.)")
        return str(content).replace(" ", "\n")
    

完整代码

import requests
from lxml import etree
# Browser-like HTTP headers passed to every request this script makes.
request_headers = {
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    # Built from adjacent literals: a backslash continuation *inside* a
    # string literal would embed the next line's indentation in the value.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46'
    ),
}


class Novel:
    """Holds the information extracted about a single novel."""

    def __init__(self) -> None:
        # Title of the novel; empty until a caller assigns it.
        self.name: str = ""


def get_novel_info(url, timeout=10):
    """Download every chapter of a novel into ``<novel name>.txt``.

    Fetches the chapter-list page at *url*, reads the novel title from the
    ``div.tits`` block, then visits each chapter link found under
    ``div.list`` in page order and writes the chapter title followed by the
    chapter text to a UTF-8 file in the current working directory.

    Args:
        url: Address of the novel's chapter-list page.
        timeout: Seconds to wait for the list page before giving up.

    Raises:
        requests.HTTPError: If the list page answers with an error status.
        IndexError: If the page contains no ``div.tits`` block.
    """
    # Chapter links are concatenated onto this site root.
    base_url = 'https://www.biquge7.top'

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url, headers=request_headers, timeout=timeout)
    response.raise_for_status()
    html_tree = etree.HTML(response.text)

    # The novel-level information sits in the title block of the page.
    base_info_node = html_tree.xpath("//div[@class='tits']")[0]
    novel = Novel()
    novel.name = get_one_value(base_info_node.xpath(".//h1/strong/text()"))

    # One <li> per chapter, each wrapping a link to the chapter page.
    chapter_nodes = html_tree.xpath("//div[@class = 'list']/ul/li")
    with open(novel.name + ".txt", "w", encoding='utf-8') as f:
        for chapter in chapter_nodes:
            hrefs = chapter.xpath("./a/@href")
            if not hrefs:
                # List items without a link have no chapter to download.
                continue
            # A separate name keeps the ``url`` argument intact.
            chapter_url = base_url + get_one_value(hrefs)
            chapter_title = get_one_value(chapter.xpath("./a/text()"))
            print(chapter_title)  # progress indicator
            chapter_text = get_chapter_info(chapter_url)
            # Explicit newlines keep the title and body from running
            # together in the output file.
            f.write(chapter_title + "\n")
            f.write(chapter_text + "\n\n")


def get_one_value(values: list):
    """Return the first element of *values*, or ``""`` when it is empty.

    Intended for XPath results, which come back as lists: it collapses
    "zero or more matches" into a single value.

    Args:
        values: Sequence of matches; may be empty.

    Returns:
        ``values[0]`` when at least one element exists, otherwise ``""``.
    """
    # A length can never be negative, so emptiness is tested by truthiness;
    # an empty result yields "" instead of raising IndexError.
    return values[0] if values else ""


def get_chapter_info(url, timeout=10):
    """Fetch one chapter page and return its text.

    Args:
        url: Address of the chapter page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The concatenated text of the first ``div.text`` element, with every
        space character replaced by a line break.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        IndexError: If the page contains no ``div.text`` element.
    """
    # Without a timeout requests waits indefinitely on a stalled server;
    # raise_for_status keeps error pages from being parsed as chapter text.
    response = requests.get(url, headers=request_headers, timeout=timeout)
    response.raise_for_status()

    # NOTE(review): the decoded text is re-encoded to UTF-8 bytes before
    # parsing; presumably this avoids lxml rejecting str input that carries
    # an encoding declaration -- confirm before simplifying.
    html_bytes = response.text.encode("utf-8")
    html_tree = etree.HTML(html_bytes)

    # string(.) flattens the element and all of its descendants to text.
    content_nodes = html_tree.xpath("//div[@class='text']")
    content = content_nodes[0].xpath("string(.)")
    return str(content).replace(" ", "\n")


# Script entry point: download the sample novel when run directly.
if __name__ == "__main__":
    novel_index_url = "https://www.biquge7.top/50043"
    get_novel_info(novel_index_url)