安装库
- requests http 请求
- lxml 解析html内容
目标站点
- 笔趣阁
完成效果
将小说下载到本地txt文件中
实现
- 小说章节列表页解析
url = "https://www.biquge7.top/50043" baseUrl = 'https://www.biquge7.top' payload = {} response = requests.request("GET", url, headers=request_headers, data=payload) html = response.text htmlTree = etree.HTML(html) # 取出小说baseInfo base_info_node = htmlTree.xpath("//div[@class='tits']")[0] novel = Novel() novel.name = get_one_value(base_info_node.xpath(".//h1/strong/text()")) -
小说章节内容解析
response = requests.request("GET", url, headers=request_headers, data={}) html = response.text.encode("utf-8") html_tree = etree.HTML(html) content = html_tree.xpath("//div[@class='text']")[0].xpath("string(.)") return str(content).replace(" ", "\n")
完整代码
import requests
from lxml import etree
# Browser-like HTTP headers passed with every request made by this script.
request_headers = {
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'sec-ch-ua-platform': '"Windows"',
    'upgrade-insecure-requests': '1',
    # Built from adjacent string literals instead of a backslash continuation
    # inside one literal: with the backslash form, any indentation on the
    # continuation line silently becomes part of the header value.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46'
    ),
}
class Novel:
    """Metadata describing one novel.

    Attributes:
        name: Title of the novel; an empty string until it has been set.
    """

    def __init__(self, name: str = ''):
        """Create a novel record.

        Args:
            name: Title of the novel. Defaults to an empty string so the
                object can still be created first and filled in afterwards.
        """
        # Novel title.
        self.name: str = name

    def __repr__(self) -> str:
        """Return a debugging representation that shows the title."""
        return f"{type(self).__name__}(name={self.name!r})"
def get_novel_info(url):
    """Download every chapter of a novel into ``<novel name>.txt``.

    Fetches the chapter-index page at ``url``, reads the novel title from the
    ``div.tits`` block, then fetches each chapter linked from
    ``div.list > ul > li`` through ``get_chapter_info`` and writes the chapter
    title followed by the chapter text to a UTF-8 text file in the current
    working directory.

    Args:
        url: Address of the novel's chapter-index page.

    Returns:
        The ``Novel`` object holding the parsed title.

    Raises:
        requests.RequestException: The index page could not be fetched
            (connection error, timeout, or HTTP error status).
        IndexError: The page contains no ``div.tits`` block, i.e. it does not
            look like a chapter-index page.
    """
    # Imported here so the function does not rely on a file-level import.
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the download forever.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()
    html_tree = etree.HTML(response.text)

    # Extract the novel's basic information.
    base_info_nodes = html_tree.xpath("//div[@class='tits']")
    if not base_info_nodes:
        raise IndexError(
            "no <div class='tits'> block found at %s; "
            "the page layout may have changed" % url
        )
    novel = Novel()
    novel.name = get_one_value(base_info_nodes[0].xpath(".//h1/strong/text()"))

    # Parse the chapter list.
    chapter_nodes = html_tree.xpath("//div[@class = 'list']/ul/li")
    with open(novel.name + ".txt", "w", encoding='utf-8') as f:
        for chapter in chapter_nodes:
            hrefs = chapter.xpath("./a/@href")
            if not hrefs:
                # List entry without a link: there is nothing to download.
                continue
            # Resolve the link against the index page instead of gluing it to
            # a hard-coded host, and keep the `url` argument untouched.
            chapter_url = urljoin(url, hrefs[0])
            # Progress output: dump the list entry currently being fetched.
            print(etree.tounicode(chapter, method='html'))
            chapter_text = get_chapter_info(chapter_url)
            chapter_title = get_one_value(chapter.xpath("./a/text()"))
            # Terminate both parts with a newline so the title does not run
            # into the chapter text and chapters do not run into each other.
            f.write(chapter_title + "\n")
            f.write(chapter_text + "\n")
    return novel
def get_one_value(values: list):
    """Return the first element of ``values``, or ``""`` when it is empty.

    Args:
        values: A list, e.g. the result of an XPath query.

    Returns:
        ``values[0]`` if the list has at least one element, otherwise an
        empty string.
    """
    # The previous check `len(values) < 0` could never be true, so an empty
    # list fell through to `values[0]` and raised IndexError.
    return values[0] if values else ""
def get_chapter_info(url):
    """Fetch one chapter page and return its text.

    Args:
        url: Address of the chapter page.

    Returns:
        The string value of the first ``div.text`` element on the page, with
        every space character replaced by a newline.

    Raises:
        requests.RequestException: The page could not be fetched (connection
            error, timeout, or HTTP error status).
        IndexError: The page contains no ``div.text`` element.
    """
    # A timeout keeps a stalled connection from hanging the download forever.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()
    html = response.text.encode("utf-8")
    html_tree = etree.HTML(html)
    text_nodes = html_tree.xpath("//div[@class='text']")
    if not text_nodes:
        raise IndexError(
            "no <div class='text'> element found at %s; "
            "the page layout may have changed" % url
        )
    # string(.) concatenates all descendant text of the content node.
    content = text_nodes[0].xpath("string(.)")
    return str(content).replace(" ", "\n")
# Script entry point: when the file is executed directly (not imported),
# download the novel whose chapter index lives at the address below.
if __name__ == '__main__':
    get_novel_info(
        "https://www.biquge7.top/50043"
    )