# Source site: 国学梦 (www.guoxuemeng.com)
from urllib.parse import urljoin

import requests
from lxml import etree
# Index page that lists every chapter link of the book being scraped.
INDEX_URL = 'https://www.guoxuemeng.com/guoxue/zhenjiudacheng/'
# Absolute XPaths; they are tied to the page layout this script was written
# against and will need updating if the site changes its markup.
CHAPTER_ITEM_XPATH = '//html/body/div[3]/div[2]/div[5]/ul/li'
PARAGRAPH_XPATH = '/html/body/div[3]/div[2]/div[2]/p'
# A paragraph consisting of this text marks the end of a chapter's body.
STOP_MARKER = "关键词:针灸大成"
# Name of the Markdown file the script writes.
OUTPUT_FILENAME = "a.md"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def fetch_html(page_url, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return it parsed as an lxml HTML tree.

    Args:
        page_url: Absolute URL of the page to download.
        timeout: Seconds to wait before the request is abandoned.

    Returns:
        The root element produced by ``etree.HTML`` (may be ``None`` when
        the document is empty).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(page_url, timeout=timeout)
    response.raise_for_status()
    # The pages are decoded as UTF-8 regardless of what the headers claim.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def parse_title(link_text):
    """Split a chapter link's text into ``(volume, chapter)``.

    Link texts are expected to look like ``book·volume·chapter``.  Texts
    with fewer "·"-separated parts no longer crash: with two parts the
    second is taken as the chapter, with one part the whole text is the
    chapter; in both cases the volume is ``None``.

    Args:
        link_text: Text of the ``<a>`` element, or ``None``.

    Returns:
        A ``(volume, chapter)`` tuple; ``volume`` is ``None`` when the text
        carries no volume part, ``chapter`` is ``""`` for empty input.
    """
    parts = [part.strip() for part in (link_text or "").split("·")]
    if len(parts) >= 3:
        return parts[1], parts[2]
    return None, parts[-1]


def normalize_marker(text):
    """Return *text* stripped, with full-width colons folded to ASCII."""
    return text.strip().replace("\uff1a", ":")


def extract_body_text(paragraphs, stop_marker=STOP_MARKER, separator=""):
    """Concatenate paragraph texts up to (not including) the stop marker.

    Args:
        paragraphs: Iterable of element objects exposing ``itertext()``.
        stop_marker: Paragraph text that ends the chapter body.  The
            comparison ignores surrounding whitespace and treats ASCII and
            full-width colons as equal.
        separator: String placed between paragraphs.  The default ``""``
            runs the paragraphs together.

    Returns:
        The joined text of all non-empty paragraphs before the marker.
    """
    wanted = normalize_marker(stop_marker)
    collected = []
    for paragraph in paragraphs:
        # itertext() also yields text that follows inline child elements,
        # which the plain ``.text`` attribute would silently drop.
        text = "".join(paragraph.itertext())
        if normalize_marker(text) == wanted:
            break
        if text:
            collected.append(text)
    return separator.join(collected)


def build_markdown(index_tree, base_url=INDEX_URL):
    """Assemble the Markdown document from the parsed index page.

    Every chapter link contributes a ``####`` heading followed by the
    chapter body; a ``##`` volume heading is emitted the first time each
    volume name is seen.

    Args:
        index_tree: Parsed index page (an lxml element).
        base_url: URL used to resolve relative chapter links.

    Returns:
        The complete Markdown text.
    """
    pieces = []
    # Volume names that already received a "##" heading.
    seen_volumes = set()
    for item in index_tree.xpath(CHAPTER_ITEM_XPATH):
        links = item.xpath('./a')
        if not links:
            # List entry without a link: nothing to title or download.
            continue
        link = links[0]
        volume, chapter = parse_title(link.text)
        if volume is not None and volume not in seen_volumes:
            seen_volumes.add(volume)
            pieces.append("\n## " + volume + "\n")
        if chapter:
            pieces.append("\n#### " + chapter + "\n")

        href = link.get('href')
        if not href:
            continue
        try:
            # urljoin leaves absolute links untouched and resolves
            # relative ones against the index page.
            chapter_tree = fetch_html(urljoin(base_url, href))
        except requests.RequestException as error:
            # Best effort: keep the headings and move on to the next one.
            print(f"Skipping body of {chapter!r}: {error}")
            continue
        if chapter_tree is None:
            continue
        pieces.append(extract_body_text(chapter_tree.xpath(PARAGRAPH_XPATH)))
    return "".join(pieces)


def main():
    """Scrape the whole book and save it as a Markdown file."""
    index_tree = fetch_html(INDEX_URL)
    markdown = build_markdown(index_tree)
    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as file:
        file.write(markdown)
    print(f"文件已保存为 {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()