将 HTML 转为 Markdown 格式

92 阅读1分钟

来源网址:国学梦(https://www.guoxuemeng.com/)

from urllib.parse import urljoin

import requests
from lxml import etree

# Scrape a book from guoxuemeng.com and save it as a single Markdown file.
#
# The index page lists one link per chapter.  Each link's text is split on
# "·"; the second part becomes a "##" heading (written once per distinct
# value) and the third part a "####" heading, followed by the chapter text.

# Index page listing every chapter of the book.
INDEX_URL = 'https://www.guoxuemeng.com/guoxue/zhenjiudacheng/'
# <li> elements on the index page that wrap the chapter links.
CHAPTER_ITEMS_XPATH = '//html/body/div[3]/div[2]/div[5]/ul/li'
# <p> elements on a chapter page that hold the chapter text.
PARAGRAPHS_XPATH = '/html/body/div[3]/div[2]/div[2]/p'
# A paragraph starting with one of these markers ends the chapter body.
# Both the ASCII and the full-width colon are accepted.
END_MARKERS = ("关键词:针灸大成", "关键词:针灸大成")
OUTPUT_FILENAME = "a.md"
# Seconds to wait on each HTTP request before giving up.
REQUEST_TIMEOUT = 15


def split_title(link_text):
    """Split a chapter link's text into ``(volume, chapter)`` headings.

    The text is split on "·" into at most three parts.  The first part is
    ignored (presumably the book name -- confirm against the site), the
    second is the volume heading and everything after it is the chapter
    heading, so a chapter title that itself contains "·" is kept whole.

    Args:
        link_text: Visible text of the chapter link.

    Returns:
        A ``(volume, chapter)`` tuple.  When the text has fewer than three
        parts, ``volume`` is ``None`` and ``chapter`` is the whole stripped
        text, so the chapter still gets a heading instead of raising.
    """
    parts = link_text.split("·", 2)
    if len(parts) < 3:
        return None, link_text.strip()
    return parts[1].strip(), parts[2].strip()


def collect_body(paragraph_texts):
    """Join paragraph texts into one Markdown body.

    Args:
        paragraph_texts: Iterable of paragraph strings in document order.

    Returns:
        The non-empty paragraphs, stripped and separated by blank lines.
        Collection stops at the first paragraph that starts with one of
        ``END_MARKERS``; that paragraph and everything after it is dropped.
    """
    kept = []
    for raw in paragraph_texts:
        text = raw.strip()
        if text.startswith(END_MARKERS):
            break
        if text:
            kept.append(text)
    # A blank line between paragraphs keeps them separate in Markdown.
    return "\n\n".join(kept)


def fetch_html(session, page_url):
    """Download ``page_url`` and return it parsed by ``lxml.etree.HTML``.

    Args:
        session: ``requests.Session`` used to perform the request.
        page_url: Absolute URL of the page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = session.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def main():
    """Fetch the index and every chapter, then write ``OUTPUT_FILENAME``."""
    pieces = []
    # Volume headings already written, so each appears only once.
    seen_volumes = set()

    with requests.Session() as session:
        index = fetch_html(session, INDEX_URL)

        for item in index.xpath(CHAPTER_ITEMS_XPATH):
            links = item.xpath('./a')
            if not links:
                continue
            link = links[0]
            href = link.get('href')
            # itertext() also picks up text nested in inline child tags.
            link_text = "".join(link.itertext()).strip()
            if not href or not link_text:
                continue

            volume, chapter = split_title(link_text)
            if volume is not None and volume not in seen_volumes:
                seen_volumes.add(volume)
                pieces.append("\n## " + volume + "\n")
            pieces.append("\n#### " + chapter + "\n")

            # Resolve relative links against the index page.
            chapter_url = urljoin(INDEX_URL, href)
            try:
                page = fetch_html(session, chapter_url)
            except requests.RequestException as exc:
                # Keep the heading and carry on, so one failed chapter does
                # not discard everything fetched so far.
                print(f"警告: 无法获取 {chapter_url}: {exc}")
                continue

            body = collect_body(
                "".join(p.itertext()) for p in page.xpath(PARAGRAPHS_XPATH)
            )
            if body:
                pieces.append(body + "\n")

    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as file:
        file.write("".join(pieces))

    print(f"文件已保存为 {OUTPUT_FILENAME}")


if __name__ == "__main__":
    main()