Python 使用 lxml 解析 HTML，获取页面内所有叶子节点的 XPath 路径

81 阅读11分钟
from lxml import etree
import requests

def fetch_html(url, timeout=10):
    """
    Download a web page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The response body as bytes (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

def get_xpath(element, root, with_index=False):
    """
    Build the XPath of a single node, relative to ``root``.

    The path is assembled by walking from ``element`` up through its
    ancestors until ``root`` is reached; ``root`` itself is not part of the
    result, so ``get_xpath(root, root)`` returns ``'/'``.

    Args:
        element: Node whose path is wanted.
        root: Ancestor at which the walk stops.
        with_index: When true, every step gets a 1-based positional
            predicate counted among preceding siblings of the same tag
            (e.g. ``/body[1]/div[2]``). Defaults to false, which yields the
            plain tag path.

    Returns:
        A string starting with ``'/'`` whose steps are joined by ``'/'``.
    """
    # For comments and processing instructions ``.tag`` is not a string but
    # a factory function; map its name to the matching XPath node test so
    # the function's repr never leaks into the path.
    special_steps = {
        'Comment': 'comment()',
        'ProcessingInstruction': 'processing-instruction()',
    }
    components = []
    while element is not root:
        parent = element.getparent()
        if parent is None:
            # Ran out of ancestors without meeting ``root``.
            break
        tag = element.tag
        if isinstance(tag, str):
            step = tag
        else:
            # Any other non-string tag falls back to the generic node test.
            step = special_steps.get(getattr(tag, '__name__', ''), 'node()')
        if with_index:
            # XPath positions count only siblings matched by the same test.
            preceding = element.itersiblings(tag, preceding=True)
            position = 1 + sum(1 for _ in preceding)
            step = f'{step}[{position}]'
        components.append(step)
        element = parent
    components.reverse()
    return '/' + '/'.join(components)

def is_leaf(element):
    """
    Tell whether a node is a leaf, i.e. has no children.

    The check is based on ``len(element)``, so it returns ``True`` exactly
    when that length is zero.
    """
    child_count = len(element)
    return not child_count

def get_all_leaf_xpaths(element, root, leaf_xpaths):
    """
    Append the XPath of every leaf at or below ``element`` to ``leaf_xpaths``.

    Nodes are visited depth-first, children in order from first to last.
    Each leaf (as decided by ``is_leaf``) contributes the path computed by
    ``get_xpath(leaf, root)``. The list is modified in place and nothing is
    returned.
    """
    # Explicit stack instead of recursion; children are pushed in reverse
    # so that the first child is the next one popped.
    pending = [element]
    while pending:
        node = pending.pop()
        if is_leaf(node):
            leaf_xpaths.append(get_xpath(node, root))
            continue
        pending.extend(reversed(list(node)))

# URL of the target page
url = 'https://github.com/IEIT-Yuan/Yuan-2.0'  # replace with the page you want to crawl

# Download the page
html_content = fetch_html(url)

# Parse the HTML
parser = etree.HTMLParser()
tree = etree.HTML(html_content, parser)

# Collect the XPath of every leaf node
leaf_xpaths = []
root_element = tree.getroottree().getroot()
get_all_leaf_xpaths(root_element, root_element, leaf_xpaths)
# Remove duplicate paths while keeping first-seen (document) order; going
# through a set would make the output order vary from run to run.
leaf_xpaths = list(dict.fromkeys(leaf_xpaths))
# Print every distinct leaf XPath
for xpath in leaf_xpaths:
    print(xpath)

/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/div/div/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/template/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div/div/span/a/span/span/svg/path
/body/div/div/script
/body/div/include-fragment
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/p
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/div/div/div/h1
/body/div/div/div/main/div/div/div/details/details-dialog/a/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/button/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div/a/span/span/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div/a/span/span/span
/body/div/div/header/div/div/div/div/input
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/style
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/script
/body/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/div/div/div/h1
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/scrollable-region/div/form/label
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/h2
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/div/div
/body/div/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/div/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/template
/head/meta
/body/div/div/div/main/div/div/div/div/strong/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/h2/a
/body/div/div/div/main/div/div/div/ul/li/a/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/div/div/div/h2
/body/div/div/header/div/div/div/nav/ul/li/div/div/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/button
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/scrollable-region/div/form/input
/body/div/div/div/main/div/div/div/div/div/tool-tip
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/button/span/span/div/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/div/h2
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/table/tbody/tr/td/sup/br
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/p
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/h2/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/ul/li/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/h2
/body/div/div/div/main/div/div/div/ul/li/tool-tip
/body/div/div/react-partial/script
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/scrollable-region/div/form/p
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/div/a/span
/body/div/div/div/main/div/div/div/ul/li/div/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/details/summary/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/h2
/body/div/div/div/main/turbo-frame/div/div/div/div/div/script
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/button/span/span/svg/path
/body/div/div/div/main/div/div/div/details/details-dialog/div/h3
/body/div/div/template/div/div/button/svg/path
/body/div/div/div/main/div/nav/div/action-menu/focus-group/anchored-position/div/div/action-list/div/ul/li/a/span
/body/div/template/details/summary
/body/div/template/details/details-dialog/div
/body/div/div/div/main/div/div/div/div/span/a
/body/div/div/div/main/div/div/div/div/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ul/li/ul/li/ul/li/ul/li/a
/head/title
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/a/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ul/li/strong
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/div/main/turbo-frame/div/div/div/div/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ol/li/p
/body/div/div/header/div/div/div/a
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/scrollable-region/div/form/textarea
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/button/span/span/div/div/span/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/div/main/div/nav/ul/li/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/a/span/span
/body/div/div/div/main/div/div/div/ul/li
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/a/span/span/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/button/svg/path
/body/div/footer/h2
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ul/li/ul/li/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ul/li/ul/li/ul/li/a
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div
/body/div/div/div/main/div/div/div/details/details-dialog/a/div/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/h2
/body/div/div/div/main/div/nav/div/action-menu/focus-group/button/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/nav/ul/li/a/span/svg/path
/body/div/div/react-partial/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/input
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/thead/tr/th/div/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/details/details-dialog/div/button/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/dialog-helper/dialog/scrollable-region/div/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/template/div/div/div/div
/body/div/div/div/main/div/div/div/div/div/div/button/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/h2
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/ul
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/div/div/div/button/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/span
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/div/a
/body/div/div/div/main/div/div/div/p
/body/div/div/div/main/div/nav/div/action-menu/focus-group/tool-tip
/body/div/div/div/button/svg/path
/body/div/div/div/tool-tip
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/details/details-dialog/a/div/div
/body/div/div/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/hr
/body/div/div/header/div/div/div/nav/ul/li/button/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/ul/li/a/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/p/a/code
/body/div/footer/div/nav/ul/li/cookie-consent-link/button
/body/div/div/header/div/div/div/nav/ul/li/div/div/ul/li/a/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/div/button
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/span/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/input
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/include-fragment
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/h3
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/include-fragment/ul/li/div
/body/div/div/header/div/div/a/svg/path
/body/div/div/div/main/div/div/div/div/div/a/svg/path
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/div/label
/body/div/div/div/main/div/div/div/div/div/div/a/svg/path
/body/div/div/span/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/table/tbody/tr/td/sup/em
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/ul/li/a
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/div/input
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/h3/div/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/p/code
/body/div/div/div/main/div/div/div/h3
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/button/span/span/div/svg/path
/body/div/div/div/main/div/nav/ul/li/a/include-fragment
/body/div/div/div/main/div/div/div/div/div/div/a/span
/body/div/template/div/clipboard-copy/svg/path
/head/script
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/thead/tr/th/span
/body/div/div/header/div/div/div/div/div/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/button
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/div/main/div/div/div/div/a/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/button/span/svg/path
/body/div/div/div/main/div/div/div/ul/li/a/span
/body/div/footer/div/nav/h3
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/div/div/h2
/body/div/div/header/div/div/div/div/qbsearch-input/div/button/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/div/a/strong
/body/div/div/div/main/div/nav/div/action-menu/focus-group/anchored-position/div/div/action-list/div/ul/li/a/span/svg/path
/body/div/div/div/span/a
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/<cyfunction Comment at 0x7f24a808df30>
/body/div/div/div/main/div/div/div/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/div/pre/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/details/details-dialog/div/h3
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/div/auto-check/input
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/table/tbody/tr/td/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/p/strong
/body/div/div/div/svg/path
/body/div/footer/div/div/span
/body/div/div/div/main/div/div/div/div/span
/body/div/div/div/main/div/nav/ul/li/a/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/table/thead/tr/th
/body/div/div/header/div/div/div/div/qbsearch-input/div/div/modal-dialog/div/div/div/form/query-builder/div/div/span/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/table/tbody/tr/td/a
/body/div/div/header/button/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/button/span/span
/body/div/footer/div/nav/ul/li/a
/body/div/div/header/div/div/div/div/qbsearch-input/div/custom-scopes/dialog-helper/dialog/scrollable-region/div/div/form/p/a
/body/div/div/div/main/div/div/div/ul/li/div/a/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/div/a/svg/path
/body/div/div/div/main/turbo-frame/div/h1
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/nav/ul/li/a/span
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/div/h1
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/p/a/img
/body/div/div/header/div/div/div/button/span/span/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/div/div/div/div/br
/body/div/div/header/div/div/div/nav/ul/li/a
/body/div/div
/body/div/div/div
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/table/tbody/tr/td
/body/div/template/details/details-dialog/button/svg/path
/body/div/div/header/div/div/div/nav/ul/li/div/div/ul/li/a
/body/div/div/header/h2
/body/div/div/header/div/div/div/div/qbsearch-input/div/button/div/svg/path
/body/div/div/div/main/turbo-frame/div/div/div/div/div/react-partial/div/div/div/div/div/div/article/div/h4
# For every collected leaf path, look the node up again and print its text.
for xpath in leaf_xpaths:
    # The collected paths start below the <html> root, so prefix them with
    # "/html" to get an absolute expression.
    try:
        matches = tree.xpath("/html" + xpath)
    except etree.XPathError:
        # Some collected paths are not valid XPath (e.g. the ones containing
        # "<cyfunction Comment ...>"); skip those instead of hiding every
        # possible error behind a bare except.
        continue
    # Several nodes may share one path; only the first match is reported,
    # and only when it carries non-empty text.
    if matches and matches[0].text:
        print("xpath", xpath)
        print("text", matches[0].text)