xpath的使用(五)

132 阅读1分钟
# coding=utf-8
from lxml import etree

# Sample HTML document that every XPath query below runs against: an <h3>
# heading followed by a table with six <th> header cells and two data rows.
# Only the first <th> wraps its label in <a><span>; the other five hold their
# text directly. One <td> carries a two-token class ("td td-1") plus a name
# attribute, and the last <td> of the second row contains an <a href=...>.
text = """<html><body>
    <h3 align='left'>Top 100排名</h3>
        <table>
            <thead>
            <th class='info'><a><span>排名</span></a></th>
            <th class='info'>id</th>
            <th class='info'>姓名</th>
            <th class='info'>点击数</th>
            <th class='info'>电影数量</th>
            <th class='info'>封面图片</th>
            </thead>
            <tbody>
            <tr>
                <td>1</td>
                <td>63</td>
                <td class="td td-1" name='item'>昨夜清晨</td>
                <td>163458</td>
                <td>3</td>
                <td>就是这么帅</td>
            </tr>
            <tr>
                <td>2</td>
                <td>38</td>
                <td>小黄人</td>
                <td>114285</td>
                <td>3</td>
                <td class='123'><a href='相当的哇塞'>小黄人.jpg</a></td>
            </tr>
            </tbody>
            </table>
            </body></html>"""

# Approach 1: build the element tree straight from the HTML string with
# etree.HTML, then query it with XPath. Here: the text of every <h3>.
html = etree.HTML(text)
heading_text_xpath = '//h3/text()'
result = html.xpath(heading_text_xpath)
print(result)

# Approach 2 (kept for reference, not executed): parse the markup from a file
# instead of a string and run the same query on the resulting tree.
#     html = etree.parse('./test.html', etree.HTMLParser())
#     result = html.xpath("//h3/text()")
#     print(result)

# Text nodes that are direct children of any <th>. A <th> whose label is
# nested inside child elements contributes nothing to this result.
header_text_xpath = "//th/text()"
ths = html.xpath(header_text_xpath)
print(ths)

# Text of child nodes: step from <td> down to its <a> child and take the text.
link_text_xpath = "//td/a/text()"
result = html.xpath(link_text_xpath)
print(result)

# Attribute of the parent node: reach the parent with ".." or with the
# explicit parent:: axis. The predicate a[@href='...'] picks the <a> element
# whose href attribute equals the given value.
parent_class_xpaths = (
    "//a[@href='相当的哇塞']/../@class",
    "//a[@href='相当的哇塞']/parent::*/@class",
)
result, result1 = (html.xpath(expr) for expr in parent_class_xpaths)
print(result, result1)

# "@" selects an attribute: collect the href value of every <a> inside a <td>.
href_xpath = "//td/a/@href"
result = html.xpath(href_xpath)
print(result)

# Matching an attribute that holds several values, e.g.
# <td class="td td-1" name="item">昨夜清晨</td>.
# An exact comparison (@class='td') finds nothing because the whole attribute
# value is "td td-1"; contains() succeeds because it is a substring test.
# NOTE: being a substring test, contains(@class, 'td') would also match any
# other class value that merely includes the characters "td".
exact_class_xpath = "//td[@class='td']/text()"
contains_class_xpath = "//td[contains(@class, 'td')]/text()"
result = html.xpath(exact_class_xpath)
result1 = html.xpath(contains_class_xpath)
print(result, result1)

# Matching on several attributes at once, e.g.
# <td class="td td-1" name="item">昨夜清晨</td>: combine the class test and
# the name test with "and" inside a single predicate.
class_and_name_xpath = "//td[contains(@class, 'td') and @name='item']/text()"
result = html.xpath(class_and_name_xpath)
print(result)

# Selecting by position. Positions are 1-based and counted among the <th>
# children of the same parent element.
positional_xpaths = [
    "//th[1]/text()",  # first <th>
    "//th[last()]/text()",  # last <th>
    "//th[position()<4]/text()",  # <th> at positions 1, 2 and 3
    "//th[last()-3]/text()",  # fourth <th> counting from the end
]
result, result1, result2, result3 = map(html.xpath, positional_xpaths)
print(result, result1, result2, result3)


# Axis selection: child::* steps to every direct child element of the first
# <th> (the "*" could equally be written as "a" here), then the query
# continues down to <span> and takes its text.
child_axis_xpath = "//th[1]/child::*/span/text()"
result = html.xpath(child_axis_xpath)
print(result)

上述代码的打印结果如下(每一行依次对应代码中的一条 print 语句):

['Top 100排名']
['id', '姓名', '点击数', '电影数量', '封面图片']
['小黄人.jpg']
['123'] ['123']
['相当的哇塞']
[] ['昨夜清晨']
['昨夜清晨']
[] ['封面图片'] ['id', '姓名'] ['姓名']
['排名']