# coding=utf-8
from lxml import etree
# Sample HTML document queried by the XPath examples in this script:
# an <h3> heading followed by a table with six <th> header cells and two
# body rows of six <td> cells each.
text = """<html><body>
<h3 align='left'>Top 100排名</h3>
<table>
<thead>
<th class='info'><a><span>排名</span></a></th>
<th class='info'>id</th>
<th class='info'>姓名</th>
<th class='info'>点击数</th>
<th class='info'>电影数量</th>
<th class='info'>封面图片</th>
</thead>
<tbody>
<tr>
<td>1</td>
<td>63</td>
<td class="td td-1" name='item'>昨夜清晨</td>
<td>163458</td>
<td>3</td>
<td>就是这么帅</td>
</tr>
<tr>
<td>2</td>
<td>38</td>
<td>小黄人</td>
<td>114285</td>
<td>3</td>
<td class='123'><a href='相当的哇塞'>小黄人.jpg</a></td>
</tr>
</tbody>
</table>
</body></html>"""
# Approach 1: build the element tree straight from the HTML string with
# etree.HTML, then run XPath queries against the resulting root element.
html = etree.HTML(text)
heading = html.xpath("//h3/text()")
print(heading)

# Approach 2 (not executed here): parse the markup from a file instead.
#   html = etree.parse('./test.html', etree.HTMLParser())
#   print(html.xpath("//h3/text()"))

# Text nodes held directly by the selected elements (every <th>).
ths = html.xpath("//th/text()")
print(ths)

# Text of child nodes: <a> elements that are direct children of a <td>.
link_texts = html.xpath("//td/a/text()")
print(link_texts)

# Attribute of the parent node, reached either with ".." or with the
# parent:: axis. The predicate a[@href='...'] keeps only <a> elements whose
# href attribute equals the given value.
via_dotdot = html.xpath("//a[@href='相当的哇塞']/../@class")
via_parent_axis = html.xpath("//a[@href='相当的哇塞']/parent::*/@class")
print(via_dotdot, via_parent_axis)

# "@name" selects an attribute value.
hrefs = html.xpath("//td/a/@href")
print(hrefs)

# Multi-valued attribute: <td class="td td-1" name="item">昨夜清晨</td>.
# @class='td' compares against the whole attribute value ("td td-1"), so it
# matches nothing; contains() is needed. Note contains() is a substring test.
exact_class = html.xpath("//td[@class='td']/text()")
partial_class = html.xpath("//td[contains(@class, 'td')]/text()")
print(exact_class, partial_class)

# Several attributes at once: combine the class and name tests with "and".
class_and_name = html.xpath("//td[contains(@class, 'td') and @name='item']/text()")
print(class_and_name)

# Selection by position, evaluated and printed in this order:
#   th[1]            - the first <th> among its siblings; its text lives
#                      inside <a><span>, so text() finds no direct text node
#   th[last()]       - the last <th>
#   th[position()<4] - the <th> nodes at positions 1, 2 and 3
#   th[last()-3]     - the fourth <th> counting from the end
positional_queries = (
    "//th[1]/text()",
    "//th[last()]/text()",
    "//th[position()<4]/text()",
    "//th[last()-3]/text()",
)
print(*[html.xpath(query) for query in positional_queries])

# Axis selection: child::* selects all direct child elements; the "*" could
# equally be written as "a" here.
span_text = html.xpath("//th[1]/child::*/span/text()")
print(span_text)
# Printed output of the script above:
# ['Top 100排名']
# ['id', '姓名', '点击数', '电影数量', '封面图片']
# ['小黄人.jpg']
# ['123'] ['123']
# ['相当的哇塞']
# [] ['昨夜清晨']
# ['昨夜清晨']
# [] ['封面图片'] ['id', '姓名'] ['姓名']
# ['排名']