xpath的使用(一)

257 阅读2分钟

读取html字符串

主要是将html_str构造成一个可以使用XPath解析的Element对象

import lxml
import lxml.etree

# Sample HTML fragment used by this snippet. The note embedded after the
# fifth link is part of the string itself and is parsed as page text.
text = """
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a> # 注意,此处缺少一个 </li> 闭合标签
     </ul>
 </div>
"""

html = lxml.etree.HTML(text)    # parse the HTML string into an element
print(type(html))
print(html)

"""
<type 'lxml.etree._Element'>
<Element html at 0x134072c8>
"""

# Serialize the parsed element back to markup.
print(lxml.etree.tostring(html))

数据展示如下:

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a> # &#230;&#179;&#168;&#230;&#132;&#143;&#239;&#188;&#140;&#230;&#173;&#164;&#229;&#164;&#132;&#231;&#188;&#186;&#229;&#176;&#145;&#228;&#184;&#128;&#228;&#184;&#170; </li> &#233;&#151;&#173;&#229;&#144;&#136;&#230;&#160;&#135;&#231;&#173;&#190;
     </ul>
 </div>
</body></html>

从文件中读取html字符串

import lxml

"""
lxml.etree.HTML()   处理文本字符串
lxml.etree.parse()   处理的是文件内容
"""

import lxml.etree

html = lxml.etree.parse("1.html")  # parse a file; no explicit parser is passed here
print(html)
print(type(html))
print(lxml.etree.tostring(html))

"""
报错:
lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: meta line 4 and head, line 6, column 8
这个主要是标签不匹配的原因,将html中的meta标签去掉即可
"""

"""
知识点:lxml.etree.parse(html_file_path,解析器),使用tostring()得到的数据是bytes类型的,decode解码查看
from lxml import etree
 
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

"""

各种标签元素的提取 (重点)

import lxml
import lxml.etree

# Parse the sample file; every query below runs against this tree.
html = lxml.etree.parse("1.html")
print(type(html))

res = html.xpath("//li")  # a list containing every matched <li> element
print(res)
"""
[<Element li at 0x1359f248>, <Element li at 0x1359f208>, <Element li at 0x1359cc08>, <Element li at 0x1359c208>, <Element li at 0x1359ac88>]
"""

print(type(res[0]))
"""
<type 'lxml.etree._Element'>
"""

print(html.xpath("//li/@class"))  # class attribute value of every <li> node
"""
将所有li节点的class属性全部提取了出来
·若其中有的属性不存在,则跳过不显示
['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
"""

# href attribute of every <a> that is a direct child of an <li>.
print(html.xpath("//li/a/@href"))
"""
['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
"""

print(html.xpath("//li/a"))  # one <a> element per <li> (five in this sample)
# Single-quote the Python string so the XPath string literal can keep its
# double quotes. The comparison is true when any matched href equals
# "link3.html".
print(html.xpath('//li/a/@href="link3.html"'))
"""
True
判断匹配出来的元素中是否有与"link3.html"相匹配的元素,若有则返回True
"""

# "//@class" after "//li" also picks up class attributes on descendants of
# each <li>, not only on the <li> itself.
print(html.xpath("//li//@class"))
"""
会将li下面的所有的class属性全部抓取出来
['item-0', 'hhh', 'item-1', 'item-inactive', 'item-1', 'item-0']
"""

print(html.xpath("//li"))  # every <li> element
print(html.xpath("//li[1]"))  # <li> that is first within its parent
print(html.xpath("//li[last()]"))  # <li> that is last within its parent
print(html.xpath("//li[last()-1]"))  # <li> that is second to last within its parent
"""
第一个/最后一个
[<Element li at 0x1368a2c8>, <Element li at 0x1368a288>, <Element li at 0x13687c88>, <Element li at 0x13687288>, <Element li at 0x13685d08>]
[<Element li at 0x1368a2c8>]
[<Element li at 0x13685d08>]
[<Element li at 0x137ad288>]
"""

# href of the link inside the second-to-last <li>.
print(html.xpath("//li[last()-1]/a/@href"))
"""
['link4.html']
"""

# Single-quote the Python string so the XPath string literal can keep its
# double quotes.
print(html.xpath('//*[@href="link4.html"]'))  # any element whose href is "link4.html"
"""
[<Element a at 0x12f3d948>]
"""

print(html.xpath('//*[@href="link4.html"]/text()'))  # text() yields the text between the tags
"""
['fourth item']
"""