xpath的使用(四)

168 阅读2分钟

属性多值contains的使用

contains的使用:适用于某个标签的属性同时含有多个值的情况。此时如果仍然使用之前的相等匹配(例如 @class="li"),是匹配不到结果的,需要改用 contains() 函数,只要属性值中包含指定的字符串即可匹配。

from lxml import etree

# The class attribute below holds two space-separated values ("li" and
# "li-first"), so an exact comparison such as @class="li" would not match.
markup = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
tree = etree.HTML(markup)
# contains(@class, "li") is true whenever "li" occurs inside the attribute.
link_texts = tree.xpath('//li[contains(@class, "li")]/a/text()')
print(link_texts)

多个属性的and连接查找

多个属性的匹配:当需要同时根据多个属性来确定一个节点时,可以使用xpath的运算符 and 把多个条件连接起来,只有全部条件都满足的节点才会被选中。

from lxml import etree

# The <li> carries both a multi-valued class attribute and a name attribute.
markup = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
tree = etree.HTML(markup)
# "and" is an XPath operator: both conditions in the predicate must hold.
query = '//li[contains(@class, "li") and @name="item"]/a/text()'
print(tree.xpath(query))

轴的使用

xpath中轴(axis)的使用:轴用来指定相对于当前节点的查找方向,写法为“轴名::节点测试”,例如 ancestor::div 表示当前节点的所有div祖先节点。

from lxml import etree

# Sample document shared by every axis query below. Note that the last <li>
# is not closed in this source text.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
# ancestor::* -- every ancestor element of the first <li>.
result = html.xpath('//li[1]/ancestor::*')
print(result)
"""
第一个li的所有祖先
[<Element html at 0x2b3be050dc8>, <Element body at 0x2b3be181b88>, <Element div at 0x2b3be181bc8>, <Element ul at 0x2b3be181c08>]
"""

# ancestor::div -- only the ancestors of the first <li> that are <div> elements.
result = html.xpath('//li[1]/ancestor::div')
print(result)
"""
第一个li的所有div祖先
[<Element div at 0x2b3be181bc8>]
"""

# attribute::* -- the values of all attributes on the first <li>.
result = html.xpath('//li[1]/attribute::*')
print(result)
"""
第一个li的所有属性的值
['item-0']
"""

# child::a[...] -- direct <a> children of the first <li> whose href equals
# "link1.html".
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
"""
第一个li的直接子节点中标签为a且href属性值为link1.html的元素
[<Element a at 0x2b3be181b88>]
"""

# descendant::span -- <span> elements at any depth below the first <li>.
result = html.xpath('//li[1]/descendant::span')
print(result)
"""
第一个li的所有子孙节点中标签为span的元素
[<Element span at 0x2b3be181bc8>]
"""

# following::*[2] -- the following axis covers the nodes after the first <li>
# in document order; the [2] index keeps only the second of them (an <a>
# element in the recorded output).
result = html.xpath('//li[1]/following::*[2]')
print(result)
"""
第一个li的following轴,可以获取当前节点之后的所有节点,这里我们虽然使用的是*匹配,但又加了索引选择,所以只获取了第二个后续节点
[<Element a at 0x2b3be181c08>]
"""

# following-sibling::* -- only the siblings that come AFTER the first <li>
# (the four later <li> elements in the recorded output); siblings before the
# current node are not on this axis.
result = html.xpath('//li[1]/following-sibling::*')
print(result)
"""
第一个li的following-sibling轴,可以获取当前节点的所有兄弟节点
[<Element li at 0x29526521e48>, <Element li at 0x29526521d48>, <Element li at 0x29526521a88>, <Element li at 0x29526521c48>]
"""

综合案例

"""
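Runtime environment: Python 2.
This example requests its pages over HTTPS:
    1. For this particular site HTTPS makes no difference; the pages can be
       fetched as they are.
    2. If an HTTPS page cannot be fetched (e.g. because certificate
       verification fails), verification can be skipped by building an
       unverified SSL context and passing it to urlopen:
    import ssl
    context = ssl._create_unverified_context()
    response = urllib2.urlopen(request, context=context)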

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import lxml.etree
import lxml
import urllib2
import urllib

def makeurllist():
    """Return the list-page URLs to crawl, numbered 1 through 514."""
    prefix = "https://www.jb51.net/list/list_97_"
    suffix = ".htm"
    pages = []
    for page_no in range(1, 515):
        pages.append(prefix + str(page_no) + suffix)
    return pages

def parseurl(url):
    """Download one list page and pair each article title with its URL.

    Args:
        url: Address of the list page to fetch.

    Returns:
        The result of zip() over the extracted title attributes and the
        detail URLs, where each detail URL is the extracted href prefixed
        with "https://www.jb51.net".
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # The XPath expressions are single-quoted so that the double-quoted
    # attribute value inside them does not terminate the Python string
    # (the nested double quotes were a SyntaxError).
    title = html.xpath('//div[@class="artlist clearfix"]//dl//dt//a//@title')
    detail_url_list = html.xpath('//div[@class="artlist clearfix"]//dl//dt//a//@href')
    # The hrefs are prefixed with the site root to form absolute URLs.
    new_detail_url_list = ["https://www.jb51.net" + detail_url for detail_url in detail_url_list]
    return zip(title, new_detail_url_list)

if __name__ == '__main__':
    # Crawl every list page and write one "title<TAB><TAB>url" line per
    # article to the output file.
    # NOTE(review): under Python 2 the non-ASCII literal printed below needs
    # a source encoding declaration at the top of the file - confirm it is
    # present in the real script.
    filepath = "./jiaobenzhijia.txt"
    # "with" closes the file even if a request or parse step raises midway.
    with open(filepath, "w+") as j_file:
        for url in makeurllist():
            for title, de_url in parseurl(url):
                j_file.write(title + "\t\t" + de_url + "\n")
            print(url + "已经抓取完毕")