属性多值匹配:contains 的使用
contains 的使用:适用于一个标签的某个属性有多个值的情况。此时如果还是用之前的相等匹配(如 @class="li"),是匹配不到结果的,需要改用 contains(@class, "li")
from lxml import etree

# Sample markup: the class attribute holds two space-separated values.
markup = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
# contains(@class, "li") tests whether the attribute value contains "li",
# so it still matches when the attribute carries more than one value.
class_query = '//li[contains(@class, "li")]/a/text()'
link_texts = etree.HTML(markup).xpath(class_query)
print(link_texts)
多属性匹配:用 and 连接多个条件
多属性匹配:需要同时根据多个属性来确定一个节点时,用 XPath 的运算符 and 把多个条件连接起来
from lxml import etree

# Sample element with two attributes: a multi-valued class and a name.
sample = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
tree = etree.HTML(sample)
# Both predicates must hold: class contains "li" AND name equals "item".
combined_query = '//li[contains(@class, "li") and @name="item"]/a/text()'
print(tree.xpath(combined_query))
轴的使用
XPath 中轴(axis)的使用:用“轴名::节点测试”的写法,选取当前节点的祖先、属性、子节点、子孙节点、后续节点、后续兄弟节点等
from lxml import etree

# Demonstrates the XPath axes (ancestor, attribute, child, descendant,
# following, following-sibling) on a small list of links.
# NOTE(review): the fifth <li> has no closing </li>; presumably left that way
# on purpose because the HTML parser tolerates it -- confirm.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)

# ancestor::* -- every ancestor of the first <li>.
# Sample output:
# [<Element html at 0x2b3be050dc8>, <Element body at 0x2b3be181b88>, <Element div at 0x2b3be181bc8>, <Element ul at 0x2b3be181c08>]
result = html.xpath('//li[1]/ancestor::*')
print(result)

# ancestor::div -- only the <div> ancestors of the first <li>.
# Sample output:
# [<Element div at 0x2b3be181bc8>]
result = html.xpath('//li[1]/attribute::*')  if False else html.xpath('//li[1]/ancestor::div')
print(result)

# attribute::* -- the values of all attributes of the first <li>.
# Sample output:
# ['item-0']
result = html.xpath('//li[1]/attribute::*')
print(result)

# child::a[@href="link1.html"] -- direct <a> children of the first <li>
# whose href attribute equals "link1.html".
# Sample output:
# [<Element a at 0x2b3be181b88>]
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)

# descendant::span -- <span> elements anywhere below the first <li>.
# Sample output:
# [<Element span at 0x2b3be181bc8>]
result = html.xpath('//li[1]/descendant::span')
print(result)

# following::*[2] -- the following axis covers every node after the current
# one; "*" matches them all, and the [2] index keeps only the second one.
# Sample output:
# [<Element a at 0x2b3be181c08>]
result = html.xpath('//li[1]/following::*[2]')
print(result)

# following-sibling::* -- the siblings that come AFTER the first <li>
# (siblings before the current node are not included).
# Sample output:
# [<Element li at 0x29526521e48>, <Element li at 0x29526521d48>, <Element li at 0x29526521a88>, <Element li at 0x29526521c48>]
result = html.xpath('//li[1]/following-sibling::*')
print(result)
综合案例
"""
运行环境python2
这里请求的是 https 地址:
1. 对本例要抓取的页面没有任何影响,可以直接请求
2. 如果抓取不到 https 的数据,解决办法是让 ssl 忽略证书校验(创建下面的 context,并在 urlopen 时通过 context 参数传入):
import ssl
context = ssl._create_unverified_context()
"""
# Python 2-only setup: `reload` is a builtin and `urllib2` a stdlib module
# only in Python 2, so this script does not run unchanged on Python 3.
import sys
# Reload sys, then set the interpreter-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
import lxml.etree
import lxml
import urllib2
# NOTE(review): `urllib` is not referenced in the code visible here -- confirm
# whether it is still needed.
import urllib
def makeurllist(page_count=514):
    """Build the list-page URLs to crawl.

    Args:
        page_count: Number of list pages to generate, numbered from 1.
            Defaults to 514, the value that was previously hard-coded.

    Returns:
        A list of strings "https://www.jb51.net/list/list_97_<n>.htm" for
        n = 1 .. page_count (empty when page_count is 0).
    """
    return [
        "https://www.jb51.net/list/list_97_%d.htm" % page
        for page in range(1, page_count + 1)
    ]
def parseurl(url):
    """Fetch one list page and pair each article title with its detail URL.

    Args:
        url: Address of the list page to download.

    Returns:
        The result of zip(titles, detail_urls), where titles are the
        ``title`` attribute values and detail_urls are the ``href`` values
        prefixed with "https://www.jb51.net", both selected under
        div[@class="artlist clearfix"].
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Close the response even if read() raises.
        response.close()
    html = lxml.etree.HTML(data)
    # The Python strings are single-quoted so the XPath predicate can use
    # double quotes; nesting double quotes was a syntax error.
    title = html.xpath('//div[@class="artlist clearfix"]//dl//dt//a//@title')
    detail_url_list = html.xpath('//div[@class="artlist clearfix"]//dl//dt//a//@href')
    new_detail_url_list = ["https://www.jb51.net" + detail_url for detail_url in detail_url_list]
    return zip(title, new_detail_url_list)
if __name__ == '__main__':
    # Crawl every list page and write one "title<TAB><TAB>detail-url" line
    # per article to the output file.
    filepath = "./jiaobenzhijia.txt"
    urllist = makeurllist()
    # `with` closes the file even if a page fetch raises midway.
    with open(filepath, "w+") as j_file:
        for url in urllist:
            # Named `pairs` so the builtin all() is not shadowed.
            pairs = parseurl(url)
            for title, de_url in pairs:
                new_str = title + "\t\t" + de_url + "\n"
                j_file.write(new_str)
            print(url + "已经抓取完毕")