xpath的使用(三)

328 阅读1分钟

stockstar代码实战

import urllib2
import lxml
import lxml.etree

def parse(url):
    """Download a page and print the text nodes of its ``tbody_right`` rows.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    with ``lxml.etree.HTML``, and every text node found beneath a ``<tr>``
    inside ``<tbody class="tbody_right">`` is printed on its own line.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted text is written to standard output.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # The expression is single-quoted so the double-quoted attribute value
    # inside it does not terminate the Python string (the original nesting of
    # double quotes was a SyntaxError).
    stocklist = html.xpath('//tbody[@class="tbody_right"]//tr//text()')
    for linedata in stocklist:
        print(linedata)

# Run the scraper on the stockstar page. This is a top-level call, so the
# network request happens as soon as the script is executed.
parse("https://quote.stockstar.com/fund/open.shtml")

haoduanzi代码实战

import urllib2
import lxml
import lxml.etree

def parse(url):
    """Download a page and print the joined text of its ``content`` blocks.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    with ``lxml.etree.HTML``, and every text node found beneath a
    ``<div class="content">`` inside an ``<li>`` of ``<ul class="list-box">``
    is concatenated (with no separator) and printed once.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted text is written to standard output.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # The expression is single-quoted so the double-quoted attribute values
    # inside it do not terminate the Python string (the original nesting of
    # double quotes was a SyntaxError).
    stocklist = html.xpath('//ul[@class="list-box"]//li//div[@class="content"]//text()')
    str_r = "".join(stocklist)
    print(str_r)

# Run the scraper on the haoduanzi page. This is a top-level call, so the
# network request happens as soon as the script is executed.
parse("http://www.haoduanzi.com/wen/")

父节点的使用

from lxml import etree

# Load the local test.html file using lxml's HTML parser.
html = etree.parse('./test.html', parser=etree.HTMLParser())

# Select the <a> elements whose href is "link4.html", step up to each one's
# parent element with "..", and read that parent's class attribute.
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)

"""
另一种方式
使用parent:: 对父元素进行提取
from lxml import etree
 
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
"""