# stockstar: hands-on code example
import urllib2
import lxml
import lxml.etree
def parse(url):
    """Download *url* and print each text node found in the table rows.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    with ``lxml.etree.HTML``, and every text node below the ``<tr>`` elements
    of ``<tbody class="tbody_right">`` is printed on its own line.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted text is written to standard output.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request cannot be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # Single-quoted Python string so the double-quoted attribute value inside
    # the XPath expression does not terminate the literal.
    stocklist = html.xpath('//tbody[@class="tbody_right"]//tr//text()')
    for linedata in stocklist:
        print(linedata)


parse("https://quote.stockstar.com/fund/open.shtml")
# haoduanzi: hands-on code example
import urllib2
import lxml
import lxml.etree
def parse(url):
    """Download *url* and print the joined text of every content block.

    The page is requested with a browser-like ``User-Agent`` header, parsed
    with ``lxml.etree.HTML``, and all text nodes below
    ``<div class="content">`` inside the ``<li>`` items of
    ``<ul class="list-box">`` are concatenated without a separator and
    printed once.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted text is written to standard output.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
            request cannot be completed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # Single-quoted Python string so the double-quoted attribute values inside
    # the XPath expression do not terminate the literal.
    stocklist = html.xpath('//ul[@class="list-box"]//li//div[@class="content"]//text()')
    str_r = "".join(stocklist)
    print(str_r)


parse("http://www.haoduanzi.com/wen/")
# Using the parent node
from lxml import etree
# Parse the local HTML file, then step from the matching <a> element up to its
# parent with ".." and read that parent's class attribute.
html_parser = etree.HTMLParser()
html = etree.parse('./test.html', html_parser)
result = html.xpath(
    '//a[@href="link4.html"]/../@class'
)
print(result)
"""
Alternative approach:
use the parent:: axis to select the parent element
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
"""