xpath的使用(二)

392 阅读1分钟

强化数据提取代码

  • 巩固前面的提取方式:最常见的两种
    1. 一是对属性的提取 //a/@href
    2. 二是对文本的提取 //div[@class="haha"]/a/text()
#coding:utf-8
import lxml
import lxml.etree

# Sample HTML document used by the XPath examples that follow. It contains
# a <title>, two <ul> lists (id="useful" and id="useless") and a
# <div id="url"> holding two <a> links, one of which has a title attribute.
html=u'''
<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title>12345</title>
</head>
<body>
<div id="content">
    <ul id="useful">
        <li text="11">a1</li>
        <li text="12">a2</li>
        <li text="13">a3</li>
    </ul>
    <ul id="useless">
        <li>1</li>
        <li>2</li>
        <li>3</li>
    </ul>

    <div id="url">
        <a href="http://51job.com">zhiwei</a>
        <a href="http://51job.com/course/" title="python">clickit</a>
    </div>
</div>

</body>
</html>
'''

# Build an element tree from the sample HTML string defined above.
mytree = lxml.etree.HTML(html)

# The XPath expressions are wrapped in single quotes so that the
# double-quoted attribute values inside the predicates do not terminate
# the Python string literal (the original nesting was a SyntaxError).
print(mytree.xpath('//title/text()'))               # text of <title>
print(mytree.xpath('//*[@id="useful"]/li/text()'))  # <li> texts under id="useful"
print(mytree.xpath('//*[@id="useless"]/li/text()')) # <li> texts under id="useless"
print(mytree.xpath('//*[@id="url"]/a/@href'))       # href attribute of each link
print(mytree.xpath('//*[@id="url"]/a/@title'))      # title attribute, where present
print(mytree.xpath('//*[@id="url"]/a/text()'))      # visible text of each link

结合urllib2完成网页数据的提取

xpath结合urllib2进行网页数据提取实战

import lxml
import lxml.etree

import urllib2
import urllib

def download(url):
    """Fetch *url* and print the text nodes of elements with class "emphasis".

    The page is requested with a browser-like User-Agent header, parsed with
    lxml, and the list returned by the XPath query is printed as-is.

    Args:
        url: Address of the page to download.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # Single-quoted literal: the predicate itself needs double quotes, and
    # nesting them inside a double-quoted string is a SyntaxError.
    e_html = html.xpath('//*[@class="emphasis"]/text()')
    print(e_html)

# Run the example against an Autohome page; download() prints the matched text nodes.
download("https://www.autohome.com.cn/166/#pvareaid=311284")

请求的中文细节问题

python3 版本

"""
在这里我们解析得到的数据中中文无法正常的显示,主要是python2的原因,但是我们可以使用python3的方式进行处理
使用python3进行处理需要注意(实战遇坑):
    1.import urllib.request
    2.urllib.request.urlopen()本身没有headers参数;如需自定义请求头,应先构造urllib.request.Request(url, headers=headers),再把该Request对象传给urlopen()

除了使用python3,进行解决 我们还可以使用python2的解码方式进行问题的解决
"""

import lxml
import lxml.etree
import urllib.request

def parse(url):
    """Fetch *url* and print the first text node of the "rt" result div.

    The page is downloaded with urllib.request, parsed with lxml, and the
    first text node matching div[@class="dw_tlc"]/div[@class="rt"] is
    printed with surrounding whitespace stripped.

    Args:
        url: Address of the search-result page to download.

    Raises:
        IndexError: If the page contains no matching text node.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    html = lxml.etree.HTML(data)
    # Single-quoted literal: the predicates need double quotes, and nesting
    # them inside a double-quoted string is a SyntaxError.
    matches = html.xpath('//div[@class="dw_tlc"]/div[@class="rt"]/text()')
    if not matches:
        # Same exception type a bare [0] would raise, with a usable message.
        raise IndexError("no matching text node found at %s" % url)
    print(matches[0].strip())
    
# Run the example against a 51job search-result page for "python".
parse("https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=")

python2 版本

import lxml
import lxml.etree
import urllib2

def parse(url):
    """Fetch *url* (Python 2, urllib2) and print the first "rt" div text node.

    The page is requested with a browser-like User-Agent header, parsed with
    lxml, and the first text node matching
    div[@class="dw_tlc"]/div[@class="rt"] is printed with surrounding
    whitespace stripped.

    Args:
        url: Address of the search-result page to download.

    Raises:
        IndexError: If the page contains no matching text node.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    html = lxml.etree.HTML(data)
    # Single-quoted literal: the predicates need double quotes, and nesting
    # them inside a double-quoted string is a SyntaxError.
    matches = html.xpath('//div[@class="dw_tlc"]/div[@class="rt"]/text()')
    if not matches:
        # Same exception type a bare [0] would raise, with a usable message.
        raise IndexError("no matching text node found at %s" % url)
    print(matches[0].strip())

# Run the Python 2 example against the same 51job search-result page.
parse("https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=")