urllib2的使用(二)

80 阅读1分钟

Request对象的方法

  • 使用Request对象的方法add_header,在请求头中增加信息
  • Request对象的其他常用方法: add_data(data)、add_header(key, value)、add_unredirected_header(key, value)、get_data()、get_full_url()、get_header()、get_host()、get_method()、get_origin_req_host()
import urllib2

def download(url):
    """Fetch ``url`` with a browser-like User-Agent and return the response body.

    Besides downloading, this demonstrates ``Request.add_header`` and
    ``Request.get_header``: the "Connection" header value and the HTTP status
    code are printed before the body is returned.

    :param url: URL to request.
    :return: whatever ``response.read()`` yields for the opened URL.

    Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    # Attach one more header after the Request has been constructed.
    request.add_header("Connection", "keep-alive")
    # get_header() only returns the value; print it so it is actually shown
    # (the bare call discarded the result and displayed nothing).
    print(request.get_header(header_name="Connection"))
    response = urllib2.urlopen(request)
    try:
        print(response.code)  # HTTP status code
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()

# Download the Lagou job-list page and print the raw body returned by download().
url = "https://www.lagou.com/jobs/list_python/p-city_2"
html = download(url)
print(html)

urlencode/urlparse编解码

  • 解决编码问题:
    • 有的浏览器不支持中文,必须要将中文转化成编码后的字符格式才能完成搜索功能
    • 统一规范,编码解码
# python2
import urllib
# Use a non-ASCII value: a plain ASCII value such as "waws520" passes through
# urlencode unchanged, so it would not demonstrate percent-encoding at all.
# The expected output below assumes this source file is saved as UTF-8.
word = {"kw":"王伟"}
print(urllib.urlencode(word))                  # kw=%E7%8E%8B%E4%BC%9F
print(urllib.unquote(urllib.urlencode(word)))  # kw=王伟

# python3
import urllib.parse
word = {"kw":"waws520"}
# Encode once, then show the encoded query string and its unquoted round trip.
encoded = urllib.parse.urlencode(word)
print(encoded)
print(urllib.parse.unquote(encoded))

urlencode/urlparse 路径参数的生成

下面的爬取网页代码演示了路径参数的详细用法,即 urllib.urlencode() 的使用:

import urllib
import urllib2

# Request Baidu with a Chinese keyword carried in the URL's query string.
url = "http://www.baidu.com/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}

# urlencode() turns the dict into a "key=value" string that is safe to append to a URL.
word = {"wd":"刘欢"}
kw = urllib.urlencode(word)
newurl = "%s?%s" % (url, kw)

request = urllib2.Request(newurl, headers=headers)
response = urllib2.urlopen(request)
print(response.read())

多参数的书写方式

import urllib2
import urllib

def download(url,addr,mytype):
    """Fetch ``url`` with the query parameters ``jl``, ``kw`` and ``kt=3``.

    The HTTP status code is printed before the body is returned.

    :param url: base URL; the query string is appended after a "?".
    :param addr: value sent as the ``jl`` parameter.
    :param mytype: value sent as the ``kw`` parameter.
    :return: whatever ``response.read()`` yields for the opened URL.

    Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    # Encode every parameter in a single call so each pair is joined with "&".
    # Hand-concatenating the pieces had dropped the "&" before "kt=3", which
    # glued it onto the kw value ("kw=pythonkt=3").
    # A list of pairs (rather than a dict) keeps the parameter order fixed.
    query = urllib.urlencode([("jl", addr), ("kw", mytype), ("kt", 3)])
    full_url = url + "?" + query

    request = urllib2.Request(full_url, headers=headers)
    request.add_header("Connection", "keep-alive")
    response = urllib2.urlopen(request)
    try:
        print(response.code)  # HTTP status code
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()

# Run the multi-parameter download against Zhaopin and print the returned body.
url = "https://sou.zhaopin.com/"
addr, mytype = "530", "python"
page = download(url, addr, mytype)
print(page)