Request对象的方法
- 使用Request对象的方法add_header,在请求头中增加信息
- Request对象的其他常用方法:
add_data(data)
add_header(key,value)
add_unredirected_header(key,value)
get_data()
get_full_url()
get_header(header_name)
get_host()
get_method()
get_origin_req_host()
import urllib2
def download(url):
    """Fetch *url* with a browser-like User-Agent and return the response body.

    The Connection header read back from the request and the HTTP status
    code of the response are printed as a side effect.

    Args:
        url: Absolute URL to request.

    Returns:
        The raw response body as returned by ``response.read()``.

    Any exception raised by ``urllib2.urlopen`` propagates to the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    request = urllib2.Request(url, headers=headers)
    # Headers can also be attached after the Request has been constructed.
    request.add_header("Connection", "keep-alive")
    # Read the header back and show it; the value was previously fetched
    # and then thrown away, so nothing was actually displayed.
    print(request.get_header(header_name="Connection"))
    response = urllib2.urlopen(request)
    try:
        print(response.code)  # HTTP status code
        return response.read()
    finally:
        # Release the underlying socket even if reading fails.
        response.close()
# Fetch one listing page and dump whatever body download() returns.
url = "https://www.lagou.com/jobs/list_python/p-city_2"
page = download(url)
print(page)
urlencode/urlparse编解码
- 解决编码问题:
- URL中不能直接包含中文等非ASCII字符,必须先将中文转换成百分号编码(URL编码)后的字符格式才能完成搜索功能
- 统一规范,编码解码
# Python 2: urlencode and unquote live directly in the urllib module.
import urllib

word = {"kw": "waws520"}
encoded = urllib.urlencode(word)
print(encoded)
print(urllib.unquote(encoded))
# Output (the value is plain ASCII, so there is nothing to escape and both
# lines are identical):
#   kw=waws520
#   kw=waws520
# A non-ASCII value is percent-encoded byte by byte instead; for example a
# value whose UTF-8 bytes are E7 8E 8B E4 BC 9F is printed by urlencode as
#   kw=%E7%8E%8B%E4%BC%9F
# and unquote turns that back into the original bytes.
# Python 3: the same helpers are found in urllib.parse.
import urllib.parse

word = {"kw": "waws520"}
query = urllib.parse.urlencode(word)
print(query)                         # encoded form of the dict
print(urllib.parse.unquote(query))   # decoded back again
urlencode/urlparse 路径参数的生成
路径参数的详细用法:下面的爬取网页代码演示了urllib.urlencode()的使用
import urllib
import urllib2
# Build a request whose query string carries the search term, then print
# the body of the response.
url = "http://www.baidu.com/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
word = {"wd": "刘欢"}
# urlencode turns the dict into a key=value string that is safe inside a URL.
kw = urllib.urlencode(word)
newurl = "?".join([url, kw])
request = urllib2.Request(newurl, headers=headers)
response = urllib2.urlopen(request)
print(response.read())
多参数的书写方式
import urllib2
import urllib
def download(url, addr, mytype):
    """Request *url* with ``jl``, ``kw`` and ``kt`` query parameters.

    The HTTP status code of the response is printed as a side effect.

    Args:
        url: Base URL; the query string is appended after a ``?``.
        addr: Value sent as the ``jl`` query parameter.
        mytype: Value sent as the ``kw`` query parameter.

    Returns:
        The raw response body as returned by ``response.read()``.

    Any exception raised by ``urllib2.urlopen`` propagates to the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    # Encode all parameters in one call so every pair is joined with "&".
    # The hand-built string previously omitted the separator before "kt=3",
    # which glued it onto the kw value. A list of pairs keeps the order fixed.
    query = urllib.urlencode([("jl", addr), ("kw", mytype), ("kt", "3")])
    request = urllib2.Request(url + "?" + query, headers=headers)
    request.add_header("Connection", "keep-alive")
    response = urllib2.urlopen(request)
    try:
        print(response.code)  # HTTP status code
        return response.read()
    finally:
        # Release the underlying socket even if reading fails.
        response.close()
# Run one search and dump the body; download() sends addr as the "jl"
# parameter and mytype as the "kw" parameter.
url = "https://sou.zhaopin.com/"
addr = "530"
mytype = "python"
print(download(url, addr, mytype))