Python crawler learning - day 005


Requests

Requests is an HTTP library written in Python, built on top of urllib and released under the Apache2 License. It is more convenient than urllib, saves a great deal of work, and fully covers the needs of HTTP testing.

A simple, easy-to-use HTTP library implemented in Python.

Usage
import requests
response = requests.get("https://www.baidu.com/")
print(type(response))
print(response.status_code)
print(type(response.text))
print(response.text)
print(response.cookies)
Request methods
import requests
requests.post('http://httpbin.org/post')
requests.put('http://httpbin.org/put')
requests.delete('http://httpbin.org/delete')
requests.head('http://httpbin.org/get')
requests.options('http://httpbin.org/get')
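Each of these helpers is a thin wrapper around requests.request, which takes the HTTP method name as a string; a minimal sketch of the equivalent call:

import requests

# Equivalent to requests.get(...): the method is simply passed as a string
response = requests.request('GET', 'http://httpbin.org/get')
print(response.status_code)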

Making requests

GET requests
import requests
response = requests.get("http://httpbin.org/get")
print(response)

# GET request with parameters
response = requests.get("http://httpbin.org/get?name=germey&age=22")
print(response)


data = {
  'name': 'germey',
  'age': 22
}
response = requests.get("http://httpbin.org/get", params=data)
print(response.text)

Parsing JSON
import requests
import json

response = requests.get("http://httpbin.org/get")
print(type(response.text))
print(response.json())  # equivalent to json.loads(response.text)
print(json.loads(response.text))
print(response.content)
Fetching binary content
import requests
response = requests.get('https://github.com/favicon.ico')
print(type(response.text), type(response.content))
print(response.text)
print(response.content)
with open('favicon.ico', 'wb') as f:
  f.write(response.content)
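For larger binary files it is usually better not to hold the whole body in memory; a minimal sketch using stream=True and iter_content (the chunk size here is just an example):

import requests

# Stream the download and write it to disk chunk by chunk
response = requests.get('https://github.com/favicon.ico', stream=True)
with open('favicon.ico', 'wb') as f:
  for chunk in response.iter_content(chunk_size=1024):  # 1 KB per chunk
    if chunk:
      f.write(chunk)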
Adding headers
import requests
response = requests.get('https://www.zhihu.com/explore')
print(response.text)  # without headers this returns a 500 straight away

headers = {
  "User-Agent": "Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.text)

POST

import requests
data = {'name': 'Germey', 'age': 22}
response = requests.post('http://httpbin.org/post', data=data)
print(response.text)

headers = {
  "User-Agent": "Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
}

response = requests.post('http://httpbin.org/post', data=data, headers=headers)
print(response.text)
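requests can also serialize a dict into a JSON request body through the json parameter instead of sending form data; a small sketch (httpbin echoes the JSON back):

import requests

# The dict is sent as a JSON body with Content-Type: application/json
response = requests.post('http://httpbin.org/post', json={'name': 'Germey', 'age': 22})
print(response.json())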
Responses
Response attributes
import requests

response = requests.get("http://www.baidu.com")
print(type(response.status_code), response.status_code)
print(type(response.headers), response.headers)
print(type(response.cookies), response.cookies)
print(type(response.url), response.url)
print(type(response.history), response.history)
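response.history lists the intermediate responses of any redirects that were followed; passing allow_redirects=False returns the redirect itself instead. A sketch, assuming http://github.com still redirects to its HTTPS address:

import requests

# Redirects followed by default: history holds the 301, url is the final address
response = requests.get("http://github.com")
print(response.history, response.url)

# With redirects disabled we get the 301 response itself and history stays empty
response = requests.get("http://github.com", allow_redirects=False)
print(response.status_code, response.history)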
Status code checks
import requests

response = requests.get("http://www.baidu.com")
exit() if not response.status_code == requests.codes.ok else print("Request Successfully!")

response = requests.get("http://www.baidu.com")
exit() if response.status_code != 200 else print("Request Successfully")

# HTTP status codes
# Chinese annotations to be added later
http_status_code = {
  100: ("continue", ),
  101: ("switching_protocols", ),
  102: ("processing", ),
  103: ("checkpoint", ),
  122: ("url_too_long", "request_url_too_long"),
  200: ("ok", "okay", "all_ok", "all_good", "\\o/", "√"),
  201: ("created", ),
  202: ("accepted", ),
  203: ("non_authoritative_info", "non_authoritative_information") ,
  204: ("no_content", ),
  205: ("reset_content", "reset"),
  206: ("partial_content", "partial"),  # the client sent a GET request with a Range header and the server fulfilled it (HTTP 1.1)
  207: ("multi_status", "multiple_status", "multi_stati", "multiple_stati"),
  208: ("already_reported", ),
  226: ("im_used", ),
  
  # Redirection
  300: ("multiple_choices", ),
  301: ("moved_permanently", "moved", "\\o-"),
  302: ("found",),
  303: ("see_other", "other"),
  304: ("not_modified", ),
  305: ("use_proxy",),
  306: ("switch_proxy",),
  307: ("temporary_redirect", "temporary_moved", "temporary"),
  308: ("permanent_redirect", "resume_incomplete", "resume"),
  
  # Client Error
  400: ("bad_request", "bad"),
  401: ("unauthorized",),
  402: ("payment_required", "payment"),
  403: ("forbidden", ),
  404: ("not_found", '-o-'),
  405: ("method_not_allowed", "not_allowed"),
  406: ("not_acceptable",),
  407: ("proxy_authentication_required", "proxy_auth", "proxy_authentication"),
  408: ("request_timeout", "timeout"),
  409: ("conflict",),
  410: ("gone",),
  411: ("length_required", ),
  412: ("precondition_failed", "precondition"),
  413: ("request_entity_too_large", ),
  414: ("request_url_too_large",),
  415: ("unsupported_media_type", "unsupported_media", "media_type"),
  416: ("requested_range_not_satisfiable", "request_range", "range_not_satisfiable"),
  417: ("expectation_failed",),
  418: ("im_a_teapot", "teapot", "i_am_a_teapot"),
  421: ("misdirected_request", ),
  422: ("unprocessable_entity", "unprocessable"),
  423: ("locked",),
  424: ("failed_dependency", "dependency"),
  425: ("unordered_collection", "unordered"),
  426: ("upgrade_required", "upgrade"),
  428: ("precondition_required", "precondition"),
  429: ("too_many_requests", "too_many"),
  431: ("header_fields_too_large", "fields_too_large"),
  444: ("no_response", "none"),
  449: ("retry_with", "retry"),
  450: ("blocked_by_windows_parental_controls", "parental_controls"),
  451: ("unavailable_for_legal_reasons", "legal_reasons"),
  499: ("client_closed_request", ),
  
  # Server Error
  500: ("internal_server_error", "server_error", "/o\\", "x"),
  501: ("not_implemented",),
  502: ("bad_gateway",),
  503: ("service_unavailable", "unavailable"),
  504: ("gateway_timeout",),
  505: ("http_version_not_supported", "http_version"),
  506: ("variant_also_negotiates", ),
  507: ("insufficient_storage", ),
  509: ("bandwidth_limit_exceeded", "bandwidth"),
  510: ("not_extended", ),
  511: ("network_authentication_required", "network_auth", "network_authentication"),
}
# the official documentation explains each of these in detail
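The names above are exposed as attributes of requests.codes, and response.raise_for_status() raises an HTTPError for any 4xx/5xx status, which is often simpler than comparing codes by hand; a minimal sketch:

import requests

response = requests.get("http://www.baidu.com")
print(requests.codes.ok, requests.codes.not_found)  # 200 404
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
print("Request Successfully!")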

Advanced operations

File upload
import requests
files = {"file": open("favicon.ico", "rb")}
response = requests.post("http://httpbin.org/post", files=files)
print(response.text)
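The values in files can also be tuples, which lets you set the uploaded filename and MIME type explicitly; a small sketch (the field name and MIME type here are illustrative):

import requests

# (filename, file object, content type) controls how the multipart field is labelled
files = {"file": ("favicon.ico", open("favicon.ico", "rb"), "image/x-icon")}
response = requests.post("http://httpbin.org/post", files=files)
print(response.text)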
Getting cookies
import requests

response = requests.get("http://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
  print(key + "=" + value)
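Cookies can also be sent with a request by passing a dict to the cookies parameter; a minimal sketch (httpbin echoes back the cookies it receives):

import requests

cookies = {"number": "123456789"}
response = requests.get("http://httpbin.org/cookies", cookies=cookies)
print(response.text)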
Session persistence
import requests

requests.get("http://httpbin.org/cookies/set/number/123456789")
response = requests.get("http://httpbin.org/cookies")
print(response.text)  # empty: two independent requests.get() calls do not share cookies


s = requests.Session()
s.get("http://httpbin.org/cookies/set/number/123456789")
response = s.get("http://httpbin.org/cookies")
print(response.text)  # the Session keeps the cookie set by the previous request
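A Session can also carry defaults such as headers for every request it makes, which saves repeating the same User-Agent on each call; a sketch with a placeholder User-Agent value:

import requests

s = requests.Session()
s.headers.update({"User-Agent": "my-crawler/0.1"})  # sent with every request on this session
response = s.get("http://httpbin.org/headers")
print(response.text)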

Certificate verification
import requests

response = requests.get("https://www.12306.cn")  # may raise an SSLError if the site's certificate cannot be verified
print(response.status_code)

from requests.packages import urllib3
urllib3.disable_warnings()  # turn off warnings
response = requests.get("https://www.12306.cn", verify=False)
print(response.status_code)

response = requests.get("https://www.12306.cn", cert=('/path/server.crt', '/path/key'))
print(response.status_code)

Proxy settings
import requests

proxies = {
  "http": "http://127.0.0.1:9743",
  "https": "https://127.0.0.1:9743"
}

response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)

pip install 'requests[socks]'

import requests
proxies = {
  'http': 'socks5://127.0.0.1:9742',
  "https": 'socks5://127.0.0.1:9742'
}
response = requests.get("https://www.taobao.com", proxies=proxies)
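If the proxy requires authentication, the credentials can be embedded in the proxy URL; a sketch where user, password and port are placeholders:

import requests

proxies = {
  "http": "http://user:password@127.0.0.1:9743",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)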
Timeout settings
import requests
response = requests.get("https://httpbin.org/get", timeout=1)
print(response.status_code)

from requests.exceptions import ReadTimeout
try:
  response = requests.get("https://httpbin.org/get", timeout=0.5)
  print(response.status_code)
except ReadTimeout:
  print("Timeout")
Authentication settings
import requests
from requests.auth import HTTPBasicAuth

r = requests.get("http://120.27.34.24:9001", auth=HTTPBasicAuth('user', '123'))
print(r.status_code)

r = requests.get("http://120.27.34.24:9001", auth=('user', '123'))
print(r.status_code)
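requests also ships other authentication helpers, for example HTTPDigestAuth for servers that use digest authentication; a minimal sketch against httpbin's digest-auth endpoint:

import requests
from requests.auth import HTTPDigestAuth

r = requests.get("http://httpbin.org/digest-auth/auth/user/pass", auth=HTTPDigestAuth("user", "pass"))
print(r.status_code)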
Exception handling
import requests
from requests.exceptions import ReadTimeout, HTTPError, RequestException
try:
  response = requests.get("http://httpbin.org/get", timeout=0.5)
  print(response.status_code)
except ReadTimeout:
  print("Timeout")
except HTTPError:
  print("Http error")
except RequestException:
  print("Error")

These study notes are based on 崔庆才 (Cui Qingcai)'s Python 3 web crawler course.