Cookie 与字典互转（CookieJar ↔ dict）
# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :requests技巧.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/24 15:01
import requests
# Fetch Baidu's homepage; the response carries Set-Cookie headers.
response = requests.get('https://www.baidu.com')
# RequestsCookieJar object (prints as <RequestsCookieJar[...]>)
print(response.cookies)
# Convert the CookieJar into a plain dict.
# Sample output: {'BD_NOT_HTTPS': '1', 'PSTM': '1729754274', 'H_PS_PSSID': '60449_60839_60851_60886_60875_60934', 'BAIDUID': '3A1E2F1C84792B2CD3AB547D72AC93E4:FG=1', 'BAIDUID_BFESS': '3A1E2F1C84792B2CD3AB547D72AC93E4:FG=1'}
print(requests.utils.dict_from_cookiejar(response.cookies))
# Convert a plain dict back into a CookieJar.
print(requests.utils.cookiejar_from_dict({'BD_NOT_HTTPS': '1', 'PSTM': '1729754274', 'H_PS_PSSID': '60449_60839_60851_60886_60875_60934', 'BAIDUID': '3A1E2F1C84792B2CD3AB547D72AC93E4:FG=1', 'BAIDUID_BFESS': '3A1E2F1C84792B2CD3AB547D72AC93E4:FG=1'}))
请求SSL证书验证
SSL证书验证: HTTPS = HTTP + SSL/TLS（verify=False 跳过证书校验，仅用于测试）
# Skip TLS certificate verification -- only acceptable for testing;
# requests will emit an InsecureRequestWarning for this call.
response_12306 = requests.get('https://www.12306.cn', verify=False)
print(response_12306)
设置超时
# Give up if no response arrives within 3 seconds (raises requests.Timeout).
timeout_seconds = 3
r = requests.get('https://www.google.com.hk/', timeout=timeout_seconds)
print(r)
配合状态码判断是否请求成功
# `assert` is stripped when Python runs with -O, so validate explicitly:
# fail loudly if the request did not return HTTP 200.
if r.status_code != 200:
    raise RuntimeError(f'request failed with status {r.status_code}')
# Desktop Chrome User-Agent so servers do not serve a bot/blank page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
}
def _parse_url(url):
    """Fetch *url* and return the decoded response body as text.

    Raises requests.RequestException on network errors / 3-second timeout,
    and requests.HTTPError when the status code is not 200.
    """
    response = requests.get(url, headers=headers, timeout=3)
    # `assert` is stripped under `python -O`; raise explicitly so the
    # status check always runs.
    if response.status_code != 200:
        raise requests.HTTPError(f'unexpected status code: {response.status_code}')
    return response.content.decode()
def parse_url(url):
    """Fetch *url*, returning the page text or None on any failure.

    A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
    `except Exception` keeps the best-effort behavior without hiding
    interpreter-level signals.
    """
    try:
        html_str = _parse_url(url)
    except Exception:
        html_str = None
    return html_str
if __name__ == '__main__':
    # Smoke-test the fetch helper against Baidu's homepage.
    target_url = 'https://www.baidu.com'
    print(parse_url(target_url))
URL 编解码（percent-encoding / percent-decoding）
# Percent-decode: '%E7%BD%91...' -> '网络爬虫'
decoded = requests.utils.unquote('%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB')
print(decoded)
# Percent-encode: '网络爬虫' -> '%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
encoded = requests.utils.quote('网络爬虫')
print(encoded)
retrying：请求失败自动重试（第三方库，pip install retrying）
# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :超时重发.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/24 16:14
import requests
from retrying import retry
# Desktop Chrome User-Agent so servers do not serve a bot/blank page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
}
@retry(stop_max_attempt_number=3)
def _parse_url(url):
    """Fetch *url* and return the decoded body; retried up to 3 times.

    `retrying.retry` re-invokes the function on any exception, so both
    network errors (3-second timeout) and non-200 responses trigger a retry.
    """
    print('代码执行了几次')
    response = requests.get(url, headers=headers, timeout=3)
    # `assert` is stripped under `python -O`; raise explicitly so the
    # retry decorator always sees a failure on non-200 responses.
    if response.status_code != 200:
        raise requests.HTTPError(f'unexpected status code: {response.status_code}')
    return response.content.decode()
def parse_url(url):
    """Fetch *url*, returning the page text or None after retries fail.

    A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
    `except Exception` keeps the best-effort behavior without hiding
    interpreter-level signals.
    """
    try:
        html_str = _parse_url(url)
    except Exception:
        html_str = None
    return html_str
if __name__ == '__main__':
    # Google is typically unreachable from mainland China, so this
    # demonstrates the retry-then-None path.
    target_url = 'https://www.google.com.hk/'
    print(parse_url(target_url))