Python Web Scraping — Practical Use of requests



1. The proxies parameter (proxy IPs)

Using the proxies parameter

proxies is the parameter Requests provides for routing a request through a proxy IP: pass a dict that maps each URL scheme ('http' / 'https') to the corresponding proxy address.

import requests

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
}
# Free proxy IPs (free proxies expire quickly -- replace them with ones that are currently alive)
proxies = {
    'http': 'http://47.113.90.161:83',
    'https': 'https://111.3.118.247:30001'
}
content = requests.get(url, proxies=proxies, headers=headers, timeout=5).text
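Free proxies die quickly, so it helps to confirm the proxy is actually being used before scraping through it. A minimal sketch, assuming the echo service http://httpbin.org/ip (not part of the original example) and the same sample proxy address: if the proxy is applied, the 'origin' field should show the proxy's IP rather than your own.

import requests

# Sample free proxy -- very likely stale by now, substitute one that is alive
proxies = {'http': 'http://47.113.90.161:83'}

try:
    # httpbin.org/ip echoes back the IP address the request arrived from
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(resp.json()['origin'])
except requests.exceptions.RequestException as e:
    print('Proxy unreachable:', e)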

2. Logging in with requests

Code example. The flow: request the login page, extract the hidden __VIEWSTATE / __VIEWSTATEGENERATOR form fields from the HTML, download the captcha through a session, then POST the login form.

import requests
from lxml import etree
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
     'cookie':'login=flase; Hm_lvt_9007fab6814e892d3020a64454da5a55=1660892375; ASP.NET_SessionId=qjqxtkdr2vi1nwooegykjg3z; ticketStr=201867921%7cgQFK8DwAAAAAAAAAAS5odHRwOi8vd2VpeGluLnFxLmNvbS9xLzAyRGg4TFJqbGVkN2kxZnIxQ3h6MVoAAgTbNP9iAwQAjScA; codeyzgswso=669933fd155ef03c; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1660892996'
}
response = requests.get(url=url, headers=headers)
content = response.text
# ======================================= Parse with XPath (lxml) =======================================
parse = etree.HTML(content)
viewstate_xpath = parse.xpath('//input[@id="__VIEWSTATE"]/@value')[0]
viewstategenerator_xpath = parse.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')[0]
print(viewstate_xpath)
print(viewstategenerator_xpath)

# ======================================= Parse with BeautifulSoup =======================================
# Use the lxml parser
soup = BeautifulSoup(content, 'lxml')
viewstate_bs4 = soup.select('#__VIEWSTATE')[0].attrs.get('value')
viewstategenerator_bs4 = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')
print(viewstate_bs4)
print(viewstategenerator_bs4)

# ======================================= Parse with Selenium =======================================
driver = webdriver.Chrome()
driver.get(url)
viewstate_selenium = driver.find_element(By.ID, '__VIEWSTATE').get_attribute('value')
viewstategenerator_selenium = driver.find_element(By.ID, '__VIEWSTATEGENERATOR').get_attribute('value')
# URL of the captcha image
code_url = driver.find_element(By.ID, 'imgCode').get_attribute('src')
print(viewstate_selenium)
print(viewstategenerator_selenium)
driver.quit()

# Use a session so requests share state (cookies); this guarantees the captcha downloaded here
# is the same captcha the server expects when we log in below
session = requests.Session()
down_response = session.get(code_url)
# The content attribute gives the response body as bytes, suitable for downloading images
code_down = down_response.content
# Write the binary data to a file
with open('C:/Users/Administrator/Desktop/image/code_down.jpg', 'wb') as f:
    f.write(code_down)

code_name = input('Enter the captcha: ')
login_post = {
    '__VIEWSTATE': viewstate_selenium,
    '__VIEWSTATEGENERATOR': viewstategenerator_selenium,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': 'YOUR_ACCOUNT',    # replace with your account
    'pwd': 'YOUR_PASSWORD',     # replace with your password
    'code': code_name,
    'denglu': '登录'
}

response_login = session.post(url=url, headers=headers, data=login_post)
content_login = response_login.text
with open('gushiwenwang.html', 'w', encoding='utf-8') as fp:
    fp.write(content_login)

Note: the session keeps state (cookies) across requests, which is what guarantees that the captcha downloaded above is the same captcha the server checks at login.
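The effect of a Session is easy to see in isolation. A minimal sketch, assuming the echo service httpbin.org (not part of the gushiwen flow): a Session carries cookies from one request to the next, while independent requests.get calls do not.

import requests

session = requests.Session()
# The server sets a cookie here; the session stores it automatically
session.get('http://httpbin.org/cookies/set/sessionid/abc123')
# The stored cookie is sent back on the next request made through the same session
print(session.get('http://httpbin.org/cookies').json())   # {'cookies': {'sessionid': 'abc123'}}

# Two independent requests.get calls share nothing, so the cookie is lost
requests.get('http://httpbin.org/cookies/set/sessionid/abc123')
print(requests.get('http://httpbin.org/cookies').json())  # {'cookies': {}}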

Output (the XPath and BeautifulSoup values are identical because they parse the same response; the Selenium __VIEWSTATE differs because Selenium issues its own request, while __VIEWSTATEGENERATOR is fixed for the page):

sbcuO7uef79ONGY0PgQ1FGwdL6WUU85gHoi9ui/BEY+Iw6HI7luKQt7m384KSliyBaFEuN64rF92vWFs89jYLiEBkNOFE/IPO+CdU66iMrygOICpCUVkELWZj6dlBzgVS0p5Y6TDPEMUNjOnEfqjJoeP49c=
C93BE1AE

sbcuO7uef79ONGY0PgQ1FGwdL6WUU85gHoi9ui/BEY+Iw6HI7luKQt7m384KSliyBaFEuN64rF92vWFs89jYLiEBkNOFE/IPO+CdU66iMrygOICpCUVkELWZj6dlBzgVS0p5Y6TDPEMUNjOnEfqjJoeP49c=
C93BE1AE

Dfg6sJMsGMy6mp8EuqjpE1jEQ05VJtgBGzMEtORW0G1LU+5rg6Y2FlgJGwqe7hm2bsr7eLSj0FYgxpWi6w8XPhmYNTExjlaq1N9oyZwPJl3tRHXHAR5udSoXk1ZYrG0iL6tvO57Cxy9EQCPzu4KJRSrhjSE=
C93BE1AE

The content attribute returns the response body as bytes, which is how the captcha image is downloaded and then saved locally:


Successful login to the personal home page:
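To check the result programmatically instead of opening the saved HTML, one option is to request a login-only page with the same session. A minimal sketch: the marker string is an assumption about what the logged-in page contains, so adjust it to something that only appears after a successful login (for example a log-out link).

# Re-use the same session to request a page that requires login
check = session.get('https://so.gushiwen.cn/user/collect.aspx', headers=headers)
# '退出登录' (log out) is an assumed marker of the logged-in state -- adjust as needed
if '退出登录' in check.text:
    print('Login succeeded')
else:
    print('Login may have failed -- check the captcha or credentials')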