Using a proxy with urllib
from urllib.request import Request, ProxyHandler, build_opener
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
headers = {
    "User-Agent": UserAgent().random
}
request = Request(url, headers=headers)
# Route the request through an HTTP proxy
handler = ProxyHandler({'http': "180.125.208.103:44011"})
opener = build_opener(handler)
response = opener.open(request)
print(response.read().decode())
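If the proxy itself requires authentication, ProxyHandler also accepts credentials embedded in the proxy URL. A minimal sketch; the address and user:pass below are placeholders, not a working proxy:

from urllib.request import Request, ProxyHandler, build_opener

# Placeholder proxy address and credentials
handler = ProxyHandler({'http': 'http://user:pass@10.10.1.10:3128'})
opener = build_opener(handler)
# response = opener.open(Request("http://httpbin.org/get"))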
Requests
1 Installation
pip install requests
2 Basic requests
import requests

req = requests.get("http://www.baidu.com")
req = requests.post("http://www.baidu.com")
req = requests.put("http://www.baidu.com")
req = requests.delete("http://www.baidu.com")
req = requests.head("http://www.baidu.com")
req = requests.options("http://www.baidu.com")
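Each of these calls returns a Response object. A quick sketch (assuming network access) to confirm a request went through:

import requests

req = requests.get("http://www.baidu.com")
print(req.status_code)   # 200 on success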
2.1 GET requests
The parameters are a dictionary; JSON-style parameters can also be passed.
import requests
from fake_useragent import UserAgent

url = "https://www.baidu.com/s"
headers = {
    'User-Agent': UserAgent().random
}
params = {'wd': '勇哥的ID'}   # Baidu's search keyword parameter is wd
response = requests.get(url, params=params, headers=headers)
print(response.url)
response.encoding = 'utf-8'   # set the encoding before reading .text
html = response.text
print(html)
2.2 POST requests
The parameters are a dictionary; JSON-style parameters can also be passed:
url = "http://www.exeddddd.com"
formdata={
"user":"111111111".
"password":"11111111"
}
response = requests.post(rul,data=formdata)
response.encoding='utf-8'
html = response.text
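Since JSON-style parameters are mentioned above: the json parameter sends the dictionary as a JSON body (and sets the Content-Type header automatically). A minimal sketch against the httpbin echo service, with placeholder data:

import requests

url = "http://httpbin.org/post"   # echo service, assumed reachable
payload = {"user": "111111111", "password": "11111111"}
response = requests.post(url, json=payload)   # body is serialized as JSON
print(response.json()["json"])    # httpbin echoes the parsed JSON back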
2.3 Custom request headers
Spoofing the request headers is a common scraping technique; we can use it to disguise the client.
headers = {"User-Agent":UserAgent().random}
r = requests.get(url,headers=headers)
print(r.request.headers['User-Agent'])
2.4 Setting a timeout
The timeout parameter sets a time limit; if no response arrives within it, an error is raised.
requests.get(url, timeout=1)
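The error raised on timeout is requests.exceptions.Timeout, which can be caught. A minimal sketch using httpbin's delay endpoint:

import requests

try:
    # The server waits 3 seconds, but we only allow 1
    requests.get("http://httpbin.org/delay/3", timeout=1)
except requests.exceptions.Timeout:
    print("request timed out")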
2.5 Proxy access
To avoid getting the IP banned while scraping, a proxy is often used; requests supports this with the corresponding proxies parameter.
import requests

proxies = {
    "http": "http://49.86.183.105:31897",
    "https": "http://49.86.56.110:32472"   # one entry per scheme; duplicate keys would overwrite each other
}
requests.get(url, proxies=proxies)
# If the proxy requires a username and password, use this form
proxies = {
"http" : "http://user:pass@10.10.1.10:3128/",
}
2.6 Sessions automatically keep cookies
A session keeps a conversation alive, e.g. continuing to operate after logging in (identity information is remembered), whereas a plain requests call is a one-off request and identity information is not kept.
# Create a Session object
s = requests.Session()
s.get(url)
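A quick way to see the cookies being kept, using httpbin's cookie endpoints (assuming network access):

import requests

s = requests.Session()
s.get("http://httpbin.org/cookies/set?name=yongge")   # server sets a cookie
resp = s.get("http://httpbin.org/cookies")            # cookie is sent back automatically
print(resp.json())   # {'cookies': {'name': 'yongge'}}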
2.7 SSL verification
# Disable insecure-request warnings
requests.packages.urllib3.disable_warnings()
resp = requests.get(url, verify=False, headers=headers)
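Rather than disabling verification entirely, verify can also point at a CA bundle file; the path below is a placeholder:

# '/path/to/ca.pem' is a placeholder for a real certificate bundle
resp = requests.get(url, verify='/path/to/ca.pem', headers=headers)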
3 Getting response information
resp.json(): the response body parsed as JSON
resp.text: the response body as a string
resp.content: the response body as bytes
resp.headers: the response headers
resp.url: the requested URL
resp.encoding: the page encoding
resp.request.headers: the request headers that were sent
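A short sketch exercising these fields against httpbin (assuming network access):

import requests

resp = requests.get("http://httpbin.org/get")
print(resp.url)                   # the requested URL
print(resp.encoding)              # encoding guessed from the headers (may be None)
print(resp.headers['Content-Type'])
print(resp.request.headers['User-Agent'])
print(resp.json()['url'])         # body parsed as JSON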
requests + re in practice
# Use regular expressions to extract the jokes from Qiushibaike
import re
import requests
from fake_useragent import UserAgent

url = "https://www.qiushibaike.com/text/page/1/"
headers = {
    "User-Agent": UserAgent().random
}
response = requests.get(url, headers=headers)
info = response.text
# Raw string so the \s escapes reach the regex engine intact
infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>', info)
with open('duanzi.txt', 'a', encoding='utf-8') as f:
    for info in infos:
        f.write(info + '\n\n\n')
Beautiful Soup
beautifulsoup.readthedocs.io/zh_CN/v4.4.…
Installation:
pip install beautifulsoup4
pip install lxml
Sample
<title>勇哥的id</title>
<div class='info' float='left'>hello world</div>
<div class='info' float='right'>
<span>Good Study</span>
<a href='https://www.baidu.com'></a>
<strong><!--this is a comment--></strong>
</div>
from bs4 import BeautifulSoup
from bs4 import Comment
str_1 = """
<title>勇哥的id</title>
<div class='info' float='left'>hello world</div>
<div class='info' float='right'>
<span>Good Study</span>
<a href='https://www.baidu.com'></a>
<strong><!--this is a comment--></strong>
</div>
"""
soup = BeautifulSoup(str_1, 'lxml')
soup.get_text()          # all of the text content
soup.title               # get one element
soup.div                 # get an element, but only the first match
soup.div.get('float')    # get a single attribute of the element
soup.strong.string       # get the comment content
soup.div.attrs           # all attributes of the tag, as a dict
# About comments
if type(soup.strong.string) == Comment:
    print(soup.strong.string)
    # Output with the original HTML markup
    print(soup.strong.prettify())
else:
    print(soup.strong.text)
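Since soup.div only returns the first match, find_all collects every match; a short sketch on the same soup object:

# Every <div>, not just the first
for div in soup.find_all('div'):
    print(div.get('float'))       # left, then right

# find_all can also filter on attributes
print(len(soup.find_all('div', class_='info')))   # 2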
CSS selectors
print("-----------css选择器-------------")
# id选择器
print(soup.select("#title")[0].text)
# 类选择器
print(soup.select('.info')[0].attrs.get('float'))
# 连续select
print(soup.select('.info')[1].select('a')[0].attrs.get('href'))
XPath selectors
import requests
from fake_useragent import UserAgent
from lxml import etree

url = "https://www.qidian.com/rank/yuepiao?chn=-1&page=1"
headers = {
    "User-Agent": UserAgent().random
}
response = requests.get(url=url, headers=headers)
e = etree.HTML(response.text)
names = e.xpath('//h4/a/text()')
authors = e.xpath('//p[@class="author"]/a/text()')
for name, author in zip(names, authors):
    print(f"Title: {name}")
    print(f"Author: {author}")
    print("*" * 20)