urllib库
from urllib.request import urlopen
# Page to download.
url = "https://www.imooc.com"
# Send the request. The context manager closes the underlying connection
# even if reading the body raises.
with urlopen(url=url) as response:
    # Read the whole response body as bytes.
    info = response.read()
# Print the body as text (bytes.decode() defaults to UTF-8).
print(info.decode())
response.read()
read()方法用于读取响应体的全部内容,返回的是bytes类型,需要decode()之后才能得到字符串
response.getcode()
返回HTTP的响应码,成功返回200,4开头是客户端错误,5开头是服务器错误
response.geturl()
返回实际获取到数据的url,可以用来判断请求是否发生了重定向
response.info()
返回服务器响应的HTTP报头
Request对象
其实上面的urlopen参数可以传入一个request请求,它其实就是一个Request类的实例,构造时需要传入Url,Data等等的内容。比如上面的两行代码,我们可以这么改写
from urllib.request import Request,urlopen
# 随机的User-Agent
from fake_useragent import UserAgent
ua = UserAgent()
url = "https://www.baidu.com"
# Send a randomly chosen User-Agent header with the request.
headers = {
    'User-Agent': ua.random,
}
# Wrap the URL and headers in a Request object and pass it to urlopen().
request = Request(url=url, headers=headers)
# The context manager closes the connection even if reading fails.
with urlopen(request) as response:
    info = response.read()
print(info.decode())
UserAgent
# 随机的User-Agent
from fake_useragent import UserAgent
ua = UserAgent()
# Print ten randomly chosen User-Agent strings.
# NOTE(review): the loop body had lost its indentation; only the first
# print is assumed to belong to the loop -- confirm against the original.
for _ in range(10):
    print(ua.random)
# Print the chrome, firefox and ie attributes of the UserAgent object.
print(ua.chrome)
print(ua.firefox)
print(ua.ie)
print(ua.random)  # pick one User-Agent at random
url编码问题解决-quote,urlencode
# url编码问题
# 使用useragent
from fake_useragent import UserAgent
from urllib.request import urlopen,Request
from urllib.parse import quote,urlencode
# Percent-encode a single string with quote().
args = quote('勇哥')
print(args)
# Encode a dict of query parameters with urlencode().
args = {
    'wd': "勇哥",
    'name': "哥哥",
}
args = urlencode(args)
print(args)
# Append the encoded query string to the search URL.
url = "https://www.baidu.com/s?{}".format(args)
headers = {
    'User-Agent': UserAgent().random,
}
request = Request(url=url, headers=headers)
# The context manager closes the connection even if reading fails.
with urlopen(request) as response:
    info = response.read()
print(info.decode())
urllib发送get请求,获取百度贴吧html
# url编码问题
# 使用useragent
from fake_useragent import UserAgent
from urllib.request import urlopen,Request
from urllib.parse import quote,urlencode
# NOTE(review): the section heading mentions Baidu Tieba, but this snippet
# requests the Baidu search endpoint -- confirm which URL was intended.
# Percent-encode a single string with quote().
args = quote('勇哥')
print(args)
# Encode a dict of query parameters with urlencode().
args = {
    'wd': "勇哥",
    'name': "哥哥",
}
args = urlencode(args)
print(args)
# A Request built without data is sent as GET; the parameters travel in the
# query string.
url = "https://www.baidu.com/s?{}".format(args)
headers = {
    'User-Agent': UserAgent().random,
}
request = Request(url=url, headers=headers)
# The context manager closes the connection even if reading fails.
with urlopen(request) as response:
    info = response.read()
print(info.decode())
urllib发送post请求
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
# NOTE(review): "exple.com" looks like a placeholder or a typo for
# "example.com" -- confirm the intended host.
url = "https://exple.com"
form_data = {
    'name': 'andy',
    'age': 18,
}
headers = {
    "User-Agent": UserAgent().random,
}
# Supplying data makes urllib send a POST request. The body must be bytes,
# so the urlencoded form string is encoded first.
request = Request(url=url, data=urlencode(form_data).encode(), headers=headers)
# The context manager closes the connection even if reading fails.
with urlopen(request) as response:
    info = response.read().decode()
关于AJAX动态请求的思考
当我们去看一个网站,需要爬取的数据在查看网页源代码之后没有显示,说明该数据可能是通过ajax动态请求加载的数据
这个时候可以打开浏览器的"检查"(开发者工具),在Network面板中查看接口,获取接口之后抓取接口返回的内容
如果该请求是ajax请求的话,在Request Headers中通常会有
X-Requested-With: XMLHttpRequest这个请求头(并非所有ajax请求都会带,例如使用fetch发起的请求),也可以在Network面板中按XHR/Fetch类型进行筛选