Web Scraping Basics (2) --- Using the Basic Libraries


Using a proxy in urllib

from urllib.request import Request, urlopen, ProxyHandler, build_opener
from fake_useragent import UserAgent

url = "http://httpbin.org/get"
headers = {
    "User-Agent":UserAgent().random
}

request = Request(url,headers=headers)
handler = ProxyHandler({'http':"180.125.208.103:44011"})
opener = build_opener(handler)
response = opener.open(request)
print(response.read().decode())
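If every later urlopen() call should go through the proxy, the opener can also be installed globally with install_opener. A minimal sketch, reusing the same (placeholder) proxy address from above:

from urllib.request import ProxyHandler, build_opener, install_opener, urlopen

# placeholder proxy address; replace it with a working proxy
handler = ProxyHandler({'http': "180.125.208.103:44011"})
opener = build_opener(handler)
install_opener(opener)  # from now on, plain urlopen() calls use this opener

response = urlopen("http://httpbin.org/get")
print(response.read().decode())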

Requests

1 Installation

pip install requests

2 Basic requests

import requests

req = requests.get("http://www.baidu.com")
req = requests.post("http://www.baidu.com")
req = requests.put("http://www.baidu.com")
req = requests.delete("http://www.baidu.com")
req = requests.head("http://www.baidu.com")
req = requests.options("http://www.baidu.com")
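Each of these calls returns a Response object; as a quick sanity check you can look at the status code. A minimal sketch:

import requests

req = requests.get("http://www.baidu.com")
print(req.status_code)  # HTTP status code, e.g. 200
print(req.ok)           # True when the status code is below 400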

2.1 GET requests

The parameters are passed as a dictionary; JSON-style parameters can also be passed.

import requests
from fake_useragent import UserAgent

url = "https://www.baidu.com/s"
headers = {
    'User-Agent':UserAgent().random
}
params = {'w':'勇哥的ID'}
response = requests.get(url,params=params,headers=headers)
print(response.url)
response.encoding = 'utf-8'
html = response.text
print(html)

2.2 POST requests

The parameters are passed as a dictionary; JSON-style parameters can also be passed:

url = "http://www.exeddddd.com"
formdata={
    "user":"111111111".
    "password":"11111111"
}
response = requests.post(rul,data=formdata)
response.encoding='utf-8'
html = response.text

2.3 Custom request headers

Spoofing the request headers is a common technique when scraping; we can use it to disguise our client.

headers = {"User-Agent":UserAgent().random}
r = requests.get(url,headers=headers)
print(r.request.headers['User-Agent'])

2.4 Setting a timeout

A timeout can be set with the timeout parameter; if no response is received within that time, an error is raised.

requests.get(url,timeout=1)
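When the deadline is exceeded, requests raises an exception that can be caught, for example requests.exceptions.Timeout. A minimal sketch (httpbin's /delay endpoint is used here only to force a slow response):

import requests

try:
    # /delay/3 answers after roughly 3 seconds, so a 1-second timeout will fail
    response = requests.get("http://httpbin.org/delay/3", timeout=1)
    print(response.status_code)
except requests.exceptions.Timeout:
    print("The request timed out")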

2.5 Proxy access

To avoid getting your IP banned while scraping, proxies are often used; requests supports them through the proxies parameter.

import requests

proxies = {
    "http": "http://49.86.183.105:31897",
    "https": "http://49.86.56.110:32472"
}
requests.get(url, proxies=proxies)

# If the proxy requires a username and password, write it like this
proxies = {
    "http": "http://user:pass@10.10.1.10:3128/",
}

2.6 Sessions automatically keep cookies

A session keeps a conversation going, for example continuing to operate after logging in (the identity information is remembered). A plain requests call, on the other hand, is a one-off request, and identity information is not kept.

# Create a Session object
s = requests.Session()
s.get(url)
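To see the cookies being kept, httpbin's cookie endpoints can be used: one request sets a cookie and the session sends it back automatically on the next. A minimal sketch:

import requests

s = requests.Session()
# the first request sets a cookie, which the Session stores
s.get("http://httpbin.org/cookies/set/token/abc123")
# the second request automatically sends the stored cookie back
resp = s.get("http://httpbin.org/cookies")
print(resp.json())  # {'cookies': {'token': 'abc123'}}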

2.7 SSL verification

# Disable insecure-request warnings
requests.packages.urllib3.disable_warnings()
resp = requests.get(url, verify=False,headers=headers)
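Instead of switching verification off, the verify parameter can also point to a CA bundle file when a site uses a certificate that the system does not trust. A minimal sketch (the path is only a placeholder):

import requests

# verify accepts the path to a CA bundle instead of False
# ("/path/to/ca-bundle.crt" is a placeholder, not a real file)
resp = requests.get("https://example.com", verify="/path/to/ca-bundle.crt")
print(resp.status_code)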

3 Getting response information

resp.json() : get the response content (parsed as JSON)

resp.text : get the response content (as a string)

resp.content : get the response content (as bytes)

resp.headers : get the response headers

resp.url : get the requested URL

resp.encoding : get the page encoding

resp.request.headers : the request headers that were sent
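Putting these together, a minimal sketch that prints the common response fields for a single request (httpbin is used because it returns JSON):

import requests

resp = requests.get("http://httpbin.org/get")
print(resp.url)              # the URL that was requested
print(resp.encoding)         # the detected encoding
print(resp.headers)          # the response headers
print(resp.request.headers)  # the request headers that were sent
print(resp.text[:100])       # the body as text (first 100 characters)
print(resp.json())           # the body parsed as JSON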

requests + re in practice

# Use a regular expression to extract the joke text from Qiushibaike
import re
import requests
from fake_useragent import UserAgent

url = "https://www.qiushibaike.com/text/page/1/"
headers = {
    "User-Agent": UserAgent().random
}

response = requests.get(url, headers=headers)
info = response.text
infos = re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>', info)
with open('duanzi.txt', 'a', encoding='utf-8') as f:
    for info in infos:
        f.write(info + '\n\n\n')

Beautiful Soup

beautifulsoup.readthedocs.io/zh_CN/v4.4.…

Installation:

pip install bs4
pip install lxml

Example

<title>勇哥的id</title>
<div class='info' float='left'>hello world</div>
<div class='info' float='right'>
    <span>Good Study</span>
    <a href='https://www.baidu.com'></a>
    <strong><!--这是注释--></strong>
</div>
from bs4 import BeautifulSoup
from bs4 import Comment
str_1 = """
<title>勇哥的id</title>
<div class='info' float='left'>hello world</div>
<div class='info' float='right'>
    <span>Good Study</span>
    <a href='https://www.baidu.com'></a>
    <strong><!--这是注释--></strong>
</div>
"""

soup = BeautifulSoup(str_1,'lxml')
soup.get_text()        # get all the text content
soup.title             # get a single element
soup.div               # get an element (only the first match)
soup.div.get('float')  # get one attribute of the element
soup.strong.string     # get the comment content
soup.div.attrs         # get all attributes of the tag, as a dict



# Handling comments
if type(soup.strong.string) == Comment:
    print(soup.strong.string)
    # print with the original HTML formatting
    print(soup.strong.prettify())
else:
    print(soup.strong.text)

CSS selectors

print("-----------css选择器-------------")
# id选择器
print(soup.select("#title")[0].text)
# 类选择器
print(soup.select('.info')[0].attrs.get('float'))
# 连续select
print(soup.select('.info')[1].select('a')[0].attrs.get('href'))

XPath selectors

import re
import requests
from lxml import etree
from fake_useragent import UserAgent

url = "https://www.qidian.com/rank/yuepiao?chn=-1&page=1"

headers = {
    "User-Agent":UserAgent().random
}

response = requests.get(url=url, headers=headers)

e = etree.HTML(response.text)

names = e.xpath('//h4/a/text()')
authors = e.xpath('//p[@class="author"]/a/text()')
for name, author in zip(names, authors):
    print(f"Title: {name}")
    print(f"Author: {author}")
    print("*" * 20)