Analyzing the URL pattern
Open the cat-image pages on the site. The URL pattern is as follows:
Page 1: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html
Page 2: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/2.html
Page 3: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/3.html
Page n: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/n.html
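In other words, only the trailing page number changes, so the full list of page URLs can be built from a simple format string (a minimal sketch; the count of 20 pages is an assumption, adjust it to however many pages the tag actually has):

base = 'https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/{}.html'
page_urls = [base.format(i) for i in range(1, 21)]  # pages 1 through 20 (assumed count)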
Install the required libraries
pip install beautifulsoup4
pip install requests
pip install lxml
(urllib3 is pulled in automatically as a dependency of requests, so it does not need a separate install.)
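A quick import check confirms that everything installed correctly (a minimal sanity-check sketch):

import bs4
import requests
import lxml.etree
print('bs4', bs4.__version__)
print('requests', requests.__version__)
print('lxml', lxml.etree.__version__)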
Image-scraping code:
from bs4 import BeautifulSoup
import requests
import os
# URL of the first page of cat images
url = 'https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html'
# Directory to save the images; the r prefix keeps the string from being escape-processed
path = r"/Users/xxx/Downloads/cats/"
# Create the save directory if it does not already exist
os.makedirs(path, exist_ok=True)
# Collect the URLs of all cat-image list pages
def allpage():
    all_url = []
    # Loop over pages 1 through 20
    for i in range(1, 21):
        # Swap the page number into the address; url[-6] is the page digit
        # (the '1' in '1.html'), so this relies on '1' appearing only once in the URL
        each_url = url.replace(url[-6], str(i))
        # Add each generated URL to the all_url list
        all_url.append(each_url)
    # Return all collected page addresses
    return all_url
# Entry point
if __name__ == '__main__':
    # Call allpage() to collect every page URL
    img_url = allpage()
    cookies = {
        "t": "c41f9a1b34c87907a0df5375c5f0801b",
        "r": "8507",
        "Hm_lvt_4c65a21638f96d83db9e42a8df2772a9": "1703583082",
        "Hm_lpvt_4c65a21638f96d83db9e42a8df2772a9": "1703583652"
    }
    headers = {
        "authority": "www.huiyi8.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,ak;q=0.7,es;q=0.6",
        "cache-control": "max-age=0",
        "referer": "https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html",
        "sec-ch-ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Google Chrome\";v=\"120\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    for url in img_url:
        # Fetch the page source
        requ = requests.get(url, cookies=cookies, headers=headers)
        # Let requests detect the real encoding so Chinese text decodes correctly
        requ.encoding = requ.apparent_encoding
        req = requ.text
        # print(req)  # uncomment to inspect the raw HTML
        html = BeautifulSoup(req, 'lxml')
        # List that will hold the matching <img> tags
        img_urls = []
        # Walk every <img> tag in the page
        for img in html.find_all('img'):
            # Keep only src values that start with http and end with jpg
            src = img.get('src', '')
            if src.startswith('http') and src.endswith('jpg'):
                # Add the matching <img> tag to the img_urls list
                img_urls.append(img)
        # Download every matched image
        for k in img_urls:
            # Image URL
            img = k.get('src')
            # Use the alt text as the file name; str() matters because alt may be None
            name = str(k.get('alt'))
            # Build the full file path
            file_name = path + name + '.jpg'
            # Fetch the image bytes and write them to disk
            img_data = requests.get(url=img, headers=headers, cookies=cookies).content
            with open(file_name, 'wb') as f:
                f.write(img_data)
            # Log each downloaded image
            print(img, file_name)
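One caveat: the alt text used as the file name can be empty, duplicated, or contain characters such as '/' that are illegal in paths, in which case writes fail or images overwrite each other. A small helper along these lines makes the naming step safer (a hypothetical sketch, not part of the original script; safe_name is a name introduced here for illustration):

import re

def safe_name(alt, index):
    # Replace path-hostile characters in the alt text with underscores
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', alt or '').strip()
    # Fall back to 'cat' when alt is empty, and append the index so
    # identical alt texts do not overwrite each other
    return f"{cleaned or 'cat'}_{index}"

With for i, k in enumerate(img_urls): in the download loop, the naming line then becomes file_name = path + safe_name(k.get('alt'), i) + '.jpg'.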