Python之爬取图片库,python学习的小练习 写了一个爬取图片的爬虫,爬取pixabay的图片,方便之后机器学习使用,
有几个注意点
- 1.需要关闭代理(科学上网工具),不然可能出现403等问题
- 2.请求的url,https全部改成http,不然会出现证书问题,这个困扰了很久
- 3.charles等抓包也有可能影响,比如网页请求的话会出现"你的连接不是私密连接",如果高级里面没有显示"继续访问",那就直接敲击thisisunsafe就启用隐藏功能了
- 4.使用:在main方法对关键字和页数修改,运行就可以,如果是mac的话,改一下下载地址
- 5.写的比较简单,没做多线程优化,也没有使用框架
具体代码如下
import traceback
import requests
from bs4 import BeautifulSoup
import re
def http_get(url, header):
    """Fetch *url* with the given request headers and return the body as text.

    The response is decoded as UTF-8. On any request failure an empty
    string is returned, so callers can detect errors with a simple
    falsy / ``== ""`` check.
    """
    print("开始请求:" + url)
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        # Force UTF-8 instead of apparent_encoding: the target site is
        # UTF-8 and charset detection is comparatively slow.
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        traceback.print_exc()
        print("异常请求:"+str(url)+"\n"+str(header))
        # Return "" (not a sentinel message): every caller checks the
        # result against the empty string, so the old "请求异常" value
        # made the error path undetectable.
        return ""
# 请求前多少页的图片
def get_range_img(keyword, page_range):
    """Collect detail-page URLs from the first *page_range* result pages.

    Pages whose fetch fails contribute nothing (instead of crashing on
    ``extend(None)`` as the previous version did).
    """
    urls = []
    for page in range(1, page_range + 1):
        page_urls = get_img_website_url_list(keyword, page)
        # Guard: a failed fetch may yield None/empty instead of a list.
        if page_urls:
            urls.extend(page_urls)
    return urls
# 请求指定页的图片网站列表
def get_img_website_url_list(keyword, index):
    """Return the image detail-page URLs found on result page *index*.

    Example search URL: "https://pixabay.com/zh/images/search/%E7%8B%97/?pagi=1"
    Always returns a list; an empty list when the page cannot be fetched
    (the previous version returned None, crashing callers that iterate).
    """
    res = http_get(search_url + keyword + "?pagi=" + str(index), kv)
    if not res:
        return []
    soup = BeautifulSoup(res, "html.parser")
    urls = []
    for tag in soup.find_all('a'):
        # .get() returns None for <a> tags without an href — no need for
        # the old bare except around attribute access.
        href = tag.get('href')
        if href and re.match(r'^https://pixabay.com/.*$', href) and "?pagi=" not in href:
            # Downgrade to http to sidestep local certificate problems.
            urls.append(href.replace('https:', 'http:'))
    return urls
def get_img_url_list(urls):
    """Resolve each detail-page URL in *urls* to a direct image URL.

    A page that fails to load or has no <source> element is skipped, so
    one bad page no longer aborts the whole batch (the previous version
    returned None mid-loop, discarding all progress).
    """
    img_urls = []
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        res = http_get(url, kv)
        if not res:
            continue
        soup = BeautifulSoup(res, "html.parser")
        tag = soup.find('source')
        # find() returns None when the tag is absent; guard before attrs.
        if tag is None or 'src' not in tag.attrs:
            continue
        img_urls.append(tag.attrs['src'].replace('https:', 'http:'))
        print("正在获取url:" + str(i) + "/" + str(total))
    return img_urls
def download_img(urls):
    """Download every image URL in *urls* into a fixed local directory.

    The file name is the last path segment of each URL. A failed download
    is logged and skipped instead of aborting the run. Returns "" to keep
    the original (unused) return value.
    """
    file_path = "D:/file/"  # NOTE: change this directory on macOS/Linux
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        try:
            res = requests.get(url, timeout=30)
            res.raise_for_status()
        except requests.RequestException:
            traceback.print_exc()
            continue
        file_name = url[url.rfind('/') + 1:]
        # `with` closes the file automatically; the old explicit
        # f.close() inside the with-block was redundant.
        with open(file_path + file_name, "wb") as f:
            f.write(res.content)
        print("正在下载:" + str(i) + "/" + str(total))
    return ""
# Extract images from page source (leftover note; get_page_res was never written)
# def get_page_res():
# Minimal browser-like request headers so the site does not reject us.
kv = {
"cache-control": "max-age=0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0",
"accept-language": "zh-CN"
}
# Base search URL; plain http (not https) to avoid local certificate issues.
search_url = "http://pixabay.com/zh/images/search/"
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    # Adjust the keyword and page number here before running.
    search_keyword = "广州"
    page_index = 1
    detail_pages = get_img_website_url_list(search_keyword, page_index)
    image_urls = get_img_url_list(detail_pages)
    download_img(image_urls)
    # To crawl the first N pages in one go, use get_range_img instead:
    # page_range = 1
    # urls3 = get_range_img(search_keyword, page_range)
    # urls4 = get_img_url_list(urls3)
    # download_img(urls4)