前情提要
前文了解了python连接redshift的api操作方式,如果有大批量的插入操作,还是使用copy命令,从S3进行数据导入。这样会节省大量的时间,并且S3也可以看作数仓的ODS层。下面来聊一聊,S3的API,boto3的使用方式。
- 第一步获取S3的连接 连接S3有两个官方提供的方法
boto3.client
boto3.resource
# 两个方法的区别在于 client 会返回字典信息,如果想要获取详细的内容,需要使用get方法获取key对应的值
# 而resource会将S3返回的内容做一层封装,想要获取相应的内容,直接调用属性或者方法即可得到
# 看下来是resource方法更加的高级易用,符合面向对象的特点,由于笔者看到的是采用
# client获取连接的API,所以下文主要介绍在client获取连接基础上实现的一些功能,同样的resource应该可以实现
- 第二步就可以具体操作S3了 操作的方式差不多就是增删改查、权限设置,此外API中还提到了一种生成链接(presigned URL)供外部请求的方法。
因为S3中是没有文件夹的概念的,所有的内容都当作对象来使用和查看,所以操作文件的时候注意文件的命名,/结尾会默认为一个可以存储文件的节点(文件夹)
下面是详细代码
import json
import boto3
from botocore.exceptions import ClientError
import logging
import os
import requests
class S3:
    """Convenience wrapper around a low-level boto3 S3 client.

    Covers the common operations used in this article: listing buckets and
    objects, creating buckets, uploading/downloading files, generating
    presigned URLs / presigned POSTs, and reading/writing bucket policies.
    Every method takes the client as an argument so one connection can be
    reused across calls.
    """

    def __init__(self):
        pass

    def conn_s3(self):
        """Create and return a low-level S3 client.

        boto3 offers two entry points: ``client`` (returns plain dicts you
        index with ``get``) and ``resource`` (an object-oriented wrapper).
        This class is built on ``client``.

        FIX: the original passed ``aws_access_key_id=''`` /
        ``aws_secret_access_key=''`` / ``region_name=''`` explicitly; empty
        strings are treated as *provided* credentials and break
        authentication. Omitting them lets boto3 resolve credentials through
        its default chain (env vars, ~/.aws/credentials, instance profile).
        """
        return boto3.client(service_name='s3')

    def showAllBucket(self, s3):
        """Print the name of every bucket visible to *s3*.

        (camelCase name kept for backward compatibility with callers.)
        """
        response = s3.list_buckets()
        print('Existing buckets:')
        for bucket in response['Buckets']:
            print(f' {bucket["Name"]}')

    def list_bucket_keys(self, s3, bucket_name):
        """Print and return the keys of every object in *bucket_name*.

        :param s3: boto3 S3 client
        :param bucket_name: bucket to inspect
        :return: list of object keys; empty list when the bucket is empty
        """
        # 'Contents' is absent from the response for an empty bucket; the
        # original iterated None and crashed -- fall back to [] instead.
        contents = s3.list_objects(Bucket=bucket_name).get('Contents') or []
        keys = []
        for entry in contents:
            key = entry.get('Key')
            keys.append(key)
            print(key)
        return keys

    def create_bucket(self, bucket_name, s3, region=None):
        """Create an S3 bucket in a specified region.

        If a region is not specified, the bucket is created in the S3 default
        region (us-east-1) via the supplied client.

        :param bucket_name: Bucket to create
        :param s3: boto3 S3 client used for the default-region case
        :param region: String region to create bucket in, e.g., 'us-west-2'
        :return: True if bucket created, else False
        """
        try:
            if region is None:
                s3.create_bucket(Bucket=bucket_name)
            else:
                # SECURITY FIX: the original hard-coded a long-lived AWS
                # access key and secret here. Build the regional client from
                # the default credential chain instead; never embed keys in
                # source code.
                regional = boto3.client(service_name='s3', region_name=region)
                regional.create_bucket(
                    Bucket=bucket_name,
                    CreateBucketConfiguration={'LocationConstraint': region})
        except ClientError as e:
            logging.error(e)
            return False
        return True

    def upload_file(self, s3, file_name, bucket, object_name=None):
        """Upload a local file to an S3 bucket.

        :param s3: boto3 S3 client
        :param file_name: File to upload
        :param bucket: Bucket to upload to
        :param object_name: S3 object name. If not specified, the base name
            of *file_name* is used
        :return: True if file was uploaded, else False
        """
        if object_name is None:
            object_name = os.path.basename(file_name)
        try:
            # upload_file returns None; the original bound it to an unused
            # local, dropped here.
            s3.upload_file(file_name, bucket, object_name)
            print('文件上传成功')
        except ClientError as e:
            logging.error(e)
            return False
        return True

    def download_file(self, s3, bucket_name, object, file):
        """Download *object* from *bucket_name* into local path *file*.

        Best-effort: failures are printed, not re-raised.
        NOTE(review): the *object* parameter shadows the builtin; the name is
        kept because keyword callers depend on it.
        """
        try:
            s3.download_file(Bucket=bucket_name, Key=object, Filename=file)
            print('下载成功')
        except Exception as e:
            print('下载失败')
            print(e)

    def create_presigned_url(self, s3, bucket_name, object_name, expiration=3600):
        """Generate a presigned URL to share an S3 object.

        :param bucket_name: string
        :param object_name: string
        :param expiration: Time in seconds for the presigned URL to remain valid
        :return: Presigned URL as string. If error, returns None.
        """
        try:
            response = s3.generate_presigned_url(
                'get_object',
                Params={'Bucket': bucket_name, 'Key': object_name},
                ExpiresIn=expiration)
        except ClientError as e:
            logging.error(e)
            return None
        # The response contains the presigned URL.
        return response

    def create_presigned_url_expanded(self, s3, client_method_name,
                                      method_parameters=None,
                                      expiration=3600, http_method='GET'):
        """Generate a presigned URL to invoke an arbitrary S3.Client method.

        Not all the client methods provided in the AWS Python SDK are
        supported.

        :param client_method_name: Name of the S3.Client method, e.g., 'list_buckets'
        :param method_parameters: Dictionary of parameters to send to the method
        :param expiration: Time in seconds for the presigned URL to remain valid
        :param http_method: HTTP method to use (GET, etc.)
        :return: Presigned URL as string. If error, returns None.
        """
        try:
            response = s3.generate_presigned_url(ClientMethod=client_method_name,
                                                 Params=method_parameters,
                                                 ExpiresIn=expiration,
                                                 HttpMethod=http_method)
        except ClientError as e:
            logging.error(e)
            return None
        return response

    def create_presigned_post(self, s3, bucket_name, object_name,
                              fields=None, conditions=None, expiration=3600):
        """Generate a presigned S3 POST request to upload a file.

        :param bucket_name: string
        :param object_name: string
        :param fields: Dictionary of prefilled form fields
        :param conditions: List of conditions to include in the policy
        :param expiration: Time in seconds for the presigned URL to remain valid
        :return: Dictionary with keys ``url`` (URL to post to) and ``fields``
            (form fields/values to submit with the POST); None on error.
        """
        try:
            response = s3.generate_presigned_post(bucket_name,
                                                  object_name,
                                                  Fields=fields,
                                                  Conditions=conditions,
                                                  ExpiresIn=expiration)
        except ClientError as e:
            logging.error(e)
            return None
        return response

    def generate_post_demo(self, s3):
        """Demo: obtain a presigned POST and upload a local file through it."""
        object_name = 'mindao2.csv'
        response = self.create_presigned_post(s3, 'myawsbucket98765', object_name)
        # BUG FIX: guard *before* dereferencing -- the original printed
        # response['url'] first, so a failed presign raised TypeError instead
        # of reaching this exit.
        if response is None:
            exit(1)
        print(response['url'], response['fields'])
        # Any other program holding the presigned URL can upload the file.
        with open(object_name, 'rb') as f:
            files = {'file': (object_name, f)}
            http_response = requests.post(response['url'],
                                          data=response['fields'], files=files)
        # A successful upload returns HTTP status code 204.
        logging.info(f'File upload HTTP status code: {http_response.status_code}')

    def get_bucket_policy(self, s3, bucket):
        """Return the policy document (JSON string) attached to *bucket*."""
        result = s3.get_bucket_policy(Bucket=bucket)
        return result['Policy']

    def put_bucket_policy(self, s3, bucket, policy):
        """Attach *policy* (a JSON string) to *bucket*."""
        s3.put_bucket_policy(Bucket=bucket, Policy=policy)

    def put_bucket_policy_demo(self):
        """Demo: grant anonymous read (s3:GetObject) on every object of the
        demo bucket."""
        bucket_name = 'myawsbucket98765'
        bucket_policy = {
            'Version': '2012-10-17',
            'Statement': [{
                'Sid': 'AddPerm',
                'Effect': 'Allow',
                'Principal': '*',
                'Action': ['s3:GetObject'],
                'Resource': f'arn:aws:s3:::{bucket_name}/*'
            }]
        }
        self.put_bucket_policy(self.conn_s3(), bucket=bucket_name,
                               policy=json.dumps(bucket_policy))
if __name__ == '__main__':
    # Build the helper and a shared client, then demonstrate a couple of
    # read-only operations; the commented calls below are further examples.
    helper = S3()
    client = helper.conn_s3()
    # print(helper.list_bucket_keys(client, bucket_name='myawsbucket98765'))
    helper.showAllBucket(client)
    # url = helper.create_presigned_url(client, 'myawsbucket98765', 'favorite-pics/2006年第三季度業績報告.pdf', expiration=3600)
    # url2 = helper.create_presigned_url_expanded(client, 'showAllBucket')
    # print(url2)
    # helper.upload_file(client, '2006年第三季度業績報告.pdf', 'myawsbucket98765', 'favorite-pics/2006年第三季度業績報告.pdf')
    # helper.download_file(client, bucket_name='myawsbucket98765', object='favorite-pics/2006年第三季度業績報告.pdf', file='down_pdf.pdf')
    # helper.generate_post_demo(client)
    policy_doc = helper.get_bucket_policy(client, 'myawsbucket98765')
    print(policy_doc)
    # helper.put_bucket_policy_demo()
以下是一些可以参考的文档
Available services — Boto3 Docs 1.24.2 documentation (amazonaws.com) An Introduction to boto’s S3 interface — boto v2.49.0
以下是一些参考的文章
AWS S3 - python 从S3下载数据到本地 - 掘金 (juejin.cn)
boto3 - 使用Python访问AWS S3 (02) - 掘金 (juejin.cn)
使用boto3批量上传图片到S3以及工作中两个小总结 - 掘金 (juejin.cn)
boto3连接aws的s3及注意事项 - 掘金 (juejin.cn)