背景说明
1、客户线下IDC机房自建minio服务器
3、需要将文件同步到oss
3、minio每天有新增和删除动作,因此同步到oss的时候也需要同步新增文件和删除动作
4、流量切换停机窗口1小时
迁移示意图
迁移脚本
迁移脚本实现以下功能
1、多进程限速上传
2、小文件全量迁移(增量)
3、大文件分片迁移(增量)
4、统计大文件信息
5、统计minio、oss文件数量
使用脚本
1、如果是自动更新直接运行auto.py脚本
2、如果要手动加参数执行minio_upload_oss.py脚本
脚本详情
auto.py
import time
import configparser
from multiprocessing import Process, Manager
from minio_client import MyMinioClient
from oss_clent import MyOssClient
import traceback
from minio_upload_oss import *
def main():
    """Run the MinIO -> OSS sync loop forever.

    Each round performs three phases — delete redundant OSS files, upload
    small files, upload big files — then sleeps 10 minutes.  Every phase is
    wrapped in its own try/except so one failing phase cannot stop the
    others or kill the loop.
    """
    config = configparser.ConfigParser()
    config.read('minio_config.ini')
    while True:
        # 1. Delete objects that exist on OSS but no longer exist on MinIO.
        try:
            delete_oss_redundant_files(config)
        except Exception as e:
            print(f"Error occurred: {e}")
            traceback.print_exc()
        # 2. Incremental upload of small files.
        try:
            all(config)
        except Exception as e:
            print(f"Error occurred: {e}")
            traceback.print_exc()
        # 3. Incremental multipart upload of big files.
        try:
            big_file_upload(config)
        except Exception as e:
            print(f"Error occurred: {e}")
            traceback.print_exc()
        # Sleep 10 minutes before the next round.
        print("休眠中.....")
        time.sleep(600)


if __name__ == "__main__":
    main()
minio_config.ini
[MinIO]
endpoint = xx.xxx.xxx.xxx:9000
access_key = XXXXXXXXXXx
secret_key = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
secure = False
file_threshold = 300M
[OSS]
endpoint = http://oss-cn-qingdao.aliyuncs.com
access_key = xxxxxxxx
secret_key = xxxxxxxxxxxxxx
bucket_name = xxxxxxxxx
max_processes = 10
preferred_size = 5M
limit_speed = 600K
multipart_limit_speed = 4M
chunk_size = 32K
oss_clent.py
from singleton import singleton
from oss2 import Auth, Bucket
@singleton
class MyOssClient:
    """Singleton holder for a lazily-created, health-checked OSS Bucket client."""

    # Cached oss2.Bucket connection (None until first successful create).
    counnt = None

    def create_connection(self, config, max_retries=3):
        """Return a working ``oss2.Bucket``, retrying up to ``max_retries`` times.

        The cached connection is probed with ``get_bucket_info`` on every call
        and rebuilt when the probe fails.  Returns ``None`` when every attempt
        failed, so callers must handle that case.
        """
        retries = 0
        while retries < max_retries:
            if self.counnt is None:
                print("创建oss连接")
                try:
                    # Build the OSS client from the [OSS] config section.
                    auth = Auth(config['OSS']['access_key'], config['OSS']['secret_key'])
                    endpoint = config['OSS']['endpoint']
                    self.counnt = Bucket(auth, endpoint, config['OSS']['bucket_name'], connect_timeout=600)
                except Exception as e:
                    print(f"创建 OSS 连接失败,第 {retries + 1} 次尝试: {e}")
                    retries += 1
                    continue
            try:
                # Health check: any failure forces the connection to be rebuilt.
                self.counnt.get_bucket_info()
                return self.counnt
            except Exception as e:
                print(f"OSS 连接异常,第 {retries + 1} 次尝试重新连接: {e}")
                self.counnt = None  # drop the bad connection so the next pass recreates it
                retries += 1
        print("达到最大重试次数,无法建立 OSS 连接")
        # Bug fix: make the failure result explicit instead of falling off the end.
        return None
singleton.py
from functools import wraps
def singleton(cls):
    """Class decorator: every call to the decorated class yields one shared instance."""
    _instances = {}

    @wraps(cls)
    def _get_instance(*args, **kwargs):
        # Instantiate lazily on first use, then always hand back the same object.
        if cls not in _instances:
            _instances[cls] = cls(*args, **kwargs)
        return _instances[cls]

    return _get_instance
minio_client.py
from singleton import singleton
import time
from minio import Minio
@singleton
class MyMinioClient:
    """Singleton holder for a lazily-created MinIO client."""

    # Cached Minio connection (None until first use).
    counnt = None

    def create_connection(self, config):
        """Return the shared Minio client, creating it on first call."""
        if self.counnt is not None:
            return self.counnt
        print("创建minio连接")
        section = config['MinIO']
        self.counnt = Minio(
            section['endpoint'],
            access_key=section['access_key'],
            secret_key=section['secret_key'],
            secure=section.getboolean('secure')
        )
        return self.counnt
minio_upload_oss.py
import logging
from minio import Minio
import configparser
import argparse
from oss2 import Auth, Bucket, SizedFileAdapter, determine_part_size, ObjectIterator
import time
import oss2
import traceback
from oss2.models import OSS_TRAFFIC_LIMIT
from multiprocessing import Pool, Manager, Process
from minio_client import MyMinioClient
from oss_clent import MyOssClient
from multiprocessing import current_process # 导入 current_process 函数
# Configure logging: worker-process errors are appended to process_errors.log.
logging.basicConfig(
filename='process_errors.log',
level=logging.ERROR,
format='%(asctime)s - %(levelname)s - %(message)s'
)
def conversion(speed_limit):
    """Convert a human-readable size string to a byte count.

    Accepts values like '600K', '5M', '2G' or a bare number ('123').
    Bug fix / generalization: suffix matching is now case-insensitive and
    surrounding whitespace is tolerated ('32k' used to raise ValueError).

    Raises ValueError for non-numeric input.
    """
    value = speed_limit.strip()
    unit = value[-1:].upper()
    multipliers = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}
    if unit in multipliers:
        return int(value[:-1]) * multipliers[unit]
    return int(value)
def get_oss_files(oss_client):
    """Return the set of all object keys in the OSS bucket (empty set on error)."""
    try:
        return {obj.key for obj in ObjectIterator(oss_client)}
    except Exception as e:
        print(f"获取OSS文件列表时发生错误: {str(e)}")
        return set()
def get_incremental_files(minio_client, oss_files, bucket_name, config, type='small', folder_path=''):
    """List MinIO objects under ``folder_path`` that are not yet present on OSS.

    ``type`` selects the size class: 'small' keeps files whose size is at most
    the configured [MinIO] file_threshold, 'big' keeps files above it.  Keys
    are formatted as '<bucket>/<object_name>'.  Returns an empty set on error.
    """
    try:
        threshold = conversion(config['MinIO']['file_threshold'])
        candidates = set()
        objects = minio_client.list_objects(bucket_name, prefix=folder_path, recursive=True)
        for obj in objects:
            is_small = obj.size <= threshold
            # Keep the object only when it matches the requested size class.
            if (type == 'small' and is_small) or (type == 'big' and not is_small):
                candidates.add(f"{bucket_name}/{obj.object_name}")
        return candidates - oss_files
    except Exception as e:
        print(f"计算增量文件时发生错误: {str(e)}")
        return set()
def multipart_upload(minio_client, oss_client, obj_path, config):
    """Stream one large object from MinIO to OSS via multipart upload.

    ``obj_path`` is '<bucket>/<object_name>'.  Upload speed is capped with
    the OSS traffic-limit header; if any part fails the multipart upload is
    aborted so no orphan parts are left on OSS.  Errors are printed, not
    raised to the caller.
    """
    # Bug fix: initialize so the finally block never hits an unbound name
    # when stat_object/get_object fails before the stream is opened.
    response = None
    try:
        minio_info = obj_path.split('/', 1)
        bucket_name = minio_info[0]
        minio_page = minio_info[-1]
        print(minio_info)
        # Object metadata; the size drives the part-size calculation.
        obj_stat = minio_client.stat_object(bucket_name, minio_page)
        file_size = obj_stat.size
        file_size_mb = file_size / (1024 * 1024)
        print(f"文件 {obj_path} 的大小为 {file_size_mb:.2f} MB")
        response = minio_client.get_object(bucket_name, minio_page)
        print(obj_path + "下载完成")
        start_time = time.time()
        # Open the multipart upload session.
        upload_id = oss_client.init_multipart_upload(obj_path).upload_id
        print(f"初始化分片上传,upload_id: {upload_id}")
        parts = []
        part_number = 1
        preferred_size = conversion(config['OSS']['preferred_size'])  # desired part size
        part_size = determine_part_size(file_size, preferred_size=preferred_size)
        # The OSS traffic limit header is expressed in bit/s, hence the *8.
        limit_speed = conversion(config['OSS']['multipart_limit_speed']) * 8
        try:
            while True:
                chunk = response.read(part_size)
                if not chunk:
                    break
                # Upload one part with the per-request traffic-limit header.
                headers = {OSS_TRAFFIC_LIMIT: str(limit_speed)}
                result = oss_client.upload_part(obj_path, upload_id, part_number, chunk, headers=headers)
                parts.append(oss2.models.PartInfo(part_number, result.etag))
                part_number += 1
            # Commit all uploaded parts.
            oss_client.complete_multipart_upload(obj_path, upload_id, parts)
            print(f"文件 {obj_path} 分片上传完成")
        except Exception as e:
            print(f"分片上传文件 {obj_path} 时发生错误,取消分片上传: {str(e)}")
            oss_client.abort_multipart_upload(obj_path, upload_id)
            raise
        upload_duration = time.time() - start_time
        print(f"文件 {obj_path} 上传成功,耗时: {upload_duration:.2f} 秒")
    except Exception as e:
        print(f"上传文件 {obj_path} 时发生错误: {str(e)}")
        print(traceback.format_exc())
    finally:
        # Only close the MinIO stream when it was actually opened.
        if response is not None:
            response.close()
            response.release_conn()
def minio_to_oss_stream_upload(task_queue, config):
    """Worker process: drain ``task_queue`` and stream each object MinIO -> OSS.

    Stops when the queue is empty or an 'end' sentinel is received.  Each
    worker's upload speed is capped via the OSS traffic-limit header.
    Failures are logged and the worker moves on to the next task.
    """
    print(f"当前进程号: {current_process().pid}")
    # The OSS traffic-limit header is expressed in bit/s, hence the *8.
    limit_speed = conversion(config['OSS']['limit_speed']) * 8
    # Size of each chunk read from the MinIO stream.
    chunk_size = conversion(config['OSS']['chunk_size'])
    headers = dict()
    headers[OSS_TRAFFIC_LIMIT] = str(limit_speed)
    try:
        while not task_queue.empty():
            try:
                obj_path = task_queue.get_nowait()
                if obj_path == 'end':  # termination sentinel from the producer
                    return
            except Exception as e:
                logging.error(f"从任务队列获取任务时出错: {str(e)}")
                break
            # Bug fix: reset per iteration.  The old `'response' in locals()`
            # check could close a stale, already-closed stream from a previous
            # iteration when get_object failed on the current one.
            response = None
            try:
                # Singleton clients: cheap to "create" on every iteration.
                minio_client = MyMinioClient().create_connection(config)
                oss_client = MyOssClient().create_connection(config)
                minio_info = obj_path.split('/', 1)
                bucket_name = minio_info[0]
                minio_page = minio_info[-1]
                print(minio_info)
                response = minio_client.get_object(bucket_name, minio_page)

                def generate_chunks():
                    # Lazily re-chunk the MinIO stream for oss2's streaming put.
                    for chunk in response.stream(chunk_size):
                        yield chunk

                result = oss_client.put_object(
                    obj_path,
                    generate_chunks(),
                    headers=headers
                )
                print('http response status:', result.status)
            except Exception as e:
                logging.error(f"上传文件 {obj_path} 时出错: {str(e)}\n{traceback.format_exc()}")
            finally:
                if response is not None:
                    response.close()
                    response.release_conn()
        return
    except Exception as e:
        logging.error(f"进程 {current_process().pid} 执行出错: {str(e)}\n{traceback.format_exc()}")
        return
def get_buckets_info(minio_client):
    """Return the names of all MinIO buckets; empty list when listing fails."""
    try:
        return [bucket.name for bucket in minio_client.list_buckets()]
    except Exception:
        print(traceback.format_exc())
        print("创建minio连接报错")
        return []
def get_file_counts(config):
    """Print, per MinIO bucket, its object count and how many are missing on OSS.

    Results are printed rather than returned; returns ``{}`` only on error.
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)
        # Compare every bucket's contents against the same-named OSS prefix.
        for bucket in get_buckets_info(minio_client):
            objects = minio_client.list_objects(bucket, recursive=True)
            if not objects:
                continue
            minio_set = {'{}/{}'.format(bucket, obj.object_name) for obj in objects}
            path = bucket + '/'
            print(path)
            oss_set = {obj.key for obj in ObjectIterator(oss_client, prefix=path)}
            diff_set = len(minio_set - oss_set)
            print(f"桶 {bucket} 中文件数量: {len(minio_set)}, 未上传数量: {diff_set}")
    except Exception as e:
        print(f"获取文件数量时发生错误: {str(e)}")
        print(traceback.format_exc())
        return {}
def path(config, path):
    """Incrementally migrate small files under one MinIO path to OSS.

    ``path`` must be '<bucket>/<folder>'.  Candidate files are queued and
    uploaded by a pool of rate-limited worker processes.
    """
    print("根据路径迁移")
    minio_client = MyMinioClient().create_connection(config)
    oss_client = MyOssClient().create_connection(config)
    max_processes = config['OSS'].getint('max_processes')
    try:
        # Split '<bucket>/<folder>' into its two components.
        parts = path.split('/', 1)
        print(parts)
        if len(parts) < 2:
            print("输入路径格式不正确,需要包含桶名和文件夹路径")
            return
        bucket_name, folder_path = parts
        # Files present on MinIO under the path but absent from OSS.
        oss_files = get_oss_files(oss_client)
        incremental_files = get_incremental_files(minio_client, oss_files, bucket_name, config, 'small', folder_path)
        if not incremental_files:
            print("没有需要上传的文件")
            return
        print(f"发现 {len(incremental_files)} 个需要上传的文件")
        # Fill the shared queue, then append 'end' sentinels for the workers.
        task_queue = Manager().Queue()
        for obj_path in incremental_files:
            task_queue.put(obj_path)
        for _ in range(20):
            print("写入结束")
            task_queue.put("end")
        workers = []
        for idx in range(max_processes):
            # Stagger worker start-up by one second.
            if idx > 0:
                time.sleep(1)
            worker = Process(target=minio_to_oss_stream_upload, args=(task_queue, config))
            worker.start()
            workers.append(worker)
            print(f"已启动进程 {idx + 1}/{max_processes}")
        for worker in workers:
            worker.join()
    except Exception as e:
        print(e)
        print(traceback.format_exc())
def all_incremental_files(task_queue, config):
    """Producer: enqueue every small file missing on OSS across all MinIO
    buckets, then enqueue 20 'end' sentinels so the workers terminate.

    Errors are logged; the function never raises.
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)
        oss_files = get_oss_files(oss_client)
        for bucket_name in get_buckets_info(minio_client):
            print(bucket_name)
            pending = get_incremental_files(minio_client, oss_files, bucket_name, config, 'small')
            for obj_path in pending:
                task_queue.put(obj_path)
        for _ in range(20):
            print("写入结束")
            task_queue.put("end")
    except Exception as e:
        logging.error(f"获取增量文件列表时出错: {str(e)}\n{traceback.format_exc()}")
def all(config):
    """Full incremental migration of small files.

    A producer process fills the task queue first (it is joined before any
    worker starts), then ``max_processes`` rate-limited workers upload in
    parallel until the queue drains.
    """
    print("全量迁移")
    max_processes = config['OSS'].getint('max_processes')
    # Shared queue carrying object paths plus 'end' sentinels.
    task_queue = Manager().Queue()
    producer = Process(target=all_incremental_files, args=(task_queue, config))
    producer.start()
    producer.join()
    workers = []
    for idx in range(max_processes):
        # Stagger worker start-up by one second.
        if idx > 0:
            time.sleep(1)
        worker = Process(target=minio_to_oss_stream_upload, args=(task_queue, config))
        worker.start()
        workers.append(worker)
        print(f"已启动进程 {idx + 1}/{max_processes}")
    for worker in workers:
        worker.join()
def big_file_upload(config, path=None):
    """Incrementally migrate large files (above file_threshold) via multipart upload.

    ``path`` limits the scan to '<bucket>/<folder>'; when None, every MinIO
    bucket is scanned.  Uploads run sequentially; each file is internally
    chunked by ``multipart_upload``.
    """
    print("大文件上传")
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)
        oss_files = get_oss_files(oss_client)
        # Decide between scanning one path and scanning every bucket.
        if path is not None:
            parts = path.split('/', 1)
            print(parts)
            # Robustness fix: a bare bucket name (no '/') now scans the whole
            # bucket instead of crashing on tuple unpacking.
            bucket_name = parts[0]
            folder_path = parts[1] if len(parts) > 1 else ''
            incremental_files = get_incremental_files(minio_client, oss_files, bucket_name, config, "big", folder_path)
        else:
            incremental_files = set()
            for bucket_name in get_buckets_info(minio_client):
                print(bucket_name)
                bucket_files = get_incremental_files(minio_client, oss_files, bucket_name, config, "big", '')
                incremental_files.update(bucket_files)
        if not incremental_files:
            print("没有需要上传的文件")
            return
        print(f"{len(incremental_files)}个需要上传的文件")
        for obj_path in incremental_files:
            multipart_upload(minio_client, oss_client, obj_path, config)
    except Exception as e:
        print(f"增量上传过程中发生错误: {str(e)}")
        print(traceback.format_exc())
def count_large_files(config):
    """Count MinIO files above the configured size threshold, per bucket.

    Returns ``{"large_file_counts": {bucket: count}, "total_large_files": n}``,
    or ``{}`` on error.
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        # Consistency fix: use the configurable [MinIO] file_threshold (the
        # same cut-off the small/big migration paths use) instead of a
        # hard-coded 300MB that could silently disagree with the config.
        size_threshold = conversion(config['MinIO']['file_threshold'])
        large_file_counts = {}
        total_large_files = 0
        for bucket in get_buckets_info(minio_client):
            count = 0
            for obj in minio_client.list_objects(bucket, recursive=True):
                if obj.size > size_threshold:
                    count += 1
                    total_large_files += 1
            large_file_counts[bucket] = count
        return {
            "large_file_counts": large_file_counts,
            "total_large_files": total_large_files
        }
    except Exception as e:
        print(f"统计大文件数量时发生错误: {str(e)}")
        print(traceback.format_exc())
        return {}
def delete_oss_redundant_files(config):
    """Delete every OSS object that no longer exists on MinIO.

    Objects are compared by '<bucket>/<object_name>' key.  Returns the number
    of objects deleted (0 on error).
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)
        # Snapshot of everything currently stored on MinIO.
        minio_files = set()
        for bucket in get_buckets_info(minio_client):
            for obj in minio_client.list_objects(bucket, recursive=True):
                minio_files.add(f"{bucket}/{obj.object_name}")
        # Anything on OSS but absent from MinIO is redundant.
        redundant_files = get_oss_files(oss_client) - minio_files
        deleted_count = 0
        for file in redundant_files:
            try:
                oss_client.delete_object(file)
                print(f"已删除文件: {file}")
                deleted_count += 1
            except Exception as e:
                # Best effort: keep deleting the rest even when one fails.
                print(f"删除文件 {file} 时出错: {str(e)}")
                print(traceback.format_exc())
        print(f"共删除 {deleted_count} 个多余文件")
        return deleted_count
    except Exception as e:
        print(f"删除多余文件时发生错误: {str(e)}")
        print(traceback.format_exc())
        return 0
if __name__ == "__main__":
config_path = 'minio_config.ini'
config = configparser.ConfigParser()
config.read(config_path)
parser = argparse.ArgumentParser(description="输入参数")
parser.add_argument("--type", type=str, required=True, help="同步类型")
parser.add_argument("--path", type=str, required=False, help="指定路径")
args = parser.parse_args()
if args.type == "all":
# 全量迁移
# nohup python3.10 minio_upload_oss.py --type=all > minio_reader.log 2>&1 &
all(config)
elif args.type == "folder":
# 指定路径迁移
path(config,args.path)
# python minio_upload_oss.py --type=folder --path='test01/2025/05'
# python3.10 minio_upload_oss.py --type=folder --path='hicx/'
# nohup python3.10 minio_upload_oss.py --type=folder --path='hiedu/' > minio_reader.log 2>&1 &
elif args.type == "big":
# 大文件迁移
big_file_upload(config,args.path)
# nohup python3.10 minio_upload_oss.py --type=big --path='higheredu-college/' > minio_reader.log 2>&1 &
# nohup python3.10 minio_upload_oss.py --type=big > minio_reader.log 2>&1 &
elif args.type == "count":
# 获取文件数量
# python minio_upload_oss.py --type=count
get_file_counts(config)
elif args.type == "large_count":
# python minio_upload_oss.py --type=large_count
# 统计大文件数量
result = count_large_files(config)
print(result)
elif args.type == "delete":
# python minio_upload_oss.py --type=large_count
# 统计大文件数量
result = delete_oss_redundant_files(config)
print(result)
# pgrep -f "python3.10 minio_upload_oss.py" | xargs kill
# {'minio_counts': {'hicx': 42491, 'hiedu': 362542, 'higheredu': 487, 'higheredu-college': 3292, 'hihelper': 489, 'hios': 0, 'hitalk-ai': 2279, 'hiuser': 790, 'meeting': 0, 'test': 3, 'test1': 2}, 'total_minio_count': 412375, 'oss_count': 408510}