2、minio迁移到oss

24 阅读10分钟

背景说明

1、客户线下IDC机房自建minio服务器
2、需要将文件同步到oss
3、minio每天有新增和删除动作,因此同步到oss的时候也需要同步新增文件和删除动作
4、流量切换停机窗口1小时

迁移示意图

4、Minio迁移.png

迁移脚本

迁移脚本实现以下功能

1、多进程限速上传

2、小文件全量迁移(增量)
3、大文件分片迁移(增量)
4、统计大文件信息
5、统计minio、oss文件数量

使用脚本

1、如果是自动更新直接运行auto.py脚本
2、如果要手动加参数执行minio_upload_oss.py脚本

脚本详情

auto.py

import time
import configparser
from multiprocessing import Process, Manager
from minio_client import MyMinioClient
from oss_clent import MyOssClient
import traceback
from minio_upload_oss import *
def main():
    """Endless sync loop: delete stale OSS objects, then upload small and
    big incremental files, sleeping 10 minutes between rounds.

    Each step is isolated in its own try/except so one failure does not
    abort the loop (fixes the duplicated "2、" step numbering and the
    thrice-repeated error-handling boilerplate).
    """
    config = configparser.ConfigParser()
    config.read('minio_config.ini')

    # Ordered pipeline: 1. delete redundant files, 2. small files, 3. big files.
    # `all` here is the small-file uploader from minio_upload_oss, not the builtin.
    steps = (delete_oss_redundant_files, all, big_file_upload)
    while True:
        for step in steps:
            try:
                step(config)
            except Exception as e:
                print(f"Error occurred: {e}")
                traceback.print_exc()
        # Sleep 10 minutes between rounds
        print("休眠中.....")
        time.sleep(600)

if __name__ == "__main__":
    main()

minio_config.ini

[MinIO]
endpoint = xx.xxx.xxx.xxx:9000
access_key = XXXXXXXXXXx
secret_key = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
secure = False
file_threshold = 300M

[OSS]
endpoint = http://oss-cn-qingdao.aliyuncs.com
access_key = xxxxxxxx
secret_key = xxxxxxxxxxxxxx
bucket_name = xxxxxxxxx
max_processes = 10
preferred_size = 5M
limit_speed = 600K
multipart_limit_speed = 4M
chunk_size = 32K

oss_clent.py

from singleton import singleton
from oss2 import Auth, Bucket

@singleton
class MyOssClient:
    """Singleton holder for an oss2 Bucket, with health-checked reconnects."""

    # Cached Bucket instance (legacy attribute name kept for compatibility).
    counnt = None

    def create_connection(self, config, max_retries=3):
        """Return a healthy Bucket connection, or None after max_retries failures.

        Each attempt (re)creates the Bucket if needed, then probes it with
        get_bucket_info() before handing it out.
        """
        for attempt in range(max_retries):
            if self.counnt is None:
                print("创建oss连接")
                try:
                    # Build the OSS client from the [OSS] config section.
                    auth = Auth(config['OSS']['access_key'], config['OSS']['secret_key'])
                    endpoint = config['OSS']['endpoint']
                    self.counnt = Bucket(auth, endpoint, config['OSS']['bucket_name'], connect_timeout=600)
                except Exception as e:
                    print(f"创建 OSS 连接失败,第 {attempt + 1} 次尝试: {e}")
                    continue

            try:
                # Probe the connection before returning it.
                self.counnt.get_bucket_info()
                return self.counnt
            except Exception as e:
                print(f"OSS 连接异常,第 {attempt + 1} 次尝试重新连接: {e}")
                self.counnt = None  # drop the broken connection and retry

        print("达到最大重试次数,无法建立 OSS 连接")

singleton.py

from functools import wraps
def singleton(cls):
    """Class decorator: every call to the class returns one shared instance."""
    cache = {}

    @wraps(cls)
    def wrapper(*args, **kwargs):
        # EAFP: instantiate only on the first cache miss.
        try:
            return cache[cls]
        except KeyError:
            cache[cls] = cls(*args, **kwargs)
            return cache[cls]

    return wrapper

minio_client.py

from singleton import singleton
import time
from minio import Minio
@singleton
class MyMinioClient:
    """Singleton holder for a lazily created Minio client."""

    # Cached Minio client (legacy attribute name kept for compatibility).
    counnt = None

    def create_connection(self, config):
        """Create the Minio client on first use; return the cached one after."""
        if self.counnt is not None:
            return self.counnt
        print("创建minio连接")
        section = config['MinIO']
        self.counnt = Minio(
            section['endpoint'],
            access_key=section['access_key'],
            secret_key=section['secret_key'],
            secure=section.getboolean('secure'),
        )
        return self.counnt

minio_upload_oss.py

import logging
from minio import Minio
import configparser
import argparse
from oss2 import Auth, Bucket, SizedFileAdapter, determine_part_size, ObjectIterator
import time
import oss2
import traceback
from oss2.models import OSS_TRAFFIC_LIMIT
from multiprocessing import Pool, Manager, Process
from minio_client import MyMinioClient
from oss_clent import MyOssClient
from multiprocessing import current_process  # 导入 current_process 函数

# Configure logging: worker errors go to a file so failures inside
# multiprocessing children are not lost with their stdout.
logging.basicConfig(
    filename='process_errors.log',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def conversion(speed_limit):
    """Convert a human-readable size string to a byte count.

    Accepts a plain integer string or a K/M/G suffix, now case-insensitive
    and tolerant of surrounding whitespace (the original rejected '4m').
    e.g. '600K' -> 614400, '5M' -> 5242880, '100' -> 100.
    """
    text = speed_limit.strip()
    units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3}
    suffix = text[-1:].upper()
    if suffix in units:
        return int(text[:-1]) * units[suffix]
    return int(text)


def get_oss_files(oss_client):
    """Return the set of all object keys in the OSS bucket (empty set on error)."""
    try:
        return {obj.key for obj in ObjectIterator(oss_client)}
    except Exception as e:
        print(f"获取OSS文件列表时发生错误: {str(e)}")
        return set()

def get_incremental_files(minio_client, oss_files, bucket_name, config, type='small', folder_path=''):
    """Return MinIO object paths ("bucket/key") not yet present in OSS.

    type='small' keeps files at or below the configured [MinIO] file_threshold,
    type='big' keeps files above it; any other value yields an empty set.
    """
    try:
        threshold = conversion(config['MinIO']['file_threshold'])
        objects = minio_client.list_objects(bucket_name, prefix=folder_path, recursive=True)
        if type == 'small':
            wanted = (o for o in objects if o.size <= threshold)
        elif type == 'big':
            wanted = (o for o in objects if o.size > threshold)
        else:
            wanted = iter(())
        minio_files = {f"{bucket_name}/{o.object_name}" for o in wanted}
        return minio_files - oss_files
    except Exception as e:
        print(f"计算增量文件时发生错误: {str(e)}")
        return set()
def multipart_upload(minio_client, oss_client, obj_path, config):
    """Upload one large MinIO object to OSS using multipart upload.

    obj_path is "bucket/key". Each part is throttled through the
    OSS_TRAFFIC_LIMIT header; on any part failure the multipart upload is
    aborted so no orphan parts are left behind on OSS.
    """
    # Bug fix: define `response` up front so the finally block is safe even
    # when stat_object/get_object raises (previously a NameError in finally
    # masked the real error).
    response = None
    try:
        minio_info = obj_path.split('/', 1)
        bucket_name = minio_info[0]
        minio_page = minio_info[-1]

        print(minio_info)
        # Object metadata, used for sizing the parts and for logging.
        obj_stat = minio_client.stat_object(bucket_name, minio_page)
        file_size = obj_stat.size
        file_size_mb = file_size / (1024 * 1024)
        print(f"文件 {obj_path} 的大小为 {file_size_mb:.2f} MB")
        response = minio_client.get_object(bucket_name, minio_page)
        print(obj_path + "下载完成")
        # Time the whole multipart upload.
        start_time = time.time()

        upload_id = oss_client.init_multipart_upload(obj_path).upload_id
        print(f"初始化分片上传,upload_id: {upload_id}")

        parts = []
        part_number = 1
        # Part size comes from config (not the hard-coded 10MB the old
        # comment claimed); the SDK may adjust it for the actual file size.
        preferred_size = conversion(config['OSS']['preferred_size'])
        part_size = determine_part_size(file_size, preferred_size=preferred_size)
        # x8: the OSS traffic-limit header is expressed in bits per second.
        limit_speed = conversion(config['OSS']['multipart_limit_speed']) * 8

        try:
            while True:
                chunk = response.read(part_size)
                if not chunk:
                    break
                # Upload one part with the per-request rate limit applied.
                headers = {OSS_TRAFFIC_LIMIT: str(limit_speed)}
                result = oss_client.upload_part(obj_path, upload_id, part_number, chunk, headers=headers)
                parts.append(oss2.models.PartInfo(part_number, result.etag))
                part_number += 1

            oss_client.complete_multipart_upload(obj_path, upload_id, parts)
            print(f"文件 {obj_path} 分片上传完成")
        except Exception as e:
            # Abort so OSS does not keep charging for the orphan parts.
            print(f"分片上传文件 {obj_path} 时发生错误,取消分片上传: {str(e)}")
            oss_client.abort_multipart_upload(obj_path, upload_id)
            raise

        upload_duration = time.time() - start_time
        print(f"文件 {obj_path} 上传成功,耗时: {upload_duration:.2f} 秒")

    except Exception as e:
        print(f"上传文件 {obj_path} 时发生错误: {str(e)}")
        print(traceback.format_exc())
    finally:
        # Release the MinIO HTTP connection only if the download started.
        if response is not None:
            response.close()
            response.release_conn()
def minio_to_oss_stream_upload(task_queue, config):
    """Worker process: drain "bucket/key" paths from task_queue and stream each
    MinIO object to OSS with a per-process rate limit.

    Stops on an 'end' sentinel or when the queue is empty.
    """
    print(f"当前进程号: {current_process().pid}")
    # x8: the OSS traffic-limit header is expressed in bits per second.
    limit_speed = conversion(config['OSS']['limit_speed']) * 8
    # Read size for each streamed chunk from MinIO.
    chunk_size = conversion(config['OSS']['chunk_size'])
    headers = {OSS_TRAFFIC_LIMIT: str(limit_speed)}
    try:
        while not task_queue.empty():
            try:
                obj_path = task_queue.get_nowait()
                if obj_path == 'end':  # sentinel: producer is done
                    return
            except Exception as e:
                logging.error(f"从任务队列获取任务时出错: {str(e)}")
                break
            # Bug fix: reset per item so the finally block never closes a
            # stale response left over from a previous iteration (the old
            # `if 'response' in locals()` check did exactly that).
            response = None
            try:
                # Singleton clients; re-calling also health-checks the OSS link.
                minio_client = MyMinioClient().create_connection(config)
                oss_client = MyOssClient().create_connection(config)
                minio_info = obj_path.split('/', 1)
                bucket_name = minio_info[0]
                minio_page = minio_info[-1]
                print(minio_info)
                response = minio_client.get_object(bucket_name, minio_page)

                def generate_chunks():
                    # Lazily re-stream MinIO chunks into the OSS upload.
                    for chunk in response.stream(chunk_size):
                        yield chunk

                result = oss_client.put_object(
                    obj_path,
                    generate_chunks(),
                    headers=headers
                )
                print('http response status:', result.status)
            except Exception as e:
                logging.error(f"上传文件 {obj_path} 时出错: {str(e)}\n{traceback.format_exc()}")
            finally:
                if response is not None:
                    response.close()
                    response.release_conn()
        return
    except Exception as e:
        logging.error(f"进程 {current_process().pid} 执行出错: {str(e)}\n{traceback.format_exc()}")
        return

def get_buckets_info(minio_client):
    """Return the names of all MinIO buckets; empty list when listing fails."""
    try:
        return [bucket.name for bucket in minio_client.list_buckets()]
    except Exception:
        print(traceback.format_exc())
        print("创建minio连接报错")
        return []

def get_file_counts(config):
    """Print and return, per MinIO bucket, the file count and how many files
    are still missing from OSS.

    Returns {bucket: (minio_count, missing_count)}; {} on error.
    Fixes two defects: the dead `if not objects:` check (list_objects returns
    a generator, which is always truthy) and the implicit None return on
    success while the error path returned {}.
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)

        counts = {}
        for bucket in get_buckets_info(minio_client):
            # Everything in this MinIO bucket, as "bucket/key" paths.
            minio_set = {
                '{}/{}'.format(bucket, obj.object_name)
                for obj in minio_client.list_objects(bucket, recursive=True)
            }
            path = bucket + '/'
            print(path)
            # OSS keys mirror the MinIO layout, so prefix-filter by bucket.
            oss_set = {obj.key for obj in ObjectIterator(oss_client, prefix=path)}
            missing = len(minio_set - oss_set)
            print(f"桶 {bucket} 中文件数量: {len(minio_set)}, 未上传数量: {missing}")
            counts[bucket] = (len(minio_set), missing)
        return counts
    except Exception as e:
        print(f"获取文件数量时发生错误: {str(e)}")
        print(traceback.format_exc())
        return {}
def path(config, path):
    """Migrate the small files under one MinIO path ("bucket/folder") to OSS
    using a pool of rate-limited worker processes."""
    print("根据路径迁移")
    minio_client = MyMinioClient().create_connection(config)
    oss_client = MyOssClient().create_connection(config)
    max_processes = config['OSS'].getint('max_processes')

    try:
        # Split "bucket/folder" into its two components.
        parts = path.split('/', 1)
        print(parts)
        if len(parts) < 2:
            print("输入路径格式不正确,需要包含桶名和文件夹路径")
            return
        bucket_name, folder_path = parts

        # Only files missing from OSS need uploading.
        oss_files = get_oss_files(oss_client)
        incremental_files = get_incremental_files(
            minio_client, oss_files, bucket_name, config, 'small', folder_path)

        if not incremental_files:
            print("没有需要上传的文件")
            return

        print(f"发现 {len(incremental_files)} 个需要上传的文件")
        # Fill a shared queue with the work items.
        task_queue = Manager().Queue()
        for obj_path in incremental_files:
            task_queue.put(obj_path)
        # Sentinels so every worker can stop cleanly.
        for _ in range(20):
            print("写入结束")
            task_queue.put("end")

        workers = []
        for idx in range(max_processes):
            if idx > 0:
                time.sleep(1)  # stagger worker start-up
            worker = Process(target=minio_to_oss_stream_upload, args=(task_queue, config))
            worker.start()
            workers.append(worker)
            print(f"已启动进程 {idx + 1}/{max_processes}")
        for worker in workers:
            worker.join()
    except Exception as e:
        print(e)
        print(traceback.format_exc())

def all_incremental_files(task_queue, config):
    """Producer process: enqueue every small file missing from OSS across all
    MinIO buckets, then 20 'end' sentinels so the workers can stop."""
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)
        buckets = get_buckets_info(minio_client)
        oss_files = get_oss_files(oss_client)
        for bucket_name in buckets:
            print(bucket_name)
            pending = get_incremental_files(minio_client, oss_files, bucket_name, config, 'small')
            for obj_path in pending:
                task_queue.put(obj_path)
        # Termination sentinels (more than any sensible worker count).
        for _ in range(20):
            print("写入结束")
            task_queue.put("end")
    except Exception as e:
        logging.error(f"获取增量文件列表时出错: {str(e)}\n{traceback.format_exc()}")

def all(config):
    """Full small-file migration: a producer fills the queue with every file
    missing from OSS, then worker processes drain it.

    Note: the name intentionally shadows the builtin all() — it is the CLI
    entry point for --type=all and is kept for compatibility.
    """
    print("全量迁移")

    max_processes = config['OSS'].getint('max_processes')
    # Shared queue between the producer and the upload workers.
    task_queue = Manager().Queue()
    # Bug fix: the local was previously named get_incremental_files, shadowing
    # the sibling function of the same name.
    producer = Process(target=all_incremental_files, args=(task_queue, config))
    producer.start()
    producer.join()  # queue is fully populated before any worker starts

    processes = []
    for i in range(max_processes):
        if i > 0:
            time.sleep(1)  # stagger worker start-up
        p = Process(target=minio_to_oss_stream_upload, args=(task_queue, config))
        p.start()
        processes.append(p)
        print(f"已启动进程 {i + 1}/{max_processes}")

    # Wait for every worker to finish.
    for p in processes:
        p.join()

    

def big_file_upload(config, path=None):
    """Upload big files (above the configured threshold) via multipart upload.

    path: optional "bucket/folder" restricting the scan; None scans all buckets.
    """
    print("大文件上传")
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)

        oss_files = get_oss_files(oss_client)
        if path is not None:
            parts = path.split('/', 1)
            print(parts)
            # Bug fix: a path without '/' previously raised ValueError on the
            # tuple unpack; validate like path() does (folder may be empty,
            # e.g. "bucket/").
            if len(parts) < 2:
                print("输入路径格式不正确,需要包含桶名和文件夹路径")
                return
            bucket_name, folder_path = parts
            incremental_files = get_incremental_files(
                minio_client, oss_files, bucket_name, config, "big", folder_path)
        else:
            # No path given: scan every bucket for big files.
            incremental_files = set()
            for bucket_name in get_buckets_info(minio_client):
                print(bucket_name)
                incremental_files.update(
                    get_incremental_files(minio_client, oss_files, bucket_name, config, "big", ''))

        if not incremental_files:
            print("没有需要上传的文件")
            return
        print(f"{len(incremental_files)}个需要上传的文件")
        # Big files go through the multipart path, one at a time.
        for obj_path in incremental_files:
            multipart_upload(minio_client, oss_client, obj_path, config)

    except Exception as e:
        print(f"增量上传过程中发生错误: {str(e)}")
        print(traceback.format_exc())

def count_large_files(config):
    """Count MinIO files larger than the configured threshold, per bucket.

    Returns {"large_file_counts": {bucket: n}, "total_large_files": n};
    {} on error. Consistency fix: the threshold now comes from
    [MinIO] file_threshold (via conversion()) like get_incremental_files(),
    instead of a hard-coded 300MB that could silently diverge from config.
    """
    try:
        minio_client = MyMinioClient().create_connection(config)
        threshold = conversion(config['MinIO']['file_threshold'])

        large_file_counts = {}
        total_large_files = 0
        for bucket in get_buckets_info(minio_client):
            # Count objects above the threshold in this bucket.
            count = sum(
                1
                for obj in minio_client.list_objects(bucket, recursive=True)
                if obj.size > threshold
            )
            large_file_counts[bucket] = count
            total_large_files += count

        return {
            "large_file_counts": large_file_counts,
            "total_large_files": total_large_files
        }

    except Exception as e:
        print(f"统计大文件数量时发生错误: {str(e)}")
        print(traceback.format_exc())
        return {}

def delete_oss_redundant_files(config):
    """Delete OSS objects that no longer exist in MinIO; return how many
    were deleted (0 on error)."""
    try:
        minio_client = MyMinioClient().create_connection(config)
        oss_client = MyOssClient().create_connection(config)

        # Everything currently in MinIO, as "bucket/key" paths.
        minio_files = {
            f"{bucket}/{obj.object_name}"
            for bucket in get_buckets_info(minio_client)
            for obj in minio_client.list_objects(bucket, recursive=True)
        }

        # Objects present on OSS but gone from MinIO.
        redundant_files = get_oss_files(oss_client) - minio_files

        deleted_count = 0
        for file in redundant_files:
            try:
                oss_client.delete_object(file)
                print(f"已删除文件: {file}")
                deleted_count += 1
            except Exception as e:
                # Best-effort: keep deleting the rest even if one fails.
                print(f"删除文件 {file} 时出错: {str(e)}")
                print(traceback.format_exc())

        print(f"共删除 {deleted_count} 个多余文件")
        return deleted_count

    except Exception as e:
        print(f"删除多余文件时发生错误: {str(e)}")
        print(traceback.format_exc())
        return 0

if __name__ == "__main__":
    # CLI entry point: dispatch one sync action by --type; some accept --path.
    config_path = 'minio_config.ini'
    config = configparser.ConfigParser()
    config.read(config_path)
    parser = argparse.ArgumentParser(description="输入参数")
    parser.add_argument("--type", type=str, required=True, help="同步类型")
    parser.add_argument("--path", type=str, required=False, help="指定路径")
    args = parser.parse_args()
    if args.type == "all":
        # Full small-file migration
        # nohup python3.10 minio_upload_oss.py --type=all > minio_reader.log 2>&1 &
        all(config)
    elif args.type == "folder":
        # Migrate one "bucket/folder" path only
        #  python minio_upload_oss.py --type=folder --path='test01/2025/05'
        #  nohup python3.10 minio_upload_oss.py --type=folder --path='hiedu/' > minio_reader.log 2>&1 &
        path(config, args.path)
    elif args.type == "big":
        # Big-file (multipart) migration; --path optional
        # nohup python3.10 minio_upload_oss.py --type=big --path='higheredu-college/' > minio_reader.log 2>&1 &
        # nohup python3.10 minio_upload_oss.py --type=big  > minio_reader.log 2>&1 &
        big_file_upload(config, args.path)
    elif args.type == "count":
        # Report per-bucket file counts and missing uploads
        # python minio_upload_oss.py --type=count
        get_file_counts(config)
    elif args.type == "large_count":
        # Count files above the size threshold
        # python minio_upload_oss.py --type=large_count
        result = count_large_files(config)
        print(result)
    elif args.type == "delete":
        # Remove OSS objects that are gone from MinIO (the old comments here
        # were copy-pasted from large_count)
        # python minio_upload_oss.py --type=delete
        result = delete_oss_redundant_files(config)
        print(result)
    else:
        # Previously an unknown --type silently did nothing.
        parser.error(f"未知的同步类型: {args.type}")

# pgrep -f "python3.10 minio_upload_oss.py" | xargs kill
# {'minio_counts': {'hicx': 42491, 'hiedu': 362542, 'higheredu': 487, 'higheredu-college': 3292, 'hihelper': 489, 'hios': 0, 'hitalk-ai': 2279, 'hiuser': 790, 'meeting': 0, 'test': 3, 'test1': 2}, 'total_minio_count': 412375, 'oss_count': 408510}