Python 解压多层嵌套的压缩文件

696 阅读4分钟

简述

本文介绍了 python 解压多层嵌套的压缩包的一种方式。目前支持的压缩格式有: zip, tar, gz, rar, 7z

如果仅需要代码可直接访问:源码地址

实现细节

  1. 用队列代替递归

    压缩包的嵌套层级无法预估,递归调用栈保存中间变量过多可能导致解压时内存溢出,故使用队列一层一层地解压压缩文件。

  2. 压缩包中重名文件处理

    当解压目标路径与同层级已存在的文件或文件夹重名时,会在名称前添加 (1)、(2)… 前缀,对解压路径进行重命名。

  3. 加密或解压失败的压缩文件

    加密的压缩文件和解压失败的压缩文件不会阻塞解压流程,会在压缩子文件记录中保存信息。

  4. 中文文件名异常处理

    python 标准库的 zipfile 模块对跨系统的中文文件名识别不友好,该方法通过修改 Python/lib/zipfile.py 源码优化了该问题。

zipfile.py 修改说明

如果不想修改标准库下的源文件,可将 Python/lib/zipfile.py 复制到项目目录下并重命名为 zipfile_modify.py(与下文源码中的 import zipfile_modify as zipfile 对应),然后在该副本上进行修改。

修改步骤如下:

  1. 增加函数(依赖第三方库 chardet,需在文件开头添加 import chardet)
def get_decode_name(name):
    """Decode a raw archive member name using a detected encoding.

    Args:
        name: The undecoded file-name bytes.

    Returns:
        The decoded name. Bytes that are invalid in the chosen encoding are
        substituted (``'replace'`` error handler) instead of raising.
    """
    detected = chardet.detect(name)['encoding']
    if not detected:
        # chardet reported no encoding: fall back to cp1252.
        codec = 'cp1252'
    elif detected == 'ISO-8859-9':
        # Heuristic: a name detected as ISO-8859-9 is decoded as GBK instead
        # (presumably because GBK names are misdetected this way -- confirm).
        codec = 'gbk'
    else:
        codec = detected
    return name.decode(codec, 'replace')
  2. filename = filename.decode('cp437') 这行代码改成 filename = get_decode_name(filename)

  3. fname_str = fname.decode("cp437") 这行代码改成 fname_str = get_decode_name(fname)

解压嵌套压缩文件完整源码

import os
import zipfile_modify as zipfile
import tarfile
import gzip
import rarfile
import py7zr


class CompressStatus:
    """Integer result codes for an extraction attempt.

    SUCCESS (0): the archive was extracted.
    ENCRYPTED (1): the archive is password protected and was not extracted.
    FAIL (2): extraction raised an error.
    """

    SUCCESS, ENCRYPTED, FAIL = range(3)


def zipfile_extract_all_to(file_path, target_path):
    """Extract every member of a ZIP archive into ``target_path``.

    Nothing is extracted when any member of the archive is encrypted.

    Args:
        file_path: Path of the ZIP archive to read.
        target_path: Directory the members are extracted into.

    Returns:
        A ``(status, message)`` tuple. ``status`` is a ``CompressStatus``
        value; ``message`` is ``None`` on success, ``"加密文件"`` for an
        encrypted archive, or the text of the exception on failure.
    """
    status = CompressStatus.SUCCESS
    message = None
    try:
        # The context manager closes the archive on every path, so no
        # introspection of local names is needed for cleanup.
        with zipfile.ZipFile(file_path) as zip_file:
            # Bit 0 of the general purpose flags marks an encrypted member;
            # any() stops scanning at the first one found.
            if any(zinfo.flag_bits & 0x1 for zinfo in zip_file.infolist()):
                status = CompressStatus.ENCRYPTED
                message = "加密文件"
            else:
                zip_file.extractall(target_path)
    except Exception as e:
        # Best effort: a broken archive is reported, never raised.
        status = CompressStatus.FAIL
        message = str(e)
    return status, message


def tarfile_extract_all_to(file_path, target_path):
    """Extract every member of a tar archive into ``target_path``.

    ``tarfile.open`` is called with its default mode, so compressed tar
    archives such as ``.tar.gz`` are handled as well.

    Args:
        file_path: Path of the tar archive to read.
        target_path: Directory the members are extracted into.

    Returns:
        A ``(status, message)`` tuple. ``status`` is a ``CompressStatus``
        value; ``message`` is ``None`` on success or the text of the
        exception on failure.
    """
    status = CompressStatus.SUCCESS
    message = None

    try:
        # The context manager closes the archive on every path.
        with tarfile.open(file_path) as tar_file:
            if hasattr(tarfile, 'data_filter'):
                # Runtimes with extraction filters: refuse members that
                # would land outside target_path (absolute paths, "..",
                # links pointing outside the destination).
                tar_file.extractall(target_path, filter='data')
            else:
                # Older runtimes have no filter support.
                tar_file.extractall(target_path)
    except Exception as e:
        # Best effort: a broken or rejected archive is reported, not raised.
        status = CompressStatus.FAIL
        message = str(e)
    return status, message


def gzfile_extract_all_to(file_path, target_path):
    """Decompress a single-file ``.gz`` archive into the file ``target_path``.

    A ``.gz`` file wraps exactly one file; archives ending in ``.tar.gz``
    are handled through ``tarfile`` instead.

    Args:
        file_path: Path of the gzip file to read.
        target_path: Path of the output file (a file, not a directory).

    Returns:
        A ``(status, message)`` tuple. ``status`` is a ``CompressStatus``
        value; ``message`` is ``None`` on success or the text of the
        exception on failure.
    """
    status = CompressStatus.SUCCESS
    message = None

    try:
        # Both handles are closed by the with-statement on every path.
        with gzip.open(file_path, 'rb') as gz_file, \
                open(target_path, 'wb') as target_file:
            # Copy in 64 KiB blocks so large files are never held in memory;
            # read() returns b'' at end of stream, which ends the iteration.
            for block in iter(lambda: gz_file.read(65536), b''):
                target_file.write(block)
    except Exception as e:
        # Best effort: a broken archive is reported, never raised.
        status = CompressStatus.FAIL
        message = str(e)
    return status, message


def rarfile_extract_all_to(file_path, target_path):
    """Extract every member of a RAR archive into ``target_path``.

    Nothing is extracted when the archive reports that it needs a password.

    Args:
        file_path: Path of the RAR archive to read.
        target_path: Directory the members are extracted into.

    Returns:
        A ``(status, message)`` tuple. ``status`` is a ``CompressStatus``
        value; ``message`` is ``None`` on success, ``"加密文件"`` for an
        encrypted archive, or the text of the exception on failure.
    """
    status = CompressStatus.SUCCESS
    message = None
    try:
        # The context manager closes the archive on every path.
        with rarfile.RarFile(file_path) as rar_file:
            if rar_file.needs_password():
                status = CompressStatus.ENCRYPTED
                message = "加密文件"
            else:
                rar_file.extractall(target_path)
    except Exception as e:
        # Best effort: a broken archive is reported, never raised.
        status = CompressStatus.FAIL
        message = str(e)
    return status, message


def py7zr_extract_all_to(file_path, target_path):
    """Extract every member of a 7z archive into ``target_path``.

    Args:
        file_path: Path of the 7z archive to read.
        target_path: Directory the members are extracted into.

    Returns:
        A ``(status, message)`` tuple. ``status`` is a ``CompressStatus``
        value; ``message`` is ``None`` on success, ``"加密文件"`` when py7zr
        raises ``PasswordRequired``, or the text of the exception on any
        other failure.
    """
    status = CompressStatus.SUCCESS
    message = None
    try:
        # The context manager closes the archive on every path.
        with py7zr.SevenZipFile(file_path) as py7zr_file:
            py7zr_file.extractall(target_path)
    except py7zr.exceptions.PasswordRequired:
        # Encryption is detected through the exception py7zr raises.
        status = CompressStatus.ENCRYPTED
        message = "加密文件"
    except Exception as e:
        # Best effort: a broken archive is reported, never raised.
        status = CompressStatus.FAIL
        message = str(e)
    return status, message


class FileInfo:
    """Record for a single file produced by extraction.

    Attributes:
        path: Location of the file on disk.
        compressed_path: Path recorded for the file relative to the archive
            it came from.
    """

    def __init__(self, path, compressed_path):
        self.path, self.compressed_path = path, compressed_path


class CompressedFileInfo(FileInfo):
    """Record for an archive file, including the outcome of extracting it.

    Attributes (in addition to those of ``FileInfo``):
        to_path: Path the archive was extracted to; ``''`` until set.
        encrypted: ``True`` when the archive was found to be encrypted.
        is_error: ``True`` when extraction failed.
        msg: Detail message for the encrypted / failed case; ``''`` otherwise.
    """

    def __init__(self, path, compressed_path):
        super().__init__(path, compressed_path)
        # Defaults describe an archive that has not been extracted yet.
        self.to_path = self.msg = ''
        self.encrypted = self.is_error = False


class CompressedFile:
    """One archive file together with the function that can extract it.

    The extractor is chosen from the file extension (case-insensitively).
    ``.tar.gz`` is special-cased so that it is handled by the tar extractor
    rather than the single-file gzip extractor.
    """

    # Maps an extension (lower case, without the dot) to its extractor.
    # NOTE: the attribute keeps its original spelling because it is part of
    # the class's public surface.
    uncomprees_func_dict = {
        'zip': zipfile_extract_all_to,
        'tar': tarfile_extract_all_to,
        'gz': gzfile_extract_all_to,
        'rar': rarfile_extract_all_to,
        '7z': py7zr_extract_all_to
    }

    @classmethod
    def is_compressed_file(cls, path):
        """Return ``True`` when ``path`` has a supported archive extension.

        The comparison ignores case, so ``A.ZIP`` is recognised like
        ``a.zip``.
        """
        _, ext = os.path.splitext(path)
        return ext[1:].lower() in cls.uncomprees_func_dict

    def __init__(self, path, compressed_path, target_path_default=None):
        """Select the extractor and the default extraction target.

        Args:
            path: Location of the archive on disk.
            compressed_path: Path recorded for the archive relative to the
                archive it came from.
            target_path_default: Extraction target used when
                ``extract_all`` is called without one. When ``None`` the
                archive path with its extension removed is used.
        """
        self.info = CompressedFileInfo(path, compressed_path)
        tar_gz = '.tar.gz'
        if path[-len(tar_gz):].lower() == tar_gz:
            self.uncompress_func = tarfile_extract_all_to
            self.target_path_default = path[:-len(tar_gz)]
        else:
            path_without_ext, ext = os.path.splitext(path)
            # None when the extension is not supported; extract_all reports
            # that case as an error.
            self.uncompress_func = self.uncomprees_func_dict.get(ext[1:].lower())
            self.target_path_default = path_without_ext
        if target_path_default is not None:
            self.target_path_default = target_path_default

    def extract_all(self, target_path=None):
        """Extract the archive and return its ``CompressedFileInfo``.

        Args:
            target_path: Where to extract to; falls back to the default
                chosen in ``__init__`` when empty or ``None``.

        Returns:
            ``self.info``, with ``to_path`` set and ``encrypted`` /
            ``is_error`` / ``msg`` filled in when extraction did not
            succeed. An unsupported extension is reported through
            ``is_error`` instead of raising.
        """
        if not target_path:
            target_path = self.target_path_default
        if self.uncompress_func is None:
            # No extractor is registered for this extension: record the
            # failure rather than crashing on a call to None.
            self.info.is_error = True
            self.info.msg = "不支持的压缩格式"
            return self.info
        self.info.to_path = self.check_uncompress_path(target_path)
        status, message = self.uncompress_func(self.info.path, self.info.to_path)
        if status == CompressStatus.ENCRYPTED:
            self.info.encrypted = True
            self.info.msg = message
        elif status == CompressStatus.FAIL:
            self.info.is_error = True
            self.info.msg = message
        return self.info

    def check_uncompress_path(self, to_path):
        """Return an extraction path that does not exist yet.

        When ``to_path`` already exists, the last path component is prefixed
        with ``(1)``, ``(2)``, ... until an unused path is found.
        """
        file_dir, filename = os.path.split(to_path)
        suffix = 1
        while os.path.exists(to_path):
            to_path = os.path.join(file_dir, f"({suffix}){filename}")
            suffix += 1
        return to_path


class NestedCompressedFile:
    """Extract an archive and every archive nested inside it.

    Nested archives are processed through a FIFO work queue, one level at a
    time, instead of recursion.

    Attributes:
        file_path: Path of the outermost archive.
        compressed_file_queue: Archives waiting to be extracted.
        filelist: ``FileInfo`` records for the plain files found, plus
            ``CompressedFileInfo`` records for archives that were encrypted
            or failed to extract.
        origin_compress_file: ``CompressedFile`` for the outermost archive,
            set by ``deep_extract_all_to``.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.compressed_file_queue = []
        self.filelist = []
        self.origin_compress_file = None

    def deep_extract_all_to(self, target_path):
        """Extract the archive and all nested archives (core entry point).

        The outermost archive is extracted to ``target_path``; each nested
        archive is extracted next to where it was found. Information about
        the resulting files is collected in ``self.filelist``.

        Args:
            target_path: Extraction target for the outermost archive.
        """
        self.filelist = []

        _, filename = os.path.split(self.file_path)
        self.origin_compress_file = CompressedFile(self.file_path, filename, target_path_default=target_path)

        self.compressed_file_queue.append(self.origin_compress_file)

        while self.compressed_file_queue:
            compressed_file = self.compressed_file_queue.pop(0)

            compressed_file_info = compressed_file.extract_all()
            if compressed_file_info.encrypted or compressed_file_info.is_error:
                # Encrypted or broken archives are recorded and skipped so
                # they never block the rest of the queue.
                self.filelist.append(compressed_file_info)
            else:
                self.uncompress_subfile_walk(compressed_file_info.to_path, compressed_file_info.compressed_path)

    def uncompress_subfile_walk(self, walk_path, path_in_compressed):
        """Record everything found under a freshly extracted path.

        Archives are appended to ``compressed_file_queue``; all other files
        are appended to ``filelist``.

        Args:
            walk_path: Extracted directory, or single extracted file.
            path_in_compressed: Archive-relative path that corresponds to
                ``walk_path``.
        """
        if not os.path.isdir(walk_path):
            self._register_file(walk_path, path_in_compressed)
            return

        for current_dir, _, sub_file_list in os.walk(walk_path):
            for sub_file_fullname in sub_file_list:
                file_save_path = os.path.join(current_dir, sub_file_fullname)
                # Every path produced by os.walk starts with walk_path, so
                # swap only that leading prefix. str.replace would also
                # rewrite later occurrences of walk_path inside the path.
                file_path_in_compressed = path_in_compressed + file_save_path[len(walk_path):]
                self._register_file(file_save_path, file_path_in_compressed)

    def _register_file(self, file_path, path_in_compressed):
        """Queue ``file_path`` if it is an archive, else record it as a file."""
        if CompressedFile.is_compressed_file(file_path):
            self.compressed_file_queue.append(CompressedFile(file_path, path_in_compressed))
        else:
            self.filelist.append(FileInfo(file_path, path_in_compressed))