Python 文件操作

90 阅读3分钟

一、环境准备

pip install pillow pypdf2 python-docx pdf2docx openpyxl reportlab

二、文件压缩与解压

1. ZIP文件处理

import zipfile
import os

# Compress a file or a folder
def zip_compress(source_path, zip_name):
    """Create a deflate-compressed ZIP archive from a file or a directory.

    Args:
        source_path: Path of the file or directory to compress. For a
            directory, every file below it is stored under its path relative
            to ``source_path``; for a single file, only the base name is kept.
        zip_name: Path of the ZIP archive to create (overwritten if present).
    """
    # The archive may live inside the directory being walked; remember where
    # it is so the half-written archive is never added to itself.
    archive_abspath = os.path.abspath(zip_name)

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        if os.path.isdir(source_path):
            for root, _dirs, files in os.walk(source_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    if os.path.abspath(file_path) == archive_abspath:
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, source_path))
        else:
            zipf.write(source_path, os.path.basename(source_path))

# Extract a ZIP archive
def zip_extract(zip_path, extract_dir):
    """Unpack every member of the ZIP archive at ``zip_path`` into ``extract_dir``."""
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_dir)

# Usage example: these calls run as soon as the snippet is executed.
# 'document.txt' must already exist; 'archive.zip' is written and then
# unpacked into 'extracted_files'.
zip_compress('document.txt', 'archive.zip')
zip_extract('archive.zip', 'extracted_files')

2. 多格式压缩(使用tar)

import tarfile

# Create a tar.gz archive
def tar_compress(source_path, output_name):
    """Create a gzip-compressed tar archive containing ``source_path``.

    The file or directory is stored under its own base name, so the archive
    does not reproduce the leading directories of ``source_path``.

    Args:
        source_path: File or directory to add (directories are added
            recursively by ``TarFile.add``).
        output_name: Path of the ``.tar.gz`` archive to create.
    """
    # normpath drops a trailing separator ("data/" -> "data"); without it
    # basename() returns '' and the top-level directory name is lost.
    arcname = os.path.basename(os.path.normpath(source_path))
    with tarfile.open(output_name, "w:gz") as tar:
        tar.add(source_path, arcname=arcname)

# Extract a tar.gz archive
def tar_extract(tar_path, extract_dir):
    """Extract a gzip-compressed tar archive into ``extract_dir``.

    Args:
        tar_path: Path of the ``.tar.gz`` archive to read.
        extract_dir: Directory that receives the extracted members.

    Raises:
        tarfile.TarError: If the archive is invalid, or (when extraction
            filters are available) a member is rejected as unsafe.
    """
    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters exist on this interpreter: use the "data"
            # filter so members cannot escape extract_dir, and to avoid the
            # DeprecationWarning raised by an unfiltered extractall().
            tar.extractall(extract_dir, filter="data")
        else:
            tar.extractall(extract_dir)

三、文件格式转换

1. 图片格式转换(PNG → JPG)

from PIL import Image

def convert_image(input_path, output_path, output_format):
    """Re-encode an image file in another format.

    Args:
        input_path: Path of the image to read.
        output_path: Path of the image to write.
        output_format: Pillow format name passed to ``Image.save`` (for
            example ``'JPEG'`` or ``'PNG'``).
    """
    # The context manager closes the source file handle once saving is done.
    with Image.open(input_path) as img:
        converted = img
        # JPEG has no alpha channel or palette; saving e.g. an RGBA PNG as
        # JPEG raises OSError, so flatten such images to RGB first.
        if (output_format and output_format.upper() == 'JPEG'
                and img.mode not in ('RGB', 'L', 'CMYK')):
            converted = img.convert('RGB')
        converted.save(output_path, format=output_format)

# Usage example: runs when the snippet is executed; reads 'input.png'
# (must exist) and writes 'output.jpg' in JPEG format.
convert_image('input.png', 'output.jpg', 'JPEG')

2. PDF转Word

from pdf2docx import Converter

def pdf_to_word(pdf_path, docx_path):
    """Convert a PDF file to a Word document using pdf2docx.

    Args:
        pdf_path: Path of the PDF to read.
        docx_path: Path of the ``.docx`` file to write.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Release the converter even when the conversion fails.
        cv.close()

# Usage example: runs when the snippet is executed; reads 'document.pdf'
# (must exist) and writes 'output.docx'.
pdf_to_word('document.pdf', 'output.docx')

四、文件水印添加

1. 图片水印

from PIL import Image, ImageDraw, ImageFont

def add_image_watermark(input_path, output_path, watermark_text,
                        font_path="arial.ttf", font_size=40):
    """Tile a semi-transparent white text watermark over an image.

    Args:
        input_path: Path of the image to watermark.
        output_path: Path of the watermarked image; it is saved in RGB mode.
        watermark_text: Text drawn repeatedly on a 200-pixel grid.
        font_path: TrueType font file to use. If it cannot be loaded,
            Pillow's built-in default font is used instead.
        font_size: Point size requested for the TrueType font (the built-in
            fallback font may ignore it).
    """
    # convert() returns a new image, so the source file can be closed at once.
    with Image.open(input_path) as source:
        base_image = source.convert("RGBA")
    # Fully transparent layer that receives the watermark text.
    txt = Image.new("RGBA", base_image.size, (255, 255, 255, 0))

    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        # The requested font is not installed everywhere (e.g. arial.ttf on
        # Linux); fall back rather than fail.
        font = ImageFont.load_default()
    d = ImageDraw.Draw(txt)

    # Repeat the watermark across the whole image
    for i in range(0, txt.size[0], 200):
        for j in range(0, txt.size[1], 200):
            d.text((i, j), watermark_text, font=font, fill=(255, 255, 255, 128))

    combined = Image.alpha_composite(base_image, txt)
    # Drop the alpha channel so formats without transparency can be written.
    combined.convert("RGB").save(output_path)

# Usage example: runs when the snippet is executed; reads 'photo.jpg'
# (must exist) and writes 'watermarked.jpg' with the text "SAMPLE".
add_image_watermark("photo.jpg", "watermarked.jpg", "SAMPLE")

2. PDF水印

from PyPDF2 import PdfReader, PdfWriter

def add_pdf_watermark(input_pdf, output_pdf, watermark_text):
    """Stamp a text watermark onto every page of a PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the watermarked PDF to write.
        watermark_text: Text passed to ``get_watermark_page`` to render the
            watermark.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # The watermark is identical for every page, so render it once instead
    # of rebuilding a PDF per page.
    watermark_page = get_watermark_page(watermark_text)

    for page in reader.pages:
        page.merge_page(watermark_page)
        writer.add_page(page)

    with open(output_pdf, "wb") as f:
        writer.write(f)

def get_watermark_page(text):
    """Render ``text`` as a one-page in-memory PDF and return that page.

    The text is drawn with ReportLab in 50 pt Helvetica, grey at 30% alpha,
    on a canvas rotated by 45 degrees, then read back with ``PdfReader``.
    """
    from io import BytesIO
    from reportlab.pdfgen import canvas

    buffer = BytesIO()
    pdf_canvas = canvas.Canvas(buffer)
    pdf_canvas.setFont("Helvetica", 50)
    pdf_canvas.setFillColorRGB(0.5, 0.5, 0.5, alpha=0.3)
    pdf_canvas.rotate(45)
    pdf_canvas.drawString(100, 100, text)
    pdf_canvas.save()

    # Rewind so the reader parses the PDF from its first byte.
    buffer.seek(0)
    first_page = PdfReader(buffer).pages[0]
    return first_page

# Usage example: runs when the snippet is executed; reads 'document.pdf'
# (must exist) and writes 'watermarked.pdf' stamped with "CONFIDENTIAL".
add_pdf_watermark("document.pdf", "watermarked.pdf", "CONFIDENTIAL")

五、文件净化

1. 清理临时文件

import os
import glob

def clean_temp_files(directory):
    """Delete temporary-looking files located directly inside ``directory``.

    Files matching ``*.tmp``, ``~*`` or ``*.bak`` are removed; a failure to
    delete one file is printed and does not stop the remaining deletions.
    """
    def _remove(filepath):
        # Best effort: report the problem and carry on.
        try:
            os.remove(filepath)
        except Exception as e:
            print(f"Error deleting {filepath}: {e}")

    for wildcard in ('*.tmp', '~*', '*.bak'):
        # Each pattern is globbed only after the previous pattern's matches
        # were handled, so a file is never attempted twice.
        for match in glob.glob(os.path.join(directory, wildcard)):
            _remove(match)

# Usage example: runs when the snippet is executed and deletes the matching
# temporary files found directly inside ./documents.
clean_temp_files("./documents")

2. 敏感信息擦除(示例:清理CSV文件)

import csv
import re

def clean_csv_sensitive(input_file, output_file,
                        pattern=r'\b\d{4}-\d{2}-\d{4}\b', mask='[ID MASKED]'):
    """Copy a CSV file, masking every cell substring that matches ``pattern``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the cleaned CSV file to write.
        pattern: Regular expression identifying the sensitive values.
        mask: Replacement text (``re.sub`` replacement-template syntax).
    """
    # NOTE(review): the default pattern matches digit groups of 4-2-4; the
    # original comment calls it an ID-number mask. Confirm it matches the ID
    # format actually present in the data.
    sensitive = re.compile(pattern)

    # newline='' on both files lets the csv module do its own line handling,
    # so line breaks embedded in quoted fields survive unchanged.
    with open(input_file, 'r', newline='') as infile, \
            open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            writer.writerow([sensitive.sub(mask, cell) for cell in row])

# Usage example: runs when the snippet is executed; reads 'data.csv'
# (must exist) and writes the masked copy to 'cleaned_data.csv'.
clean_csv_sensitive("data.csv", "cleaned_data.csv")

六、文件分析

1. 文件系统分析

import os
from collections import defaultdict

def analyze_directory(directory):
    """Summarise file count and total size per extension below ``directory``.

    The tree is walked recursively. Extensions are lower-cased and files
    without one are grouped under ``'no_ext'``. A line per extension is
    printed, and the collected figures are also returned so callers can use
    them programmatically.

    Args:
        directory: Root directory to analyse.

    Returns:
        dict mapping extension -> ``{'count': int, 'size': int (bytes),
        'size_mb': float}``.
    """
    stats = defaultdict(lambda: {'count': 0, 'size': 0})

    for root, _dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                size = os.path.getsize(filepath)
            except OSError as e:
                # e.g. broken symlink or permission problem: report and skip.
                print(f"Error processing {filepath}: {e}")
                continue
            ext = os.path.splitext(file)[1].lower() or 'no_ext'
            stats[ext]['count'] += 1
            stats[ext]['size'] += size

    # Convert to MB and print the results
    for ext, data in stats.items():
        data['size_mb'] = round(data['size'] / (1024 * 1024), 2)
        print(f"{ext}: {data['count']} files, {data['size_mb']}MB")

    # Return a plain dict so later lookups cannot create entries by accident.
    return dict(stats)

# Usage example: runs when the snippet is executed and prints the
# per-extension statistics for ./documents.
analyze_directory("./documents")

2. 文件内容分析(示例:词频统计)

from collections import Counter
import re

def analyze_text_file(file_path, top_n=10):
    """Print the ``top_n`` most frequent words of a UTF-8 text file.

    The text is lower-cased first, and only runs of four or more word
    characters count as words.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()

    # Words of at least four characters, tallied in order of appearance.
    frequency = Counter(re.findall(r'\b\w{4,}\b', lowered))

    print(f"Top {top_n} frequent words:")
    for entry in frequency.most_common(top_n):
        word, count = entry
        print(f"{word}: {count}")

# Usage example: runs when the snippet is executed; reads 'document.txt'
# (must exist) and prints its 10 most frequent words.
analyze_text_file("document.txt")

七、总结与扩展

最佳实践建议:

  1. 错误处理:在实际使用中需要添加try/except块处理文件操作异常
  2. 性能优化:处理大文件时使用流式处理(stream processing)
  3. 安全性:处理用户上传文件时进行格式验证和大小限制
  4. 扩展功能:
    • 使用PyMuPDF处理更复杂的PDF操作
    • 使用MoviePy处理视频文件
    • 使用python-pptx处理PPT文档