一、环境准备
pip install pillow pypdf2 python-docx pdf2docx openpyxl reportlab
二、文件压缩与解压
1. ZIP文件处理
import zipfile
import os
def zip_compress(source_path, zip_name):
    """Compress a file or a directory tree into a deflate-compressed ZIP archive.

    Args:
        source_path: File or directory to archive. Directory members are
            stored under paths relative to ``source_path``; a single file is
            stored under its base name.
        zip_name: Path of the archive to create (overwritten if it exists).

    Raises:
        FileNotFoundError: If ``source_path`` does not exist. The check runs
            before the archive is opened so no empty archive is left behind.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source path does not exist: {source_path}")

    # Remember where the archive lives so it is never added to itself when it
    # is created inside the directory being compressed.
    archive_abspath = os.path.abspath(zip_name)

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        if os.path.isdir(source_path):
            for root, dirs, files in os.walk(source_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    if os.path.abspath(file_path) == archive_abspath:
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, source_path))
        else:
            zipf.write(source_path, os.path.basename(source_path))
def zip_extract(zip_path, extract_dir):
    """Unpack every member of a ZIP archive into a target directory.

    Args:
        zip_path: Path of the ZIP archive to read.
        extract_dir: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_dir)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()
# Example usage: archive a single file, then unpack that archive into the
# 'extracted_files' directory.
zip_compress('document.txt', 'archive.zip')
zip_extract('archive.zip', 'extracted_files')
2. 多格式压缩(使用tar)
import tarfile
def tar_compress(source_path, output_name):
    """Pack a file or directory into a gzip-compressed tar archive.

    The source is stored under its own base name, so a directory ``data``
    appears in the archive as ``data/...``.

    Args:
        source_path: File or directory to archive.
        output_name: Path of the ``.tar.gz`` archive to create.
    """
    # abspath() drops trailing separators and resolves "." / ".."; a plain
    # basename("data/") would be "" and lose the top-level folder name.
    arcname = os.path.basename(os.path.abspath(source_path))
    with tarfile.open(output_name, "w:gz") as tar:
        tar.add(source_path, arcname=arcname)
def tar_extract(tar_path, extract_dir):
    """Extract a tar archive into a directory, refusing unsafe members.

    The compression (gzip, bz2, xz or none) is auto-detected. Members that
    would be written outside ``extract_dir`` are rejected instead of being
    extracted.

    Args:
        tar_path: Path of the tar archive to read.
        extract_dir: Directory that receives the extracted members.

    Raises:
        tarfile.TarError: If the archive is unreadable or contains a member
            (or link target) that escapes ``extract_dir``.
    """
    with tarfile.open(tar_path, "r:*") as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: let the standard "data"
            # filter block absolute paths, ".." escapes and unsafe links.
            tar.extractall(extract_dir, filter="data")
            return

        # Older interpreters: validate every member before extracting anything.
        dest_root = os.path.realpath(extract_dir)
        for member in tar.getmembers():
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise tarfile.TarError(f"Unsafe path in archive: {member.name!r}")
            if member.issym():
                # Symlink targets are relative to the link's own directory.
                link_base = os.path.dirname(target)
            elif member.islnk():
                # Hard-link targets are relative to the archive root.
                link_base = dest_root
            else:
                continue
            link_target = os.path.realpath(os.path.join(link_base, member.linkname))
            if os.path.commonpath([dest_root, link_target]) != dest_root:
                raise tarfile.TarError(f"Unsafe link in archive: {member.name!r}")
        tar.extractall(extract_dir)
三、文件格式转换
1. 图片格式转换(PNG → JPG)
from PIL import Image
def convert_image(input_path, output_path, output_format):
    """Re-save an image in another file format.

    Args:
        input_path: Path of the image to read.
        output_path: Path of the converted image to write.
        output_format: Pillow format name such as ``"JPEG"`` or ``"PNG"``.
            The common alias ``"JPG"`` is accepted and mapped to ``"JPEG"``.
    """
    # Modes Pillow's JPEG writer accepts; anything else (RGBA, LA, P, ...)
    # has to be converted first or save() fails.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    target_format = "JPEG" if output_format.upper() == "JPG" else output_format

    # The context manager closes the source file handle when done.
    with Image.open(input_path) as img:
        if target_format.upper() == "JPEG" and img.mode not in jpeg_modes:
            # JPEG cannot store alpha or palette data; transparency is dropped.
            img_to_save = img.convert("RGB")
        else:
            img_to_save = img
        img_to_save.save(output_path, format=target_format)
# Example usage: re-save input.png as output.jpg in JPEG format.
convert_image('input.png', 'output.jpg', 'JPEG')
2. PDF转Word
from pdf2docx import Converter
def pdf_to_word(pdf_path, docx_path):
    """Convert a PDF document to a Word (.docx) file with pdf2docx.

    Args:
        pdf_path: Path of the PDF to read.
        docx_path: Path of the .docx file to write.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Release the converter even when the conversion raises.
        cv.close()
# Example usage: convert document.pdf into output.docx.
pdf_to_word('document.pdf', 'output.docx')
四、文件水印添加
1. 图片水印
from PIL import Image, ImageDraw, ImageFont
def add_image_watermark(input_path, output_path, watermark_text):
    """Tile a semi-transparent white text watermark over an image.

    The text is drawn on a transparent overlay every 200 pixels in both
    directions, composited onto the image, and the result is saved as RGB.

    Args:
        input_path: Path of the image to watermark.
        output_path: Path of the watermarked image to write.
        watermark_text: Text to repeat across the image.
    """
    # convert() returns an independent, fully loaded copy, so the source file
    # handle can be closed right away.
    with Image.open(input_path) as source:
        base_image = source.convert("RGBA")

    txt = Image.new("RGBA", base_image.size, (255, 255, 255, 0))

    try:
        font = ImageFont.truetype("arial.ttf", 40)
    except OSError:
        # Arial is not installed on every platform; fall back to Pillow's
        # built-in font instead of failing.
        font = ImageFont.load_default()

    d = ImageDraw.Draw(txt)
    for i in range(0, txt.size[0], 200):
        for j in range(0, txt.size[1], 200):
            d.text((i, j), watermark_text, font=font, fill=(255, 255, 255, 128))

    combined = Image.alpha_composite(base_image, txt)
    combined.convert("RGB").save(output_path)
# Example usage: write a copy of photo.jpg with the text "SAMPLE" tiled over it.
add_image_watermark("photo.jpg", "watermarked.jpg", "SAMPLE")
2. PDF水印
from PyPDF2 import PdfReader, PdfWriter
def add_pdf_watermark(input_pdf, output_pdf, watermark_text):
    """Merge a text watermark onto every page of a PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the watermarked PDF to write.
        watermark_text: Text rendered on the watermark page.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # The watermark is identical for all pages: render it once instead of
    # generating a new in-memory PDF per page.
    watermark_page = get_watermark_page(watermark_text)

    for page in reader.pages:
        page.merge_page(watermark_page)
        writer.add_page(page)

    with open(output_pdf, "wb") as f:
        writer.write(f)
def get_watermark_page(text):
    """Render ``text`` onto a one-page in-memory PDF and return that page.

    The text is drawn in 50pt Helvetica with a gray fill at alpha 0.3, at
    position (100, 100) after rotating the canvas by 45.

    Args:
        text: Watermark text to draw.

    Returns:
        The first page of the generated PDF, as read back by ``PdfReader``.
    """
    from io import BytesIO
    from reportlab.pdfgen import canvas

    buffer = BytesIO()
    pdf_canvas = canvas.Canvas(buffer)
    pdf_canvas.setFont("Helvetica", 50)
    pdf_canvas.setFillColorRGB(0.5, 0.5, 0.5, alpha=0.3)
    pdf_canvas.rotate(45)
    pdf_canvas.drawString(100, 100, text)
    pdf_canvas.save()

    # Rewind so the reader starts at the beginning of the generated PDF.
    buffer.seek(0)
    first_page = PdfReader(buffer).pages[0]
    return first_page
# Example usage: write a copy of document.pdf watermarked with "CONFIDENTIAL".
add_pdf_watermark("document.pdf", "watermarked.pdf", "CONFIDENTIAL")
五、文件净化
1. 清理临时文件
import os
import glob
def clean_temp_files(directory):
    """Delete temporary files (``*.tmp``, ``~*``, ``*.bak``) directly inside a directory.

    Only the top level of ``directory`` is scanned. Sub-directories whose
    names match a pattern are left untouched. Deletion failures are reported
    on stdout and do not stop the cleanup.

    Args:
        directory: Directory to clean.
    """
    patterns = ['*.tmp', '~*', '*.bak']
    # Escape glob metacharacters in the directory itself so that names such
    # as "data[1]" are matched literally rather than as a character class.
    safe_dir = glob.escape(directory)
    for pattern in patterns:
        for filepath in glob.glob(os.path.join(safe_dir, pattern)):
            # os.remove() cannot delete directories; skip them instead of
            # printing a spurious error (symlinks are still removed).
            if os.path.isdir(filepath) and not os.path.islink(filepath):
                continue
            try:
                os.remove(filepath)
            except OSError as e:
                print(f"Error deleting {filepath}: {e}")
# Example usage: remove temporary files found in ./documents.
clean_temp_files("./documents")
2. 敏感信息擦除(示例:清理CSV文件)
import csv
import re
def clean_csv_sensitive(input_file, output_file):
    """Copy a CSV file while masking ID-like values in every cell.

    Any substring of the form ``dddd-dd-dddd`` (digits) is replaced by
    ``[ID MASKED]``; all other content is copied unchanged.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the cleaned CSV file to write.
    """
    # Compile once instead of once per cell.
    id_pattern = re.compile(r'\b\d{4}-\d{2}-\d{4}\b')

    # The csv module requires newline='' on both files; without it, newlines
    # embedded in quoted fields are altered while reading.
    with open(input_file, 'r', newline='') as infile, \
            open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row in reader:
            writer.writerow([id_pattern.sub('[ID MASKED]', cell) for cell in row])
# Example usage: write a masked copy of data.csv to cleaned_data.csv.
clean_csv_sensitive("data.csv", "cleaned_data.csv")
六、文件分析
1. 文件系统分析
import os
from collections import defaultdict
def analyze_directory(directory):
    """Summarise a directory tree by file extension.

    Walks ``directory`` recursively, counts files and sums their sizes per
    lower-cased extension (``'no_ext'`` for files without one), prints one
    line per extension and returns the collected figures.

    Args:
        directory: Root directory to analyse.

    Returns:
        dict mapping extension -> ``{'count', 'size', 'size_mb'}``, where
        ``size`` is in bytes and ``size_mb`` is rounded to two decimals.
    """
    stats = defaultdict(lambda: {'count': 0, 'size': 0})
    for root, dirs, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                size = os.path.getsize(filepath)
            except OSError as e:
                # e.g. broken symlink or permission problem: report and move on.
                print(f"Error processing {filepath}: {e}")
                continue
            ext = os.path.splitext(file)[1].lower() or 'no_ext'
            stats[ext]['count'] += 1
            stats[ext]['size'] += size

    for ext, data in stats.items():
        data['size_mb'] = round(data['size'] / (1024*1024), 2)
        print(f"{ext}: {data['count']} files, {data['size_mb']}MB")

    # Hand back a plain dict so callers can reuse the numbers.
    return dict(stats)
# Example usage: print per-extension file statistics for ./documents.
analyze_directory("./documents")
2. 文件内容分析(示例:词频统计)
from collections import Counter
import re
def analyze_text_file(file_path, top_n=10):
    """Print and return the most frequent words of a UTF-8 text file.

    Words are case-insensitive runs of at least four word characters.

    Args:
        file_path: Path of the text file to analyse.
        top_n: Number of most frequent words to report.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    word_pattern = re.compile(r'\b\w{4,}\b')
    word_counts = Counter()

    # Count line by line so large files are never loaded into memory at once;
    # a word cannot span a line break, so the totals are unchanged.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(word_pattern.findall(line.lower()))

    top_words = word_counts.most_common(top_n)
    print(f"Top {top_n} frequent words:")
    for word, count in top_words:
        print(f"{word}: {count}")
    return top_words
# Example usage: print the most frequent words of document.txt (default top 10).
analyze_text_file("document.txt")
七、总结与扩展
最佳实践建议:
- 错误处理:在实际使用中需要添加try/except块处理文件操作异常
- 性能优化:处理大文件时使用流式处理(stream processing)
- 安全性:处理用户上传文件时进行格式验证和大小限制
- 扩展功能:
- 使用PyMuPDF处理更复杂的PDF操作
- 使用MoviePy处理视频文件
- 使用python-pptx处理PPT文档