在日常开发工作中,我们经常被重复性的事务工作消耗大量时间。今天分享3个我在实际工作中用到的Python脚本,覆盖文件分类整理、数据清洗、文件批量重命名等高频场景。
一、智能文件整理器:告别桌面混乱
自动按文件类型分类,并支持自定义规则:
import os
import shutil
from pathlib import Path
from datetime import datetime
class SmartOrganizer:
"""智能文件分类器,支持自定义规则与日志记录"""
# 文件类型映射规则(可扩展)
FILE_TYPES = {
'图片': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'],
'文档': ['.pdf', '.doc', '.docx', '.txt', '.md', '.xls', '.xlsx', '.ppt', '.pptx'],
'视频': ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv'],
'音频': ['.mp3', '.wav', '.flac', '.aac', '.ogg'],
'压缩包': ['.zip', '.rar', '.7z', '.tar', '.gz'],
'代码': ['.py', '.js', '.html', '.css', '.java', '.cpp', '.c', '.go', '.rs'],
'软件包': ['.exe', '.msi', '.dmg', '.pkg', '.deb']
}
def __init__(self, source_dir: str, target_dir: str = None):
self.source = Path(source_dir)
self.target = Path(target_dir) if target_dir else self.source / '已整理'
self.stats = {'moved': 0, 'skipped': 0, 'errors': []}
def get_category(self, filename: str) -> str:
"""根据扩展名判断文件类别"""
ext = Path(filename).suffix.lower()
for category, extensions in self.FILE_TYPES.items():
if ext in extensions:
return category
return '其他'
def organize(self, group_by_date: bool = True):
"""
执行整理
:param group_by_date: 是否按日期子文件夹分组
"""
if not self.source.exists():
raise FileNotFoundError(f"源目录不存在: {self.source}")
self.target.mkdir(parents=True, exist_ok=True)
for file_path in self.source.iterdir():
if file_path.is_file() and file_path != Path(__file__):
try:
self._process_file(file_path, group_by_date)
except Exception as e:
self.stats['errors'].append(f"{file_path.name}: {str(e)}")
self._generate_report()
def _process_file(self, file_path: Path, group_by_date: bool):
"""处理单个文件"""
category = self.get_category(file_path.name)
# 构建目标路径
dest_dir = self.target / category
if group_by_date:
# 按修改日期分组:2026-04/10
mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
date_folder = f"{mtime.year}-{mtime.month:02d}"
dest_dir = dest_dir / date_folder
dest_dir.mkdir(parents=True, exist_ok=True)
dest_path = dest_dir / file_path.name
# 处理重名文件
counter = 1
original_dest = dest_path
while dest_path.exists():
stem = original_dest.stem
suffix = original_dest.suffix
dest_path = original_dest.with_name(f"{stem}_{counter}{suffix}")
counter += 1
shutil.move(str(file_path), str(dest_path))
self.stats['moved'] += 1
print(f"✓ {file_path.name} -> {category}/")
def _generate_report(self):
"""生成整理报告"""
report = f"""
整理完成报告
=================
已移动文件: {self.stats['moved']}
跳过文件: {self.stats['skipped']}
错误数量: {len(self.stats['errors'])}
输出目录: {self.target.absolute()}
"""
print(report)
if self.stats['errors']:
print("\n⚠️ 错误详情:")
for error in self.stats['errors']:
print(f" - {error}")
# 使用示例
if __name__ == "__main__":
# 整理下载文件夹
organizer = SmartOrganizer(
source_dir=os.path.expanduser("~/Downloads"),
target_dir=os.path.expanduser("~/Downloads/已整理")
)
organizer.organize(group_by_date=True)
二、Excel数据清洗助手:数据处理不再头疼
自动化数据清洗管道,支持常见脏数据场景:
import pandas as pd
import numpy as np
from typing import List, Dict, Callable
import re
class ExcelCleaner:
"""Excel数据清洗流水线"""
def __init__(self, file_path: str):
self.df = pd.read_excel(file_path)
self.original_shape = self.df.shape
self.operations_log = []
def clean_pipeline(self, config: Dict) -> pd.DataFrame:
"""
执行清洗流水线
:param config: 配置字典,控制各步骤是否执行
"""
steps = [
('去除完全空行', self.remove_empty_rows, config.get('remove_empty_rows', True)),
('去除完全空列', self.remove_empty_cols, config.get('remove_empty_cols', True)),
('去除重复行', self.remove_duplicates, config.get('remove_duplicates', True)),
('标准化列名', self.standardize_columns, config.get('standardize_columns', True)),
('处理缺失值', self.handle_missing, config.get('handle_missing', 'fill')),
('数据类型转换', self.convert_types, config.get('convert_types', {})),
('字符串清理', self.clean_strings, config.get('clean_strings', True)),
]
for desc, func, enabled in steps:
if enabled:
try:
func() if not isinstance(enabled, dict) else func(**enabled)
self.operations_log.append(f"✓ {desc}")
except Exception as e:
self.operations_log.append(f"✗ {desc}: {str(e)}")
return self.df
def remove_empty_rows(self):
"""去除完全为空的行"""
self.df.dropna(how='all', inplace=True)
def remove_empty_cols(self):
"""去除完全为空的列"""
self.df.dropna(axis=1, how='all', inplace=True)
def remove_duplicates(self, subset: List[str] = None, keep: str = 'first'):
"""
去除重复行
:param subset: 基于哪些列判断重复,None表示所有列
:param keep: 'first'(保留首次)/'last'(保留末次)/False(全部删除)
"""
before = len(self.df)
self.df.drop_duplicates(subset=subset, keep=keep, inplace=True)
removed = before - len(self.df)
if removed > 0:
self.operations_log.append(f" 移除 {removed} 行重复数据")
def standardize_columns(self):
"""标准化列名:去空格、转小写、替换特殊字符"""
def clean_col(name):
name = str(name).strip().lower()
name = re.sub(r'\s+', '_', name) # 空格转下划线
name = re.sub(r'[^\w]', '', name) # 去除特殊字符
return name
self.df.columns = [clean_col(col) for col in self.df.columns]
def handle_missing(self, strategy: str = 'fill', fill_value = 'N/A'):
"""
处理缺失值
:param strategy: 'drop'(删除)/'fill'(填充)/'interpolate'(插值)
"""
if strategy == 'drop':
self.df.dropna(inplace=True)
elif strategy == 'fill':
# 针对不同类型填充不同值
for col in self.df.columns:
if self.df[col].dtype == 'object':
self.df[col].fillna(fill_value, inplace=True)
else:
self.df[col].fillna(0, inplace=True)
elif strategy == 'interpolate':
self.df.interpolate(method='linear', inplace=True)
def convert_types(self, type_map: Dict[str, str]):
"""
转换数据类型
:param type_map: {'列名': '目标类型', ...}
"""
for col, dtype in type_map.items():
if col in self.df.columns:
try:
if dtype == 'datetime':
self.df[col] = pd.to_datetime(self.df[col], errors='coerce')
elif dtype == 'numeric':
self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
elif dtype == 'category':
self.df[col] = self.df[col].astype('category')
else:
self.df[col] = self.df[col].astype(dtype)
except Exception as e:
print(f"类型转换失败 {col}: {e}")
def clean_strings(self):
"""清理字符串:去除首尾空格、统一换行符"""
for col in self.df.select_dtypes(include=['object']).columns:
self.df[col] = self.df[col].astype(str).str.strip()
self.df[col] = self.df[col].str.replace(r'\r\n|\r|\n', ' ', regex=True)
def add_calculated_column(self, name: str, formula: Callable):
"""添加计算列"""
self.df[name] = self.df.apply(formula, axis=1)
self.operations_log.append(f"✓ 添加计算列: {name}")
def export(self, output_path: str, sheet_name: str = '清洗后数据'):
"""导出结果"""
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
self.df.to_excel(writer, sheet_name=sheet_name, index=False)
# 添加清洗报告sheet
report_df = pd.DataFrame({
'项目': ['原始行数', '原始列数', '处理后行数', '处理后列数', '操作记录'],
'值': [
self.original_shape[0],
self.original_shape[1],
self.df.shape[0],
self.df.shape[1],
'\n'.join(self.operations_log)
]
})
report_df.to_excel(writer, sheet_name='清洗报告', index=False)
print(f"✓ 已导出至: {output_path}")
print(f"清洗报告:\n" + "\n".join(self.operations_log))
# 使用示例
if __name__ == "__main__":
# 假设有一个销售数据Excel,包含脏数据
cleaner = ExcelCleaner("raw_sales_data.xlsx")
config = {
'remove_empty_rows': True,
'remove_duplicates': True,
'standardize_columns': True,
'handle_missing': 'fill', # 填充缺失值
'convert_types': {
'order_date': 'datetime',
'amount': 'numeric',
'customer_id': 'category'
},
'clean_strings': True
}
# 执行清洗
clean_df = cleaner.clean_pipeline(config)
# 添加计算列示例:计算折扣后金额
if 'amount' in clean_df.columns and 'discount' in clean_df.columns:
cleaner.add_calculated_column(
'final_amount',
lambda row: row['amount'] * (1 - row['discount']) if pd.notna(row['discount']) else row['amount']
)
# 导出
cleaner.export("cleaned_sales_data.xlsx")
三、批量重命名文件:告别手残党
工作中经常遇到需要批量重命名文件的场景,比如整理照片、规范日志文件名等。
import os
import re
from pathlib import Path
def batch_rename(directory, old_pattern, new_pattern, dry_run=True):
"""
批量重命名文件
:param directory: 目标目录
:param old_pattern: 旧文件名匹配模式(正则)
:param new_pattern: 新文件名替换模板,可用 \\1, \\2 引用捕获组
:param dry_run: 是否为试运行模式(默认True,只打印不执行)
"""
path = Path(directory)
if not path.exists():
print(f"❌ 目录不存在: {directory}")
return
renamed_count = 0
for file in path.iterdir():
if file.is_file():
new_name = re.sub(old_pattern, new_pattern, file.name)
if new_name != file.name:
new_path = file.parent / new_name
if dry_run:
print(f"[试运行] {file.name} -> {new_name}")
else:
file.rename(new_path)
print(f"✅ 已重命名: {file.name} -> {new_name}")
renamed_count += 1
print(f"\n总计: {renamed_count} 个文件")
if dry_run:
print("� 这是试运行模式,添加 dry_run=False 参数以实际执行")
# 使用示例:将 "IMG_2026410_123456.jpg" 重命名为 "2026-04-10_123456.jpg"
if __name__ == "__main__":
batch_rename(
directory="./photos",
old_pattern=r"IMG_(\d{4})(\d{2})(\d{2})_(\d+)\.jpg",
new_pattern=r"\1-\2-\3_\4.jpg",
dry_run=True # 先试运行,确认无误后改为 False
)