Python自动化办公:3个让你提前下班的实用脚本

47 阅读5分钟

在日常开发工作中,我们经常被重复性的事务工作消耗大量时间。今天分享3个我在实际工作中用到的Python脚本,覆盖文件分类整理、数据清洗、文件批量重命名等高频场景。

一、智能文件整理器:告别桌面混乱

自动按文件类型分类,并支持自定义规则:

import os
import shutil
from pathlib import Path
from datetime import datetime

class SmartOrganizer:
    """智能文件分类器,支持自定义规则与日志记录"""
    
    # 文件类型映射规则(可扩展)
    FILE_TYPES = {
        '图片': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp'],
        '文档': ['.pdf', '.doc', '.docx', '.txt', '.md', '.xls', '.xlsx', '.ppt', '.pptx'],
        '视频': ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv'],
        '音频': ['.mp3', '.wav', '.flac', '.aac', '.ogg'],
        '压缩包': ['.zip', '.rar', '.7z', '.tar', '.gz'],
        '代码': ['.py', '.js', '.html', '.css', '.java', '.cpp', '.c', '.go', '.rs'],
        '软件包': ['.exe', '.msi', '.dmg', '.pkg', '.deb']
    }
    
    def __init__(self, source_dir: str, target_dir: str = None):
        self.source = Path(source_dir)
        self.target = Path(target_dir) if target_dir else self.source / '已整理'
        self.stats = {'moved': 0, 'skipped': 0, 'errors': []}
        
    def get_category(self, filename: str) -> str:
        """根据扩展名判断文件类别"""
        ext = Path(filename).suffix.lower()
        for category, extensions in self.FILE_TYPES.items():
            if ext in extensions:
                return category
        return '其他'
    
    def organize(self, group_by_date: bool = True):
        """
        执行整理
        :param group_by_date: 是否按日期子文件夹分组
        """
        if not self.source.exists():
            raise FileNotFoundError(f"源目录不存在: {self.source}")
        
        self.target.mkdir(parents=True, exist_ok=True)
        
        for file_path in self.source.iterdir():
            if file_path.is_file() and file_path != Path(__file__):
                try:
                    self._process_file(file_path, group_by_date)
                except Exception as e:
                    self.stats['errors'].append(f"{file_path.name}: {str(e)}")
        
        self._generate_report()
    
    def _process_file(self, file_path: Path, group_by_date: bool):
        """处理单个文件"""
        category = self.get_category(file_path.name)
        
        # 构建目标路径
        dest_dir = self.target / category
        if group_by_date:
            # 按修改日期分组:2026-04/10
            mtime = datetime.fromtimestamp(file_path.stat().st_mtime)
            date_folder = f"{mtime.year}-{mtime.month:02d}"
            dest_dir = dest_dir / date_folder
        
        dest_dir.mkdir(parents=True, exist_ok=True)
        dest_path = dest_dir / file_path.name
        
        # 处理重名文件
        counter = 1
        original_dest = dest_path
        while dest_path.exists():
            stem = original_dest.stem
            suffix = original_dest.suffix
            dest_path = original_dest.with_name(f"{stem}_{counter}{suffix}")
            counter += 1
        
        shutil.move(str(file_path), str(dest_path))
        self.stats['moved'] += 1
        print(f"✓ {file_path.name} -> {category}/")
    
    def _generate_report(self):
        """生成整理报告"""
        report = f"""
        整理完成报告
        =================
        已移动文件: {self.stats['moved']}
        跳过文件:   {self.stats['skipped']}
        错误数量:   {len(self.stats['errors'])}
        输出目录:   {self.target.absolute()}
        """
        print(report)
        
        if self.stats['errors']:
            print("\n⚠️  错误详情:")
            for error in self.stats['errors']:
                print(f"  - {error}")

# 使用示例
if __name__ == "__main__":
    # 整理下载文件夹
    organizer = SmartOrganizer(
        source_dir=os.path.expanduser("~/Downloads"),
        target_dir=os.path.expanduser("~/Downloads/已整理")
    )
    organizer.organize(group_by_date=True)

二、Excel数据清洗助手:数据处理不再头疼

自动化数据清洗管道,支持常见脏数据场景:

import pandas as pd
import numpy as np
from typing import List, Dict, Callable
import re

class ExcelCleaner:
    """Excel数据清洗流水线"""
    
    def __init__(self, file_path: str):
        self.df = pd.read_excel(file_path)
        self.original_shape = self.df.shape
        self.operations_log = []
        
    def clean_pipeline(self, config: Dict) -> pd.DataFrame:
        """
        执行清洗流水线
        :param config: 配置字典,控制各步骤是否执行
        """
        steps = [
            ('去除完全空行', self.remove_empty_rows, config.get('remove_empty_rows', True)),
            ('去除完全空列', self.remove_empty_cols, config.get('remove_empty_cols', True)),
            ('去除重复行', self.remove_duplicates, config.get('remove_duplicates', True)),
            ('标准化列名', self.standardize_columns, config.get('standardize_columns', True)),
            ('处理缺失值', self.handle_missing, config.get('handle_missing', 'fill')),
            ('数据类型转换', self.convert_types, config.get('convert_types', {})),
            ('字符串清理', self.clean_strings, config.get('clean_strings', True)),
        ]
        
        for desc, func, enabled in steps:
            if enabled:
                try:
                    func() if not isinstance(enabled, dict) else func(**enabled)
                    self.operations_log.append(f"✓ {desc}")
                except Exception as e:
                    self.operations_log.append(f"✗ {desc}: {str(e)}")
        
        return self.df
    
    def remove_empty_rows(self):
        """去除完全为空的行"""
        self.df.dropna(how='all', inplace=True)
    
    def remove_empty_cols(self):
        """去除完全为空的列"""
        self.df.dropna(axis=1, how='all', inplace=True)
    
    def remove_duplicates(self, subset: List[str] = None, keep: str = 'first'):
        """
        去除重复行
        :param subset: 基于哪些列判断重复,None表示所有列
        :param keep: 'first'(保留首次)/'last'(保留末次)/False(全部删除)
        """
        before = len(self.df)
        self.df.drop_duplicates(subset=subset, keep=keep, inplace=True)
        removed = before - len(self.df)
        if removed > 0:
            self.operations_log.append(f"  移除 {removed} 行重复数据")
    
    def standardize_columns(self):
        """标准化列名:去空格、转小写、替换特殊字符"""
        def clean_col(name):
            name = str(name).strip().lower()
            name = re.sub(r'\s+', '_', name)  # 空格转下划线
            name = re.sub(r'[^\w]', '', name)  # 去除特殊字符
            return name
        
        self.df.columns = [clean_col(col) for col in self.df.columns]
    
    def handle_missing(self, strategy: str = 'fill', fill_value = 'N/A'):
        """
        处理缺失值
        :param strategy: 'drop'(删除)/'fill'(填充)/'interpolate'(插值)
        """
        if strategy == 'drop':
            self.df.dropna(inplace=True)
        elif strategy == 'fill':
            # 针对不同类型填充不同值
            for col in self.df.columns:
                if self.df[col].dtype == 'object':
                    self.df[col].fillna(fill_value, inplace=True)
                else:
                    self.df[col].fillna(0, inplace=True)
        elif strategy == 'interpolate':
            self.df.interpolate(method='linear', inplace=True)
    
    def convert_types(self, type_map: Dict[str, str]):
        """
        转换数据类型
        :param type_map: {'列名': '目标类型', ...}
        """
        for col, dtype in type_map.items():
            if col in self.df.columns:
                try:
                    if dtype == 'datetime':
                        self.df[col] = pd.to_datetime(self.df[col], errors='coerce')
                    elif dtype == 'numeric':
                        self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
                    elif dtype == 'category':
                        self.df[col] = self.df[col].astype('category')
                    else:
                        self.df[col] = self.df[col].astype(dtype)
                except Exception as e:
                    print(f"类型转换失败 {col}: {e}")
    
    def clean_strings(self):
        """清理字符串:去除首尾空格、统一换行符"""
        for col in self.df.select_dtypes(include=['object']).columns:
            self.df[col] = self.df[col].astype(str).str.strip()
            self.df[col] = self.df[col].str.replace(r'\r\n|\r|\n', ' ', regex=True)
    
    def add_calculated_column(self, name: str, formula: Callable):
        """添加计算列"""
        self.df[name] = self.df.apply(formula, axis=1)
        self.operations_log.append(f"✓ 添加计算列: {name}")
    
    def export(self, output_path: str, sheet_name: str = '清洗后数据'):
        """导出结果"""
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            self.df.to_excel(writer, sheet_name=sheet_name, index=False)
            
            # 添加清洗报告sheet
            report_df = pd.DataFrame({
                '项目': ['原始行数', '原始列数', '处理后行数', '处理后列数', '操作记录'],
                '值': [
                    self.original_shape[0],
                    self.original_shape[1],
                    self.df.shape[0],
                    self.df.shape[1],
                    '\n'.join(self.operations_log)
                ]
            })
            report_df.to_excel(writer, sheet_name='清洗报告', index=False)
        
        print(f"✓ 已导出至: {output_path}")
        print(f"清洗报告:\n" + "\n".join(self.operations_log))

# 使用示例
if __name__ == "__main__":
    # 假设有一个销售数据Excel,包含脏数据
    cleaner = ExcelCleaner("raw_sales_data.xlsx")
    
    config = {
        'remove_empty_rows': True,
        'remove_duplicates': True,
        'standardize_columns': True,
        'handle_missing': 'fill',  # 填充缺失值
        'convert_types': {
            'order_date': 'datetime',
            'amount': 'numeric',
            'customer_id': 'category'
        },
        'clean_strings': True
    }
    
    # 执行清洗
    clean_df = cleaner.clean_pipeline(config)
    
    # 添加计算列示例:计算折扣后金额
    if 'amount' in clean_df.columns and 'discount' in clean_df.columns:
        cleaner.add_calculated_column(
            'final_amount',
            lambda row: row['amount'] * (1 - row['discount']) if pd.notna(row['discount']) else row['amount']
        )
    
    # 导出
    cleaner.export("cleaned_sales_data.xlsx")

三、批量重命名文件:告别手残党

工作中经常遇到需要批量重命名文件的场景,比如整理照片、规范日志文件名等。

import os
import re
from pathlib import Path

def batch_rename(directory, old_pattern, new_pattern, dry_run=True):
    """
    批量重命名文件
    :param directory: 目标目录
    :param old_pattern: 旧文件名匹配模式(正则)
    :param new_pattern: 新文件名替换模板,可用 \\1, \\2 引用捕获组
    :param dry_run: 是否为试运行模式(默认True,只打印不执行)
    """
    path = Path(directory)
    if not path.exists():
        print(f"❌ 目录不存在: {directory}")
        return
    
    renamed_count = 0
    
    for file in path.iterdir():
        if file.is_file():
            new_name = re.sub(old_pattern, new_pattern, file.name)
            if new_name != file.name:
                new_path = file.parent / new_name
                if dry_run:
                    print(f"[试运行] {file.name} -> {new_name}")
                else:
                    file.rename(new_path)
                    print(f"✅ 已重命名: {file.name} -> {new_name}")
                renamed_count += 1
    
    print(f"\n总计: {renamed_count} 个文件")
    if dry_run:
        print("� 这是试运行模式,添加 dry_run=False 参数以实际执行")

# 使用示例:将 "IMG_2026410_123456.jpg" 重命名为 "2026-04-10_123456.jpg"
if __name__ == "__main__":
    batch_rename(
        directory="./photos",
        old_pattern=r"IMG_(\d{4})(\d{2})(\d{2})_(\d+)\.jpg",
        new_pattern=r"\1-\2-\3_\4.jpg",
        dry_run=True  # 先试运行,确认无误后改为 False
    )