用Python实现自动化办公:批量处理Excel/PDF的10个技巧

3 阅读4分钟

摘要:每天花2小时做重复的Excel操作?Python可以帮你压缩到2分钟。本文分享10个实用的自动化办公技巧,覆盖Excel读写、PDF处理、文件批量操作等高频场景。

准备工作

pip install openpyxl pandas pdfplumber reportlab python-docx

技巧1:批量合并多个Excel文件

最常见的需求——把几十个Excel汇总成一个:

import pandas as pd
from pathlib import Path

def merge_excel(folder, output='merged.xlsx'):
    files = list(Path(folder).glob('*.xlsx'))
    dfs = []
    for f in files:
        df = pd.read_excel(f)
        df['来源文件'] = f.name  # 标记来源
        dfs.append(df)
    
    result = pd.concat(dfs, ignore_index=True)
    result.to_excel(output, index=False)
    print(f'合并完成:{len(files)}个文件 → {len(result)}行数据')

merge_excel('./monthly_reports/')

技巧2:按条件拆分Excel

反过来,把一个大表按部门/类别拆成多个文件:

def split_excel(input_file, column, output_dir='output'):
    Path(output_dir).mkdir(exist_ok=True)
    df = pd.read_excel(input_file)
    
    for value, group in df.groupby(column):
        filename = f'{output_dir}/{value}.xlsx'
        group.to_excel(filename, index=False)
        print(f'{value}: {len(group)}行 → {filename}')

split_excel('employees.xlsx', '部门')

技巧3:Excel数据清洗一条龙

def clean_data(input_file, output_file='cleaned.xlsx'):
    df = pd.read_excel(input_file)
    
    # 去除空行
    df.dropna(how='all', inplace=True)
    
    # 去除前后空格
    str_cols = df.select_dtypes(include='object').columns
    df[str_cols] = df[str_cols].apply(lambda x: x.str.strip())
    
    # 去重
    before = len(df)
    df.drop_duplicates(inplace=True)
    print(f'去重:{before}{len(df)}行')
    
    # 统一日期格式
    for col in df.columns:
        if '日期' in col or 'date' in col.lower():
            df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # 手机号脱敏
    if '手机' in df.columns:
        df['手机'] = df['手机'].astype(str).apply(
            lambda x: x[:3] + '****' + x[7:] if len(x) == 11 else x
        )
    
    df.to_excel(output_file, index=False)
    print(f'清洗完成 → {output_file}')

clean_data('raw_data.xlsx')

技巧4:自动生成数据报表

from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.chart import BarChart, Reference

def create_report(data, output='report.xlsx'):
    wb = Workbook()
    ws = wb.active
    ws.title = '月度报表'
    
    # 标题样式
    title_font = Font(name='微软雅黑', size=14, bold=True)
    header_fill = PatternFill(start_color='4472C4', fill_type='solid')
    header_font = Font(name='微软雅黑', size=11, bold=True, color='FFFFFF')
    
    # 写标题
    ws.merge_cells('A1:D1')
    ws['A1'] = '2026年2月销售报表'
    ws['A1'].font = title_font
    ws['A1'].alignment = Alignment(horizontal='center')
    
    # 写表头
    headers = ['部门', '销售额', '目标', '完成率']
    for col, header in enumerate(headers, 1):
        cell = ws.cell(row=3, column=col, value=header)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = Alignment(horizontal='center')
    
    # 写数据
    for row_idx, row_data in enumerate(data, 4):
        for col_idx, value in enumerate(row_data, 1):
            ws.cell(row=row_idx, column=col_idx, value=value)
    
    # 添加图表
    chart = BarChart()
    chart.title = '各部门销售额'
    chart.y_axis.title = '金额(万元)'
    data_ref = Reference(ws, min_col=2, min_row=3, max_row=3+len(data), max_col=3)
    cats = Reference(ws, min_col=1, min_row=4, max_row=3+len(data))
    chart.add_data(data_ref, titles_from_data=True)
    chart.set_categories(cats)
    ws.add_chart(chart, 'F3')
    
    # 调整列宽
    for col in ['A', 'B', 'C', 'D']:
        ws.column_dimensions[col].width = 15
    
    wb.save(output)
    print(f'报表生成 → {output}')

data = [
    ['技术部', 150, 120, '125%'],
    ['市场部', 200, 180, '111%'],
    ['运营部', 80, 100, '80%'],
    ['销售部', 300, 250, '120%'],
]
create_report(data)

技巧5:PDF文本提取

import pdfplumber

def extract_pdf_text(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        all_text = []
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:
                all_text.append(f'--- 第{i+1}页 ---\n{text}')
        return '\n\n'.join(all_text)

text = extract_pdf_text('contract.pdf')
print(text)

技巧6:PDF表格提取为Excel

import pdfplumber
import pandas as pd

def pdf_tables_to_excel(pdf_path, output='tables.xlsx'):
    all_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            for table in tables:
                df = pd.DataFrame(table[1:], columns=table[0])
                all_tables.append(df)
    
    if all_tables:
        with pd.ExcelWriter(output) as writer:
            for i, df in enumerate(all_tables):
                df.to_excel(writer, sheet_name=f'表格{i+1}', index=False)
        print(f'提取了{len(all_tables)}个表格 → {output}')
    else:
        print('未找到表格')

pdf_tables_to_excel('financial_report.pdf')

技巧7:批量PDF转图片

import subprocess
from pathlib import Path

def pdf_to_images(pdf_path, output_dir='images', dpi=200):
    """需要安装poppler: apt install poppler-utils"""
    Path(output_dir).mkdir(exist_ok=True)
    subprocess.run([
        'pdftoppm', '-png', '-r', str(dpi),
        pdf_path, f'{output_dir}/page'
    ])
    images = list(Path(output_dir).glob('*.png'))
    print(f'转换完成:{len(images)}张图片')

技巧8:批量文件重命名

from pathlib import Path
import re

def batch_rename(folder, pattern, replacement):
    """正则批量重命名"""
    folder = Path(folder)
    renamed = 0
    for f in folder.iterdir():
        new_name = re.sub(pattern, replacement, f.name)
        if new_name != f.name:
            f.rename(folder / new_name)
            print(f'{f.name}{new_name}')
            renamed += 1
    print(f'重命名了{renamed}个文件')

# 示例:把"报告_2026_01.xlsx"改成"2026-01-报告.xlsx"
batch_rename('./reports/', r'(.+)_(\d{4})_(\d{2})', r'\2-\3-\1')

技巧9:自动发送邮件报表

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders

def send_report(to_email, subject, body, attachment_path,
                smtp_host='smtp.example.com', smtp_port=465,
                username='[email]', password='[password]'):
    msg = MIMEMultipart()
    msg['From'] = username
    msg['To'] = to_email
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'html', 'utf-8'))
    
    # 添加附件
    with open(attachment_path, 'rb') as f:
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(f.read())
        encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                       f'attachment; filename="{Path(attachment_path).name}"')
        msg.attach(part)
    
    with smtplib.SMTP_SSL(smtp_host, smtp_port) as server:
        server.login(username, password)
        server.send_message(msg)
    print(f'邮件已发送至 {to_email}')

技巧10:定时自动执行

把上面的脚本串起来,用schedule库定时执行:

import schedule
import time

def daily_job():
    print('开始执行日报任务...')
    # 1. 合并当天数据
    merge_excel('./today_data/')
    # 2. 清洗数据
    clean_data('merged.xlsx')
    # 3. 生成报表
    create_report(load_data('cleaned.xlsx'))
    # 4. 发送邮件
    send_report('[email]', '日报', '<h1>今日报表</h1>', 'report.xlsx')
    print('日报任务完成!')

schedule.every().day.at('09:00').do(daily_job)

while True:
    schedule.run_pending()
    time.sleep(60)

总结

这10个技巧覆盖了日常办公自动化的大部分场景。核心思路就是:把重复操作抽象成函数,用Python批量执行。

一个建议:先从最痛的点开始自动化。如果你每天花1小时合并Excel,那就先自动化这个。等尝到甜头了,再逐步扩展到其他场景。

Python自动化办公的投入产出比极高——花半天写个脚本,能省下未来几百个小时的重复劳动。