python自动化-word文档读写

422 阅读7分钟

「这是我参与2022首次更文挑战的第27天,活动详情查看:2022首次更文挑战」

使用库:python-docx

安装:pip3 install python-docx

官方文档: python-docx.readthedocs.io/en/latest/#

!pip3 install python-docx
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
     |████████████████████████████████| 5.6 MB 143 kB/s 
Collecting lxml>=2.3.2
  Downloading lxml-4.6.3-cp39-cp39-macosx_10_9_x86_64.whl (4.6 MB)
     |████████████████████████████████| 4.6 MB 143 kB/s 
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... done
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184600 sha256=1c87605adcd69ddfb607ab8adbe12a22072acb88e6d4b92da91dfee86b112b42
  Stored in directory: /Users/lichizou/Library/Caches/pip/wheels/83/8b/7c/09ae60c42c7ba4ed2dddaf2b8b9186cb105255856d6ed3dba5
Successfully built python-docx
Installing collected packages: lxml, python-docx
Successfully installed lxml-4.6.3 python-docx-0.8.11

word结构与python对象

文档:Document

整个文章是一个Document对象,对象里包含多个段落对象Paragraph,放在document.paragraphs

段落:Paragraph

一个段落对象Paragraph含有多个文字块run对象,放在paragraph.runs

文字块:Run

docx文档的最小单位,同一个Run对象内的文本样式一致

from docx import Document
# 新建文档
doc_1 = Document()
?doc_1.
Type:        Document
String form: <docx.document.Document object at 0x112138d00>
File:        /usr/local/lib/python3.9/site-packages/docx/document.py
Docstring:  
WordprocessingML (WML) document.

Not intended to be constructed directly. Use :func:`docx.Document` to open or create
a document.
doc_1.paragraphs
[]

标题

# 添加标题
doc_1.add_heading('1级标题', level = 0)
doc_1.add_heading('2级标题', level = 1)
doc_1.add_heading('3级标题', level = 2)
<docx.text.paragraph.Paragraph at 0x10e9d0880>

段落

数据对象:paragraph

段落的换行符需要手动添加

# Add a new paragraph
paragraph_1 = doc_1.add_paragraph('段落1')
# Add a second paragraph, then insert another paragraph before paragraph_1
paragraph_2 = doc_1.add_paragraph('段落2')
prior_paragraph = paragraph_1.insert_paragraph_before('段落1前')

缩进和间距

# Paragraph alignment options: left, right, justified, centered, distributed
from docx.enum.text import WD_ALIGN_PARAGRAPH
# LEFT: left-aligned
# CENTER: centered
# RIGHT: right-aligned
# JUSTIFY: justified on both sides

# Use the enum name that is actually imported above, and apply it to a
# paragraph object that exists at this point (paragraph_1).
paragraph_1.alignment = WD_ALIGN_PARAGRAPH.LEFT

间距:段前,段后,行距

from docx.enum.text import WD_LINE_SPACING
from docx.shared import Pt

# SINGLE: single line spacing (default)
# ONE_POINT_FIVE: 1.5x line spacing
# DOUBLE: double line spacing
# AT_LEAST: minimum value
# EXACTLY: fixed value
# MULTIPLE: multiple of single line spacing

# Line spacing
# The spacing properties live on paragraph_format; assigning them directly on
# the Paragraph object would not change the document.
spacing_format = paragraph_1.paragraph_format
spacing_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY  # fixed value
spacing_format.line_spacing = Pt(18)  # fixed at 18 pt
spacing_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE  # multiple spacing
spacing_format.line_spacing = 1.75  # 1.75x line spacing

换行和分页

# Insert a page break first; without it the new paragraph would just follow
# the existing content instead of starting on the next page.
doc_1.add_page_break()
# Add one paragraph after the page break
paragraph_3 = doc_1.add_paragraph('第二页第1个段落')

多节页眉与页脚的设置

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
# 页眉与页脚
doc = Document()
header = doc.sections[0].header
print('页眉默认段落数:', len(header.paragraphs))
页眉默认段落数: 1
par = header.paragraphs[0]
par.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
par.add_run('第一页页眉')
<docx.text.run.Run at 0x117ccac10>
footer = doc.sections[0].footer
par = footer.paragraphs[0]
par.add_run('第一页的页脚')
par.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
# 默认之后的节的页眉与页脚样式和内容与前一节相同
doc.add_section()
doc.add_section()
<docx.section.Section at 0x117d43250>
# 第二页页眉设置居中
header = doc.sections[1].header
header.is_linked_to_previous = False # 第二个节页眉的样式与内容与第一个节不同
par = header.paragraphs[0]
par.add_run('第二页页眉')
par.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
# 第二页页脚设置居中
footer = doc.sections[1].footer
footer.is_linked_to_previous = False
par = footer.paragraphs[0]
par.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
par.add_run('第二页页脚')
<docx.text.run.Run at 0x117ccaa00>
# Third section: right-aligned header
header = doc.sections[2].header
header.is_linked_to_previous = False # unlink so the third section's header is not tied to the previous section
par = header.paragraphs[0]
par.add_run('第三页页眉')
par.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
# Third section: right-aligned footer
footer = doc.sections[2].footer
footer.is_linked_to_previous = False
par = footer.paragraphs[0]
par.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT # take care not to misspell the alignment constant
par.add_run('第三页页脚')
<docx.text.run.Run at 0x117cc56d0>
doc.save('doc_14.docx')

字体

from docx.shared import RGBColor,Pt
#all_caps:全部大写字母
#bold:加粗
#color:字体颜色

#double_strike:双删除线
#hidden : 隐藏
#imprint : 印记
#italic : 斜体
#name  :字体
#shadow  :阴影
#strike  :  删除线
#subscript  :下标	
#superscript  :上标
#underline  :下划线
# 对段落1设置字体样式
paragraph_1.add_run('粗体').bold = True
paragraph_1.add_run('普通字体')
<docx.text.run.Run at 0x10e9d0730>
paragraph_1.add_run('斜体').italic = True

插入:图片或表格

# Inches is needed for the picture size and has not been imported before this point.
from docx.shared import Inches

doc = Document('doc_5.docx')
# Insert a picture sized 1.0 x 1.0 inch
doc.add_picture('./图片/周杰伦.jpg', width=Inches(1.0), height=Inches(1.0))
<docx.shape.InlineShape at 0x11842a400>
# 插入表格
table = doc.add_table(rows=2, cols=1)
table.style = 'Medium Grid 1 Accent 1' # and: Light Shading Accent 1
# 填写表格单元格内容
table.cell(0, 0).text = 'row1, col1'
table.rows[1].cells[0].text = 'row2, col1'
# 给表格增加一行
new_row = table.add_row().cells
new_row[0].text = 'row3, col1'
# 给表格增加一列
help(table.add_column)
Help on method add_column in module docx.table:

add_column(width) method of docx.table.Table instance
    Return a |_Column| object of *width*, newly added rightmost to the
    table.
doc.save('doc_7.docx')

案例2:整体页面结构


from docx import Document
from docx.shared import RGBColor, Pt, Inches, Cm
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
doc_2 = Document()
# 设置全局字体
doc_2.styles['Normal'].font.name = '宋体'
# 中文字体
doc_2.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
# 添加标题
heading_2 = doc_2.add_heading('title 1', level = 0)
heading_2.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER # default left
# 新增段落
paragraph_2_1 = doc_2.add_paragraph()
# 设置段落格式
paragraph_2_1.paragraph_format.first_line_indent = Cm(0.75) # 首行缩进0.75cm
paragraph_2_1.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT # 居左对齐
paragraph_2_1.paragraph_format.space_after = Inches(1.0) # 段落后距离1.0英寸,72磅
paragraph_2_1.paragraph_format.line_spacing = 1.5 # 1.5倍行距
# Sample paragraph content: a short summary of the random module.
# Fixed: the seeding function is seed() (not seek()), and "随即" -> "随机".
text = """random库函数:
1、seed() 种子,默认种子是系统时钟
2、random()函数,生成0到1的随机小数
3、uniform(a,b)生成a到b的随机小数
4、randint(a,b)生成一个a到b的随机整数
5、randrange(a,b,c) 生成一个a到b,以c递增的数
6、choice(<list>) 随机返回一个列表里面的元素
7、shuffle(<list>)将列表的元素随机打乱
8、sample(<list>,k)从列表中随机抽取k个元素
"""
r_2 = paragraph_2_1.add_run(text)
r_2.font.size = Pt(10)
r_2.font.bold = True
r_2.font.color.rgb = RGBColor(255, 0, 0)
doc_2.save('doc_2.docx')

案例3-字体设置

from docx import Document
from docx.oxml.ns import qn
from docx.enum.style import WD_STYLE_TYPE
doc_3 = Document() # 新建docx文档
# 设置文档里的可选字体样式
def add_font_style(doc, font_name_list):
    """Register a character style for each listed font that the document lacks.

    For every name in ``font_name_list`` that is not yet in ``doc.styles``,
    a character style with that name is added, its font name is set to the
    same string, and the style's ``w:eastAsia`` font attribute is set to it
    as well. Names already present in ``doc.styles`` are skipped.
    """
    for font_name in font_name_list:
        if font_name not in doc.styles:
            char_style = doc.styles.add_style(font_name, WD_STYLE_TYPE.CHARACTER)
            char_style.font.name = font_name
            # The attribute key must be exactly 'w:eastAsia' (no space after "w:").
            east_asia_key = qn('w:eastAsia')
            doc.styles[font_name]._element.rPr.rFonts.set(east_asia_key, font_name)
add_font_style(doc_3, ['宋体', '楷体', '华文中宋'])
par_3 = doc_3.add_paragraph()
# Each run gets one of the character styles registered above, then its font
# name is set to Cambria (previously misspelled as 'Cambira').
r_3 = par_3.add_run('abhEFDSF,;宋体', style = '宋体')
font = r_3.font
font.name = 'Cambria'
par_3.add_run('abhEFDSF,;楷体\n', style='楷体').font.name = 'Cambria'
par_3.add_run('abhEFDSF,;华文中宋\n', style='华文中宋').font.name = 'Cambria'
doc_3.save('doc_3.docx')
help(par_3.add_run)
Help on method add_run in module docx.text.paragraph:

add_run(text=None, style=None) method of docx.text.paragraph.Paragraph instance
    Append a run to this paragraph containing *text* and having character
    style identified by style ID *style*. *text* can contain tab
    (``\t``) characters, which are converted to the appropriate XML form
    for a tab. *text* can also include newline (``\n``) or carriage
    return (``\r``) characters, each of which is converted to a line
    break.
# 封装字体设置
def set_font_style(doc, font, text):
    """Append a paragraph to ``doc`` whose single run shows ``text`` in style ``font``.

    When ``doc.styles`` does not yet contain ``font``, a character style with
    that name is created first, with its font name and ``w:eastAsia`` font
    attribute both set to ``font``. Returns None.
    """
    style_missing = font not in doc.styles
    if style_missing:
        created = doc.styles.add_style(font, WD_STYLE_TYPE.CHARACTER)
        created.font.name = font
        # The attribute key must be exactly 'w:eastAsia' (no space after "w:").
        doc.styles[font]._element.rPr.rFonts.set(qn('w:eastAsia'), font)
    doc.add_paragraph().add_run(text, style=font)
doc_4 = Document()
font_style_list = ['宋体', '楷体', '华文中宋']
for style in font_style_list:
    text = 'example:' + style + '\n'
    set_font_style(doc_4, style, text)
doc_4.save('doc_5.docx')

保存文件

# 普通保存
doc_1.save('doc_1.docx')

练习

根据excel表格

参会人名单.png

批量生成邀请函

邀请函样式.png

分析

1.确定word的格式:在word先写一下

标题:1级标题,楷体,居中。固定文字:邀请函

段落1: 楷体,左对齐。5个文字块:

文字块1:尊敬的

文字块2: 加粗,下划线。文字为变量,对应excel的a列

文字块3: 公司

文字块4: 加粗,下划线。文字为变量,对应excel的b列和c列的拼接。b+c

文字块5: ,您好:

段落2: 楷体,左缩进2个字符

文字块1:现诚挚邀请您于2021年10月27日参加

文字块2:黑体。DataWhale

文字块3: 主办的享受开源2050活动,地点在北京鸟巢,希望您届时莅临参加。

段落三:楷体,右对齐

文字块1: 邀请时间:

文字块2: 来自excel的d列,做格式化:“x年x月x日”

页眉页脚,不设置,采用默认缩进

2.对接excel

from openpyxl import load_workbook
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from docx.shared import Pt

从excel获取数据,格式化成dict

wb = load_workbook(filename='./图片/excel到word.xlsx')
wb.sheetnames
['邀请人员']
sheet = wb['邀请人员']
4
def format_date(d):
    """Return ``d`` formatted as ``YYYY年MM月DD日`` using its ``strftime`` method."""
    date_pattern = '%Y年%m月%d日'
    return d.strftime(date_pattern)
format_date(sheet['d2'].value)
'2021年02月15日'
def get_format_cell(row):
    """Build the invitation fields from the first four cells of ``row``.

    Each cell is read through its ``.value`` attribute. The returned dict has:
    ``company`` (cell 0), ``people`` (cell 1 concatenated with cell 2) and
    ``date`` (cell 3 formatted with ``strftime`` as ``YYYY年MM月DD日``).
    """
    first_cell, second_cell, third_cell, fourth_cell = row[0], row[1], row[2], row[3]
    return {
        'company': first_cell.value,
        'people': second_cell.value + third_cell.value,
        'date': fourth_cell.value.strftime('%Y年%m月%d日'),
    }


for row in sheet.rows:
    if (row[0].row > 1):
        print(get_format_cell(row))
        #print(f"{row[0].value}:{row[1].value + row[2].value}:{format_date(row[3].value)}")

{'company': '阿里', 'people': '数据工程师牛云', 'date': '2021年02月15日'}
{'company': '腾讯', 'people': '数据分析师牛化腾', 'date': '2021年02月16日'}
{'company': '百度', 'people': '数据架构师张艳红', 'date': '2021年02月17日'}
{'company': '京东', 'people': '算法工程师王强东', 'date': '2021年02月18日'}

word格式测试

def add_style(doc, font_name):
    """Ensure ``doc`` has a character style named ``font_name`` using that font.

    If ``doc.styles`` already contains ``font_name`` the document is left
    untouched, so the helper can be called repeatedly for the same font.
    Otherwise a character style is added, and both its font name and its
    ``w:eastAsia`` font attribute are set to ``font_name``. Returns None.
    """
    if font_name in doc.styles:
        # python-docx rejects adding a style whose name is already in use.
        return
    style = doc.styles.add_style(font_name, WD_STYLE_TYPE.CHARACTER)
    style.font.name = font_name
    doc.styles[font_name]._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
doc = Document()
add_style(doc, '楷体-简')
add_style(doc, '黑体')

doc.styles['Normal'].font.name = '楷体-简'
doc.styles['Normal'].font.size =  Pt(12)
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '楷体-简')
h1 = doc.add_paragraph('邀 请 函')
h1.alignment = WD_ALIGN_PARAGRAPH.CENTER
r1 = h1.runs[0]
r1.bold = True
r1.font.size = Pt(20)
p1 = doc.add_paragraph('')
#p1.style.font.name = '楷体'
p1.alignment = WD_ALIGN_PARAGRAPH.LEFT
p1.add_run('尊敬的')
r1 = p1.add_run('京东') # 
r1.bold = True
r1.underline = True
p1.add_run('公司')
r2 = p1.add_run('算法工程师王强')
r2.bold = True
r2.underline = True
p1.add_run(',您好:')
<docx.text.run.Run at 0x11d9c3820>
p2 = doc.add_paragraph('')
#p2.paragraph_format.first_line_indent = -8
p2.add_run('        现诚挚邀请您于2021年10月27日参加')
p2.add_run('DataWhale', style='黑体')
p2.add_run('主办的享受开源2050活动,地点在北京鸟巢,希望您届时莅临参加。')
<docx.text.run.Run at 0x11d9a16a0>
p3 = doc.add_paragraph('')
p3.alignment = WD_ALIGN_PARAGRAPH.RIGHT
p3.add_run('邀请时间:')
# Placeholder date in the "x年x月x日" shape (the second 年 was a typo for 月)
r1 = p3.add_run('2020年01月01日')
r1.bold = True
r1.underline = True
doc.save('ty.docx')

整合,从excel获取数据,输出到docx

def get_invate_file(dict):
    """Create one invitation document from a field mapping and save it.

    ``dict`` must provide the keys ``company``, ``people`` and ``date``; each
    value is inserted as a bold, underlined run. The document is saved as
    ``邀请函_<company>_<people>.docx`` (a relative path). Returns None.
    """
    fields = dict
    base_font = '楷体-简'

    def add_emphasised_run(paragraph, content):
        # Bold + underlined run, used for the variable parts of the letter.
        run = paragraph.add_run(content)
        run.bold = True
        run.underline = True
        return run

    doc = Document()
    add_style(doc, base_font)
    add_style(doc, '黑体')

    # Document-wide default font and size, including the w:eastAsia font attribute.
    normal_style = doc.styles['Normal']
    normal_style.font.name = base_font
    normal_style.font.size = Pt(12)
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), base_font)

    # Title line: centered, bold, 20 pt.
    title = doc.add_paragraph('邀 请 函')
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_run = title.runs[0]
    title_run.bold = True
    title_run.font.size = Pt(20)

    # Greeting line: fixed text with the company and the person emphasised.
    greeting = doc.add_paragraph('')
    greeting.alignment = WD_ALIGN_PARAGRAPH.LEFT
    greeting.add_run('尊敬的')
    add_emphasised_run(greeting, fields['company'])
    greeting.add_run('公司')
    add_emphasised_run(greeting, fields['people'])
    greeting.add_run(',您好:')

    # Body text; the leading spaces inside the first run serve as the indent.
    body = doc.add_paragraph('')
    body.add_run('        现诚挚邀请您于2021年10月27日参加')
    body.add_run('DataWhale', style='黑体')
    body.add_run('主办的享受开源2050活动,地点在北京鸟巢,希望您届时莅临参加。')

    # Right-aligned date line.
    date_line = doc.add_paragraph('')
    date_line.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    date_line.add_run('邀请时间:')
    add_emphasised_run(date_line, fields['date'])

    file_name = '_'.join(('邀请函', fields['company'], fields['people'])) + '.docx'
    doc.save(file_name)
# Generate one invitation file per worksheet row, starting from row 2
# (row 1 is skipped — presumably the header row; confirm against the workbook).
for row in sheet.iter_rows(min_row=2):
    get_invate_file(get_format_cell(row))