Python读取Word文档识别字段颜色,解析字段

498 阅读2分钟

Python版本3.7.3,读取的文档格式为.docx

文中带有简单注释

import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜色类 

# Module-level state shared by readDocx() and writeDocx() via `global`.

# Length of the longest question line seen so far (updated in readDocx).
maxLength = 0
# Next value to store as a question's "_id"; writeDocx() resets it to 1.
# NOTE(review): this name shadows the builtin id().
id = 1
# Question dicts accumulated by readDocx() and dumped to JSON by writeDocx().
convert_list = []
# Reset by writeDocx(); otherwise only referenced from commented-out code.
type_list = []
# Directory containing this script; input and output paths are relative to it.
curPath = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): an encoding declaration is only honoured on the first or
# second line of a file, so the line below has no effect at this position.
# coding=utf-8
# Read one .docx file and collect its questions into convert_list.

# Separators tried, in order, to find where the question text starts.
_QUESTION_SEPARATORS = ("-", ".", " ")
# Text markers that are cut (together with everything after them) from options.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
# Leading label removed from a question line when it starts at offset 0 or 1.
_NUMBER_PREFIX = "Número"
# str() of the run colour that flags the correct option.
_CORRECT_ANSWER_RGB = "00FF00"
# One question paragraph followed by four option paragraphs.
_PARAGRAPHS_PER_QUESTION = 5


def _strip_number_prefix(text):
    """Drop a leading "Número" label (plus the one character after it)."""
    pos = text.find(_NUMBER_PREFIX)
    if pos != -1 and pos < 2:
        return text[pos + len(_NUMBER_PREFIX) + 1:]
    return text


def _question_body(text):
    """Return the part of *text* after the first separator that occurs in it.

    Separators are tried in the order "-", ".", " ". When none is present
    the whole text is returned instead of raising ValueError.
    """
    for separator in _QUESTION_SEPARATORS:
        pos = text.find(separator)
        if pos != -1:
            return text[pos + len(separator):]
    return text


def _strip_answer_markers(text):
    """Cut *text* at each known answer marker, applied one after another."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName,type):
    """Parse one question document and append its questions to convert_list.

    The first paragraph and all blank paragraphs are skipped. The remaining
    paragraphs are consumed in groups of five: the first is the question,
    the next four are the options. An option containing a run whose font
    colour is 00FF00 is recorded as the correct one.

    Each completed group is appended to the global ``convert_list`` as a
    dict with the keys ``question``, ``subType``, ``option1``..``option4``,
    ``_id`` and, when a green run was found, ``rightIndex``. A trailing
    group with fewer than five paragraphs is discarded.

    Args:
        fileName: Path of the document relative to the script directory,
            without the ``.docx`` extension. Backslashes are accepted as
            directory separators on every platform.
        type: Category name stored verbatim as ``subType``.
    """
    global id
    global maxLength

    # Normalise the Windows-style separators used by the callers so that the
    # same relative paths also resolve on other platforms.
    docPath = os.path.join(curPath, fileName.replace("\\", os.sep) + ".docx")
    print("xlsFile: "+docPath)
    document = docx.Document(docPath)

    index = 0  # position inside the current group: 0 = question, 1-4 = options
    data = {}
    for lineNo, p in enumerate(document.paragraphs, start=1):
        if lineNo <= 1:  # skip the first paragraph
            continue
        # Work on a copy: assigning to p.text would rewrite the paragraph's runs.
        text = p.text
        if not text.strip():
            continue

        if index == 0:  # question line
            text = _strip_number_prefix(text)
            if len(text) > maxLength:
                maxLength = len(text)
            # Anything after a "|" is not part of the question.
            data["question"] = _question_body(text).split("|")[0]
            data["subType"] = type
        else:  # option line; a green run marks the correct answer
            for run in p.runs:
                # rgb is None when the run has no explicit colour; str() keeps
                # the comparison safe in that case.
                if str(run.font.color.rgb) == _CORRECT_ANSWER_RGB:
                    data["rightIndex"] = index
            data["option"+str(index)] = _strip_answer_markers(text)

        index += 1
        if index >= _PARAGRAPHS_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}

def writeDocx(fileList,name):
    """Convert a set of documents into one JSON question bank.

    Resets the shared counters, runs readDocx() on every entry of
    *fileList* (dicts with "path" and "type" keys) and writes the collected
    questions to ``<script dir>/topic/<name>.txt`` as UTF-8 JSON.
    """
    global id, convert_list, type_list

    # Every output file starts numbering from 1 with an empty question list.
    id, convert_list, type_list = 1, [], []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output path; create the target directory on first use.
    outPath = os.path.join(curPath, "topic", name + ".txt")
    outDir = os.path.dirname(outPath)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    with io.open(outPath, 'w', encoding='utf-8') as out:
        payload = json.dumps(
            convert_list,
            ensure_ascii=False,
            indent=4,
            sort_keys=True,
        )
        out.write(payload)

def main():
    """Build the Portuguese, Spanish and English question banks, in that order.

    Each job pairs an output name with the documents that feed it; every
    document is given as (path relative to the script directory without the
    .docx extension, category stored as "subType").
    """
    jobs = (
        ("pt_br_topic", (
            ("pt_br_topic\\地理(葡)Respueda G .es.pt", "World"),
            ("pt_br_topic\\科学与技术(葡)", "Technology"),
            ("pt_br_topic\\历史(葡)Resupeda H.es.pt", "History"),
            ("pt_br_topic\\艺术和文学(葡)Respueda A&L.es.pt", "ArtAndLiterature"),
            ("pt_br_topic\\娱乐(葡)Respueda E.es.pt", "Fashion"),
            ("pt_br_topic\\运动(葡)Respueda  D.es.pt", "Sports"),
        )),
        ("es_es_topic", (
            ("es_es_topic\\地理(西)Respueda G ", "World"),
            ("es_es_topic\\科学与技术(西)Respueda C&T", "Technology"),
            ("es_es_topic\\历史(西)Resupeda H", "History"),
            ("es_es_topic\\艺术和文学(西)Respueda A&L", "ArtAndLiterature"),
            ("es_es_topic\\娱乐(西)Respueda E", "Fashion"),
            ("es_es_topic\\运动(西)Respueda  D", "Sports"),
        )),
        ("en_us_topic", (
            ("en_us_topic\\地理(英)Respueda G .es.en", "World"),
            ("en_us_topic\\科学与技术(英)", "Technology"),
            ("en_us_topic\\历史(英)Resupeda H.es.en", "History"),
            ("en_us_topic\\艺术和文学(英)Respueda A&L.es.en", "ArtAndLiterature"),
            ("en_us_topic\\娱乐(英)Respueda E.es.en", "Fashion"),
            ("en_us_topic\\运动(英)Respueda  D.es.en", "Sports"),
        )),
    )

    for outputName, documents in jobs:
        fileList = [
            {"path": docPath, "type": category}
            for docPath, category in documents
        ]
        writeDocx(fileList, outputName)
    
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger any file I/O.
if __name__ == "__main__":
    main()

更多技巧:313074041