Rust 读取 Word 内容

1,545 阅读1分钟

Rust库地址

Cargo.toml

docx-rs = "0.4.6"

解析代码: 层级很深,解析起来确实麻烦

/// Loads the relative path `hello.docx`, parses it with `read_docx`, and prints
/// every top-level element of the document body.
///
/// Paragraphs and tables are forwarded to `print_paragraph` and `print_table`;
/// all other element kinds are dumped with their `Debug` representation.
///
/// # Panics
///
/// Panics if the file cannot be opened or read, or if `read_docx` fails.
fn main() -> Result<(), DocxError> {
    // Pull the whole file into memory; `read_docx` works on a byte slice.
    let mut docx_file = File::open("hello.docx").unwrap();
    let mut bytes = vec![];
    docx_file.read_to_end(&mut bytes).unwrap();

    let docx = read_docx(&bytes).unwrap();

    for child in docx.document.children {
        match child {
            DocumentChild::Paragraph(paragraph) => print_paragraph(paragraph),
            DocumentChild::Table(table) => print_table(table),
            DocumentChild::BookmarkStart(bookmark_start) => {
                println!("s3 => {:?}\n", bookmark_start)
            }
            DocumentChild::BookmarkEnd(bookmark_end) => {
                println!("s4 => {:?}\n", bookmark_end)
            }
            DocumentChild::CommentStart(comment_start) => {
                println!("s5 => {:?}\n", comment_start)
            }
            DocumentChild::CommentEnd(comment_end) => {
                println!("s6 => {:?}\n", comment_end)
            }
            DocumentChild::StructuredDataTag(sdt) => {
                println!("s7 => {:?}\n", sdt)
            }
            DocumentChild::TableOfContents(toc) => {
                println!("s8 => {:?}\n", toc)
            }
        }
    }

    Ok(())
}

/// Prints the text of every paragraph inside a table, walking rows, then cells,
/// then each cell's contents in order.
///
/// A table nested inside a cell is handled by calling this function again, so
/// tables are traversed to any nesting depth without duplicating the walk.
fn print_table(s: Box<Table>) {
    for row in s.rows {
        // `TableChild` has a single variant here, so the pattern is irrefutable.
        let TableChild::TableRow(row) = row;
        for cell in row.cells {
            // Likewise, `TableRowChild` only carries `TableCell`.
            let TableRowChild::TableCell(cell) = cell;
            for content in cell.children {
                match content {
                    TableCellContent::Paragraph(paragraph) => {
                        print_paragraph(Box::new(paragraph));
                    }
                    // Recurse instead of repeating the row/cell traversal inline.
                    TableCellContent::Table(nested) => {
                        print_table(Box::new(nested));
                    }
                }
            }
        }
    }
}

/// Prints each text element found in the paragraph's runs on its own line,
/// formatted as `text: <content>,`.
///
/// Paragraph children that are not runs, and run children that are not text,
/// are skipped.
fn print_paragraph(s: Box<Paragraph>) {
    for child in s.children {
        if let ParagraphChild::Run(run) = child {
            for piece in run.children {
                if let RunChild::Text(text) = piece {
                    println!("text: {},", text.text);
                }
            }
        }
    }
}

这是Word文件截图

image.png

运行程序后的结果如下。注意:这个库是按 Run(文本片段)来解析段落的,所以同一段落的文字可能会被拆成多个片段分别输出

text: 1,
text: . ,
text: Hello你,
text: 你好,这是一份Word文件,
text: 2,
text: . Rust,
text: 语言,
text: R,
text: u,
text: st是一门高性能,内存安全的编程语言。,
text: 书籍,
text: 价格,
text: Java入门,
text: 1,
text: 0,
text: Java,
text: script ,
text: 入门,
text: 2,
text: 0,