.NET PDF 转文字(Tesseract OCR + PdfiumViewer)

115 阅读1分钟

环境搭建

参考:

Tesseract OCR : .NET Tesseract OCR - 掘金 (juejin.cn)

PdfiumViewer : .NET 以PdfiumViewer方式PDF转为图片 - 掘金 (juejin.cn)

代码整合

创建OCRHelper.cs

using System;
using System.IO;
using System.Text.RegularExpressions;
using Tesseract;

namespace OCR_7
{
    /// <summary>
    /// Converts a PDF document to plain text: every page is rendered to a PNG with
    /// PdfiumViewer and the rendered images are then read with Tesseract OCR.
    /// </summary>
    public static class OCRHelper
    {
        // Scratch directory that receives the rendered page images. It is created at the
        // start of every scan and removed again when the scan finishes.
        private static readonly string imagePath = $"{Environment.CurrentDirectory}/ocr_file/image_{GuidTo16String()}";

        // Data path handed to the TesseractEngine constructor.
        // NOTE(review): machine-specific absolute path - adjust for the target environment.
        private static readonly string tesseractPath = @"Z:\.net_project\OCR_7\OCR_7\file\tesseract";

        // Matches any run of whitespace; built once instead of on every call.
        private static readonly Regex whitespaceRun = new Regex(@"\s+");

        /// <summary>
        /// Runs OCR over every page of the PDF at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The recognized text with every whitespace run collapsed to one space.</returns>
        public static string Scan(string path)
        {
            CreateDir();
            try
            {
                PDFToPng(path);
                return PngToText();
            }
            finally
            {
                // Remove the scratch images even when rendering or OCR failed.
                DeleteImageDir();
            }
        }

        /// <summary>
        /// Builds a 16-character identifier from a freshly generated GUID.
        /// </summary>
        /// <returns>The first 16 lowercase hexadecimal digits of a new GUID.</returns>
        public static string GuidTo16String()
        {
            // The "N" format yields 32 hex digits without separators, so the result
            // always has exactly 16 characters.
            return Guid.NewGuid().ToString("N").Substring(0, 16);
        }

        /// <summary>
        /// Writes <paramref name="result"/> to ocr_file/text/result.txt below the current directory.
        /// Failures are reported on the console and not rethrown.
        /// </summary>
        /// <param name="result">Text to persist.</param>
        private static void SaveText(string result)
        {
            string textDir = Environment.CurrentDirectory + "/ocr_file" + "/text";

            try
            {
                Directory.CreateDirectory(textDir);
                // The using block closes the writer even when the write itself throws.
                using (StreamWriter sw = new StreamWriter(textDir + "/result.txt"))
                {
                    sw.WriteLine(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception: " + e.Message);
            }
        }

        /// <summary>
        /// Runs OCR over every image in the scratch directory, in page order.
        /// </summary>
        /// <returns>The concatenated, whitespace-normalized text of all pages.</returns>
        private static string PngToText()
        {
            FileInfo[] files = new DirectoryInfo(imagePath).GetFiles();

            // GetFiles() does not promise any order, and a name sort would place
            // "10.png" before "2.png"; order the images by their numeric page number.
            Array.Sort(files, (a, b) => PageNumberOf(a).CompareTo(PageNumberOf(b)));

            var text = new System.Text.StringBuilder();
            // One engine serves all pages instead of being rebuilt for each image.
            using (TesseractEngine engine = new TesseractEngine(tesseractPath, "eng"))
            {
                foreach (FileInfo file in files)
                {
                    text.Append(ScanPng(engine, file.FullName));
                }
            }

            return NormalizeWhitespace(text.ToString());
        }

        /// <summary>
        /// Reads the page number encoded in an image file name such as "12.png".
        /// </summary>
        /// <param name="file">Rendered page image.</param>
        /// <returns>The parsed number, or <see cref="int.MaxValue"/> when the name is not numeric.</returns>
        private static int PageNumberOf(FileInfo file)
        {
            int page;
            return int.TryParse(Path.GetFileNameWithoutExtension(file.Name), out page) ? page : int.MaxValue;
        }

        /// <summary>
        /// Collapses every whitespace run in <paramref name="result"/> to a single space and trims the ends.
        /// </summary>
        /// <param name="result">Raw OCR output.</param>
        /// <returns>The normalized text.</returns>
        private static string NormalizeWhitespace(string result)
        {
            return whitespaceRun.Replace(result, " ").Trim();
        }

        /// <summary>
        /// Recognizes the text of a single image file.
        /// </summary>
        /// <param name="engine">Engine used for recognition.</param>
        /// <param name="path">Path of the image to read.</param>
        /// <returns>The text reported by the engine for this image.</returns>
        private static string ScanPng(TesseractEngine engine, string path)
        {
            // Both the image and the page are released before the next image is processed.
            using (Pix pix = Pix.LoadFromFile(path))
            using (Page page = engine.Process(pix))
            {
                return page.GetText();
            }
        }

        /// <summary>
        /// Renders every page of the PDF into the scratch directory as "&lt;page&gt;.png".
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        private static void PDFToPng(string path)
        {
            // int.MaxValue is clamped to the real page count by PdfToImage, so no page is dropped.
            PdfToImage(
              path,
              imagePath,
              "",
              "png",
              System.Drawing.Imaging.ImageFormat.Png,
              1,
              int.MaxValue
              );
        }

        /// <summary>
        /// Creates the scratch image directory.
        /// </summary>
        private static void CreateDir()
        {
            CreateDirByPath(imagePath);
        }

        /// <summary>
        /// Creates <paramref name="path"/> when it does not exist yet and reports the outcome
        /// on the console. Failures are logged and not rethrown.
        /// </summary>
        /// <param name="path">Directory to create.</param>
        private static void CreateDirByPath(string path)
        {
            try
            {
                if (Directory.Exists(path))
                {
                    Console.WriteLine("That path exists already.");
                    return;
                }

                Directory.CreateDirectory(path);
                Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
            }
            catch (Exception e)
            {
                Console.WriteLine("The process failed: {0}", e.ToString());
            }
        }

        /// <summary>
        /// Deletes the scratch image directory with its content. Failures are logged and not rethrown.
        /// </summary>
        private static void DeleteImageDir()
        {
            try
            {
                if (Directory.Exists(imagePath))
                {
                    Directory.Delete(imagePath, true);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("The process failed: {0}", e.Message);
            }
        }

        /// <summary>
        /// Renders a range of PDF pages to image files named
        /// "&lt;imageName&gt;&lt;page number&gt;&lt;extension&gt;".
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file.</param>
        /// <param name="imagePath">Output directory; created when missing.</param>
        /// <param name="imageName">Prefix of every output file name.</param>
        /// <param name="imagePathFormat">Output file extension, with or without the leading dot.</param>
        /// <param name="imageFormat">Image format used when saving.</param>
        /// <param name="startPageNum">First page to render (1-based).</param>
        /// <param name="endPageNum">Last page to render (1-based, inclusive).</param>
        /// <remarks>
        /// A reversed range is swapped, then both ends are clamped to the document.
        /// Nothing is rendered when the clamped range is empty.
        /// </remarks>
        public static void PdfToImage(
            string pdfPath,
            string imagePath,
            string imageName,
            string imagePathFormat,
            System.Drawing.Imaging.ImageFormat imageFormat,
            int startPageNum,
            int endPageNum
            )
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(imagePath);
            if (!imagePathFormat.StartsWith(".", StringComparison.Ordinal))
            {
                imagePathFormat = "." + imagePathFormat;
            }

            using (var pdf = PdfiumViewer.PdfDocument.Load(pdfPath))
            {
                int pageCount = pdf.PageCount;
                var pageSizes = pdf.PageSizes;

                // Normalize a reversed range first, then clamp, so the loop below can
                // never index outside pageSizes.
                if (startPageNum > endPageNum)
                {
                    int tempPageNum = startPageNum;
                    startPageNum = endPageNum;
                    endPageNum = tempPageNum;
                }
                if (startPageNum < 1) { startPageNum = 1; }
                if (endPageNum > pageCount) { endPageNum = pageCount; }

                for (int i = startPageNum; i <= endPageNum; i++)
                {
                    // pageSizes is 0-based while page numbers start at 1.
                    int width = (int)pageSizes[i - 1].Width;
                    int height = (int)pageSizes[i - 1].Height;
                    string target = Path.Combine(imagePath, $"{imageName}{i}{imagePathFormat}");

                    // Render before opening the file so a failed render leaves no empty file behind.
                    using (var image = pdf.Render(i - 1, width, height, 350, 350, PdfiumViewer.PdfRenderFlags.Annotations))
                    using (var stream = new FileStream(target, FileMode.Create))
                    {
                        image.Save(stream, imageFormat);
                    }
                }
            }
        }

    }
}

注意:

image.png

测试

OCRHelper.Scan(path): 根据 PDF 文件的路径进行 OCR 识别并返回文字内容(由调用方负责打印)

using System;

namespace OCR_7
{
    /// <summary>
    /// Console entry point that runs the OCR helper against a sample PDF.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Scans the sample PDF and writes the recognized text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string pdfPath = "Z:\\.net_project\\OCR_7\\OCR_7\\file\\pdf\\01.pdf";
            string recognizedText = OCRHelper.Scan(pdfPath);

            Console.WriteLine(recognizedText);
        }
    }
}

image.png

image.png