环境搭建
参考:
Tesseract OCR : .NET Tesseract OCR - 掘金 (juejin.cn)
PdfiumViewer : .NET 以PdfiumViewer方式PDF转为图片 - 掘金 (juejin.cn)
代码整合
创建OCRHelper.cs
using System;
using System.IO;
using System.Text.RegularExpressions;
using Tesseract;
namespace OCR_7
{
public static class OCRHelper
{
// Scratch directory that receives the rendered page images. The random suffix is
// computed once, when this static field is initialised.
private static readonly string imagePath = $"{Environment.CurrentDirectory}/ocr_file/image_{GuidTo16String()}";
// Data path handed to the TesseractEngine constructor.
// NOTE(review): hard-coded to one developer machine — should come from configuration.
private static readonly string tesseractPath = @"Z:\.net_project\OCR_7\OCR_7\file\tesseract";
/// <summary>
/// Runs OCR over a PDF: renders its pages to PNG files in the scratch directory,
/// recognises every image and returns the combined text.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The recognised text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="path"/> is empty or whitespace.</exception>
public static string Scan(string path)
{
    // Reject unusable input before anything is created on disk.
    if (path == null)
    {
        throw new ArgumentNullException(nameof(path));
    }
    if (path.Trim().Length == 0)
    {
        throw new ArgumentException("The PDF path must not be empty.", nameof(path));
    }

    CreateDir();
    try
    {
        PDFToPng(path);
        return PngToText();
    }
    finally
    {
        // PngToText removes the scratch directory on its normal path; this covers the
        // case where rendering or recognition throws before that point.
        try
        {
            if (Directory.Exists(imagePath))
            {
                Directory.Delete(imagePath, true);
            }
        }
        catch (Exception e)
        {
            // Best effort: a failed cleanup must not mask the real result or exception.
            Console.WriteLine("The process failed: {0}", e.Message);
        }
    }
}
/// <summary>
/// Builds a 16-character lowercase hexadecimal token from a freshly generated GUID.
/// </summary>
/// <remarks>
/// The token is the product of (byte + 1) over the GUID's bytes, minus the current
/// tick count. The product is allowed to wrap around, so the value is a convenient
/// random-looking suffix rather than a guaranteed-unique identifier.
/// </remarks>
/// <returns>A string of exactly 16 hexadecimal digits.</returns>
public static string GuidTo16String()
{
    long product = 1;
    foreach (byte b in Guid.NewGuid().ToByteArray())
    {
        // Wrap-around is intended; 'unchecked' keeps it working even when the project
        // is compiled with arithmetic overflow checking enabled.
        product = unchecked(product * (b + 1));
    }

    long token = unchecked(product - DateTime.Now.Ticks);

    // "x16" zero-pads to 16 digits; plain "x" drops leading zeros and could return
    // fewer than 16 characters for small positive values.
    return token.ToString("x16");
}
/// <summary>
/// Writes <paramref name="result"/>, followed by a line terminator, to
/// <c>ocr_file/text/result.txt</c> under the current directory.
/// </summary>
/// <remarks>Failures are reported on the console instead of being thrown.</remarks>
/// <param name="result">Text to store.</param>
private static void SaveText(string result)
{
    string textDir = Environment.CurrentDirectory + "/ocr_file" + "/text";
    try
    {
        // Nothing else creates this folder, so make sure it exists before opening the file.
        Directory.CreateDirectory(textDir);

        // 'using' closes the writer even when WriteLine throws.
        using (StreamWriter writer = new StreamWriter(textDir + "/result.txt"))
        {
            writer.WriteLine(result);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Exception: " + e.Message);
    }
    finally
    {
        Console.WriteLine("Executing finally block.");
    }
}
/// <summary>
/// Recognises every image in the scratch directory in page order, removes the
/// directory, and returns the combined text after whitespace normalisation.
/// </summary>
/// <returns>The recognised text of all pages.</returns>
private static string PngToText()
{
    try
    {
        FileInfo[] pages = new DirectoryInfo(imagePath).GetFiles();

        // GetFiles does not promise any ordering, and a plain name sort would put
        // "10.png" before "2.png"; sort by the numeric page number instead.
        Array.Sort(pages, new Comparison<FileInfo>(ComparePageFiles));

        string[] pageTexts = new string[pages.Length];
        for (int i = 0; i < pages.Length; i++)
        {
            pageTexts[i] = ScanPng(pages[i].FullName);
        }

        // string.Concat treats null entries as empty, matching the former "+=".
        return handleResult(string.Concat(pageTexts));
    }
    finally
    {
        // Remove the scratch directory even when recognition fails part-way through.
        try
        {
            Directory.Delete(imagePath, true);
        }
        catch (Exception e)
        {
            Console.WriteLine("The process failed: {0}", e.Message);
        }
    }
}

// Orders page images by page number, falling back to the file name for ties.
private static int ComparePageFiles(FileInfo left, FileInfo right)
{
    int byNumber = PageNumberOf(left).CompareTo(PageNumberOf(right));
    return byNumber != 0 ? byNumber : string.CompareOrdinal(left.Name, right.Name);
}

// Reads the page number from a file name such as "12.png"; files whose name is not a number sort last.
private static int PageNumberOf(FileInfo file)
{
    int number;
    return int.TryParse(Path.GetFileNameWithoutExtension(file.Name), out number)
        ? number
        : int.MaxValue;
}
// Collapses every run of whitespace in the OCR output into a single space and
// trims the ends.
private static string handleResult(string result)
{
    string collapsed = Regex.Replace(result, @"\s{1,}", " ", RegexOptions.IgnoreCase);
    return collapsed.Trim();
}
/// <summary>
/// Recognises the text of a single image file using the "eng" language data
/// found under <c>tesseractPath</c>.
/// </summary>
/// <param name="path">Path of the image to recognise.</param>
/// <returns>The text reported by Tesseract for the image.</returns>
private static string ScanPng(string path)
{
    // Engine, image and page all wrap native resources; dispose them deterministically
    // instead of leaving them to the finalizer on every page.
    using (TesseractEngine engine = new TesseractEngine(tesseractPath, "eng"))
    using (Pix pix = Pix.LoadFromFile(path))
    using (Page page = engine.Process(pix))
    {
        // The text must be read while the page is still alive.
        return page.GetText();
    }
}
// Renders pages 1 through 99 of the PDF as PNG files named "<page>.png" in the
// scratch directory. Pages after the 99th are not rendered.
private static void PDFToPng(string path)
{
    const int firstPage = 1;
    const int lastPage = 99;

    PdfToImage(
        pdfPath: path,
        imagePath: imagePath,
        imageName: "",
        imagePathFormat: "png",
        imageFormat: System.Drawing.Imaging.ImageFormat.Png,
        startPageNum: firstPage,
        endPageNum: lastPage);
}
// Makes sure the scratch image directory exists.
private static void CreateDir() => CreateDirByPath(imagePath);
// Creates the directory at 'path' when it does not exist yet and reports the outcome
// on the console. Any failure is logged to the console and not rethrown.
private static void CreateDirByPath(string path)
{
    try
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
            Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
        }
        else
        {
            Console.WriteLine("That path exists already.");
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("The process failed: {0}", e.ToString());
    }
}
/// <summary>
/// Renders a range of pages of a PDF to image files, one file per page.
/// </summary>
/// <remarks>
/// Each file is named <c>{imageName}{page}{extension}</c>, where page is the 1-based
/// page number. The output folder is no longer opened in the shell after every page.
/// </remarks>
/// <param name="pdfPath">Path of the PDF to read.</param>
/// <param name="imagePath">Directory that receives the images; created when missing.</param>
/// <param name="imageName">Prefix of each output file name.</param>
/// <param name="imagePathFormat">Output file extension, with or without the leading dot.</param>
/// <param name="imageFormat">Image format used when saving each page.</param>
/// <param name="startPageNum">First page to render (1-based); values below 1 are treated as 1.</param>
/// <param name="endPageNum">Last page to render (1-based, inclusive); values past the last page are treated as the last page.</param>
public static void PdfToImage(
    string pdfPath,
    string imagePath,
    string imageName,
    string imagePathFormat,
    System.Drawing.Imaging.ImageFormat imageFormat,
    int startPageNum,
    int endPageNum
)
{
    // CreateDirectory is a no-op when the directory already exists.
    System.IO.Directory.CreateDirectory(imagePath);

    if (!imagePathFormat.StartsWith(".", StringComparison.Ordinal))
    {
        imagePathFormat = "." + imagePathFormat;
    }

    using (var pdf = PdfiumViewer.PdfDocument.Load(pdfPath))
    {
        var pdfSize = pdf.PageSizes;

        // Put the bounds in order first (the earlier swap overwrote the end with the
        // start instead of the saved value), then clamp both to the document.
        if (startPageNum > endPageNum)
        {
            int tempPageNum = startPageNum;
            startPageNum = endPageNum;
            endPageNum = tempPageNum;
        }
        startPageNum = Math.Max(startPageNum, 1);
        endPageNum = Math.Min(endPageNum, pdf.PageCount);

        for (int i = startPageNum; i <= endPageNum; i++)
        {
            // PageSizes is 0-based while page numbers are 1-based, hence i - 1.
            int width = (int)pdfSize[i - 1].Width;
            int height = (int)pdfSize[i - 1].Height;
            string target = System.IO.Path.Combine(imagePath, $"{imageName}{i}{imagePathFormat}");

            // NOTE(review): width/height are passed through unscaled next to a DPI of
            // 350 — confirm the rendered resolution is sufficient for OCR.
            using (var image = pdf.Render(i - 1, width, height, 350, 350, PdfiumViewer.PdfRenderFlags.Annotations))
            using (var stream = new System.IO.FileStream(target, System.IO.FileMode.Create))
            {
                image.Save(stream, imageFormat);
            }
        }
    }
}
}
}
注意: 运行前需将 OCRHelper 中硬编码的 tesseractPath 以及 Main 中的 PDF 路径改为本机的实际路径。
测试
OCRHelper.Scan(path): 根据 PDF 文件的路径识别其中的文字并返回结果字符串,由调用方打印
using System;
namespace OCR_7
{
    /// <summary>
    /// Console entry point that runs the OCR helper against a sample PDF and prints the text.
    /// </summary>
    public class Program
    {
        static void Main(string[] args)
        {
            const string samplePdf = @"Z:\.net_project\OCR_7\OCR_7\file\pdf\01.pdf";
            Console.WriteLine(OCRHelper.Scan(samplePdf));
        }
    }
}