Specifies in which cases OCR will be performed while loading a PDF document.
Namespace: SautinSoft.Document
Assembly: SautinSoft.Document (in SautinSoft.Document.dll) Version: 2024.11.20
Syntax Public Enumeration OCRMode
Members
| Member name | Value | Description |
|---|---|---|
| Disabled | 0 | OCR is disabled. |
| Auto | 1 | OCR will be performed if the image is large enough and over just one image per page. |
| Enabled | 2 | OCR will be performed over all images per page. |
Example See Developer Guide: Recognize an image using Tesseract (free OCR library)
Recognize an image using Tesseract (free OCR library) in C#
using System.IO;
using SautinSoft.Document;
using System;
using SkiaSharp;
namespace Example
{
    /// <summary>
    /// Console sample: loads an image with OCR enabled (delegating recognition to Tesseract),
    /// normalizes the resulting text formatting and saves the document as DOCX.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            RecognizeImage();
        }

        /// <summary>
        /// Loads <c>image.png</c> with <see cref="OCRMode.Enabled"/>, using
        /// <see cref="PerformOCRTesseract"/> as the OCR callback, then rewrites the
        /// character formatting and page setup and saves the result to <c>Result1.docx</c>.
        /// The saved file is finally opened with the shell's default application.
        /// </summary>
        static void RecognizeImage()
        {
            // Paths are relative to the working directory (the build output folder when run from the IDE).
            string inpFile = @"..\..\..\image.png";
            string outFile = "Result1.docx";

            ImageLoadOptions lo = new ImageLoadOptions();
            lo.OCROptions.OCRMode = OCRMode.Enabled;
            lo.OCROptions.Method = PerformOCRTesseract;

            DocumentCore dc = DocumentCore.Load(inpFile, lo);

            // Give every recognized run the same plain formatting.
            foreach (Run r in dc.GetChildElements(true, ElementType.Run))
            {
                r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black;
                r.CharacterFormat.Scaling = 100;
                r.CharacterFormat.Spacing = 0;
                r.CharacterFormat.Size = 12;
            }

            // Landscape Letter page with 5 mm margins (converted to points) on every side.
            Section section = dc.Sections[0];
            section.PageSetup.PaperType = PaperType.Letter;
            section.PageSetup.Orientation = Orientation.Landscape;
            double m = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point);
            section.PageSetup.PageMargins = new PageMargins() { Top = m, Left = m, Right = m, Bottom = m };

            dc.Save(outFile);

            // Open the result in the application associated with .docx files.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// OCR callback: runs Tesseract over <paramref name="image"/>, lets Tesseract render the
        /// result to a temporary PDF inside the <c>tessdata</c> folder, reloads that PDF with
        /// SautinSoft.Document and returns it re-saved as PDF bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes (any format SkiaSharp can decode).</param>
        /// <returns>The bytes of a PDF document containing the recognized text.</returns>
        /// <exception cref="Exception">
        /// Any failure is reported to the console and rethrown as an <see cref="Exception"/> whose
        /// message starts with "Error Tesseract: "; the original failure is kept as
        /// <see cref="Exception.InnerException"/>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            // Specify that Tesseract uses three languages: Russian, English and Vietnamese.
            string tesseractLanguages = "rus+eng+vie";

            // Folder with the *.traineddata language files; it is also used for the temporary PDF.
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());

            // The renderer is given the name without extension; the file read back below is "<name>.pdf".
            string tempPdf = tempFile + @".pdf";

            try
            {
                // The renderer is fully disposed inside the helper before the PDF is read back.
                RenderSearchablePdf(image, tempFile, tesseractData, tesseractLanguages);

                PdfLoadOptions pl = new PdfLoadOptions();
                // NOTE(review): presumably needed because Tesseract writes the recognized text as an
                // invisible layer - confirm against the Tesseract PDF renderer documentation.
                pl.ShowInvisibleText = true;
                pl.PreserveEmbeddedFonts = PropertyState.Disabled;
                pl.ConversionMode = PdfConversionMode.Continuous;

                // Keep the source stream open until the document has been saved, then release it so
                // the temporary file can be deleted in the finally block.
                using (FileStream pdfStream = File.OpenRead(tempPdf))
                using (MemoryStream output = new MemoryStream())
                {
                    DocumentCore dc = DocumentCore.Load(pdfStream, pl);
                    PdfSaveOptions ps = new PdfSaveOptions();
                    dc.Save(output, ps);
                    return output.ToArray();
                }
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");

                // Console.ReadKey throws when input is redirected, which would hide the real error.
                if (!Console.IsInputRedirected)
                {
                    Console.ReadKey();
                }

                // Keep the original exception as InnerException so its stack trace is not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Do not leave the temporary PDF behind in the tessdata folder.
                if (File.Exists(tempPdf))
                {
                    File.Delete(tempPdf);
                }
            }
        }

        /// <summary>
        /// Recognizes a single image with Tesseract and writes the result to
        /// <paramref name="outputBaseName"/> + ".pdf".
        /// </summary>
        private static void RenderSearchablePdf(byte[] image, string outputBaseName, string tesseractData, string languages)
        {
            using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(outputBaseName, tesseractData, true))
            using (renderer.BeginDocument("Serachablepdf"))
            using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, languages, Tesseract.EngineMode.Default))
            {
                engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                using (MemoryStream imageStream = new MemoryStream(image))
                using (SKBitmap bitmap = SKBitmap.Decode(imageStream))
                {
                    if (bitmap == null)
                    {
                        throw new InvalidOperationException("The image passed to OCR could not be decoded.");
                    }

                    // Re-encode as PNG before handing the bytes to Tesseract.
                    using (MemoryStream pngStream = new MemoryStream())
                    {
                        bitmap.Encode(pngStream, SKEncodedImageFormat.Png, 100);
                        byte[] pngBytes = pngStream.ToArray();

                        using (Tesseract.Pix pix = Tesseract.Pix.LoadFromMemory(pngBytes))
                        using (var page = engine.Process(pix, "Serachablepdf"))
                        {
                            renderer.AddPage(page);
                        }
                    }
                }
            }
        }
    }
}
Recognize an image using Tesseract (free OCR library) in VB.Net
Imports System
Imports System.IO
Imports SautinSoft.Document
Imports SkiaSharp
''' <summary>
''' Console sample: loads an image with OCR enabled (delegating recognition to Tesseract),
''' normalizes the resulting text formatting and saves the document as DOCX.
''' </summary>
Module Sample
    Sub Main()
        RecognizeImage()
    End Sub

    ''' <summary>
    ''' Loads image.png with OCRMode.Enabled, using PerformOCRTesseract as the OCR callback,
    ''' then rewrites the character formatting and page setup and saves the result to Result.docx.
    ''' The saved file is finally opened with the shell's default application.
    ''' </summary>
    Sub RecognizeImage()
        ' Paths are relative to the working directory (the build output folder when run from the IDE).
        Dim inpFile As String = "..\..\..\image.png"
        Dim outFile As String = "Result.docx"

        Dim lo As New ImageLoadOptions()
        lo.OCROptions.OCRMode = OCRMode.Enabled
        lo.OCROptions.Method = AddressOf PerformOCRTesseract

        Dim dc As DocumentCore = DocumentCore.Load(inpFile, lo)

        ' Give every recognized run the same plain formatting.
        For Each r As Run In dc.GetChildElements(True, ElementType.Run)
            r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black
            r.CharacterFormat.Scaling = 100
            r.CharacterFormat.Spacing = 0
            r.CharacterFormat.Size = 12
        Next r

        ' Landscape Letter page with 5 mm margins (converted to points) on every side.
        Dim section As Section = dc.Sections(0)
        section.PageSetup.PaperType = PaperType.Letter
        section.PageSetup.Orientation = Orientation.Landscape
        Dim m As Double = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point)
        section.PageSetup.PageMargins = New PageMargins() With {
            .Top = m,
            .Left = m,
            .Right = m,
            .Bottom = m
        }

        dc.Save(outFile)

        ' Open the result in the application associated with .docx files.
        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
    End Sub

    ''' <summary>
    ''' OCR callback: runs Tesseract over the image, lets Tesseract render the result to a
    ''' temporary PDF inside the tessdata folder, reloads that PDF with SautinSoft.Document
    ''' and returns it re-saved as PDF bytes. The temporary PDF is always deleted afterwards.
    ''' </summary>
    ''' <param name="image">Encoded image bytes (any format SkiaSharp can decode).</param>
    ''' <returns>The bytes of a PDF document containing the recognized text.</returns>
    ''' <exception cref="Exception">
    ''' Any failure is reported to the console and rethrown as an Exception whose message starts
    ''' with "Error Tesseract: "; the original failure is kept as InnerException.
    ''' </exception>
    Public Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
        ' Specify that Tesseract uses three languages: Russian, English and Vietnamese.
        Dim tesseractLanguages As String = "rus+eng+vie"

        ' Folder with the *.traineddata language files; it is also used for the temporary PDF.
        Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")
        Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())

        ' The renderer is given the name without extension; the file read back below is "<name>.pdf".
        Dim tempPdf As String = tempFile & ".pdf"

        Try
            ' The renderer is fully disposed inside the helper before the PDF is read back.
            RenderSearchablePdf(image, tempFile, tesseractData, tesseractLanguages)

            Dim pl As New PdfLoadOptions()
            ' NOTE(review): presumably needed because Tesseract writes the recognized text as an
            ' invisible layer - confirm against the Tesseract PDF renderer documentation.
            pl.ShowInvisibleText = True
            pl.PreserveEmbeddedFonts = PropertyState.Disabled
            pl.ConversionMode = PdfConversionMode.Continuous

            ' Keep the source stream open until the document has been saved, then release it so
            ' the File.Delete in the Finally block is not blocked by our own open handle.
            Using pdfStream As FileStream = File.OpenRead(tempPdf)
                Using output As New MemoryStream()
                    Dim dc As DocumentCore = DocumentCore.Load(pdfStream, pl)
                    Dim ps As New PdfSaveOptions()
                    dc.Save(output, ps)
                    Return output.ToArray()
                End Using
            End Using
        Catch e As Exception
            Console.WriteLine()
            Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
            Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")

            ' Console.ReadKey throws when input is redirected, which would hide the real error.
            If Not Console.IsInputRedirected Then
                Console.ReadKey()
            End If

            ' Keep the original exception as InnerException so its stack trace is not lost.
            Throw New Exception("Error Tesseract: " & e.Message, e)
        Finally
            ' Do not leave the temporary PDF behind in the tessdata folder.
            If File.Exists(tempPdf) Then
                File.Delete(tempPdf)
            End If
        End Try
    End Function

    ''' <summary>
    ''' Recognizes a single image with Tesseract and writes the result to outputBaseName + ".pdf".
    ''' </summary>
    Private Sub RenderSearchablePdf(ByVal image() As Byte, ByVal outputBaseName As String, ByVal tesseractData As String, ByVal languages As String)
        Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(outputBaseName, tesseractData, True)
            Using renderer.BeginDocument("Serachablepdf")
                Using engine As New Tesseract.TesseractEngine(tesseractData, languages, Tesseract.EngineMode.Default)
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                    Using imageStream As New MemoryStream(image)
                        Using bitmap As SKBitmap = SKBitmap.Decode(imageStream)
                            If bitmap Is Nothing Then
                                Throw New InvalidOperationException("The image passed to OCR could not be decoded.")
                            End If

                            ' Re-encode as PNG before handing the bytes to Tesseract.
                            Using pngStream As New MemoryStream()
                                bitmap.Encode(pngStream, SKEncodedImageFormat.Png, 100)
                                Dim pngBytes() As Byte = pngStream.ToArray()

                                Using pix As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(pngBytes)
                                    Using page = engine.Process(pix, "Serachablepdf")
                                        renderer.AddPage(page)
                                    End Using
                                End Using
                            End Using
                        End Using
                    End Using
                End Using
            End Using
        End Using
    End Sub
End Module
See Also