Represents a class that stores OCR (optical character recognition) options used when loading a document.
Inheritance Hierarchy System.Object
SautinSoft.Document.OCROptions
Namespace: SautinSoft.Document Assembly: SautinSoft.Document (in SautinSoft.Document.dll) Version: 2024.11.20
Syntax public sealed class OCROptions
Public NotInheritable Class OCROptions
The OCROptions type exposes the following members.
Constructors | Name | Description |
---|
| OCROptions |
A constructor for working with OCR.
|
TopProperties | Name | Description |
---|
| Method |
The method used to perform OCR (any third-party engine can be plugged in). We offer a free library from Nicomsoft: https://www.nicomsoft.com/nicomsoft-ocr-sdk-is-freeware-now.
|
| OCRMode |
Gets or sets OCR mode. Default value: Disabled.
|
TopExample See Developer Guide: Recognize an image using Tesseract (free OCR library)
Recognize an image using Tesseract (free OCR library) using C#
using System.IO;
using SautinSoft.Document;
using System;
using SkiaSharp;
namespace Example
{
    /// <summary>
    /// Sample program: loads an image with OCR enabled, using Tesseract as the
    /// OCR callback, and saves the recognized content as a DOCX file.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            RecognizeImage();
        }

        /// <summary>
        /// Loads <c>image.png</c> through <see cref="ImageLoadOptions"/> with OCR enabled,
        /// normalizes the formatting of every recognized run, adjusts the page setup of
        /// the first section, saves the result and opens it.
        /// </summary>
        static void RecognizeImage()
        {
            string inpFile = @"..\..\..\image.png";
            string outFile = "Result1.docx";

            // Enable OCR and register the Tesseract-based callback defined below.
            ImageLoadOptions lo = new ImageLoadOptions();
            lo.OCROptions.OCRMode = OCRMode.Enabled;
            lo.OCROptions.Method = PerformOCRTesseract;

            DocumentCore dc = DocumentCore.Load(inpFile, lo);

            // Give every run in the loaded document the same uniform character formatting.
            foreach (Run r in dc.GetChildElements(true, ElementType.Run))
            {
                r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black;
                r.CharacterFormat.Scaling = 100;
                r.CharacterFormat.Spacing = 0;
                r.CharacterFormat.Size = 12;
            }

            // Page setup of the first section: Letter, landscape, 5 mm margins (converted to points).
            Section section = dc.Sections[0];
            section.PageSetup.PaperType = PaperType.Letter;
            section.PageSetup.Orientation = Orientation.Landscape;
            double m = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point);
            section.PageSetup.PageMargins = new PageMargins() { Top = m, Left = m, Right = m, Bottom = m };

            dc.Save(outFile);

            // Hand the saved file to the operating system shell.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// OCR callback: runs Tesseract over the supplied image, renders the result to a
        /// temporary PDF inside the <c>tessdata</c> folder, reloads that PDF with
        /// SautinSoft.Document and returns it re-saved as PDF bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes to recognize.</param>
        /// <returns>The bytes of the PDF produced from the recognized page.</returns>
        /// <exception cref="Exception">
        /// Thrown when any step fails; the original failure is available through
        /// <see cref="Exception.InnerException"/>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            string tesseractLanguages = "rus+eng+vie";
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
            // The renderer receives tempFile without an extension; the rendered
            // document is read back from tempFile + ".pdf" below.
            string tempPdf = tempFile + @".pdf";
            try
            {
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, true))
                {
                    using (renderer.BeginDocument("Serachablepdf"))
                    {
                        using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                        {
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;
                            using (MemoryStream msImg = new MemoryStream(image))
                            // Dispose the decoded bitmap instead of leaving it to the finalizer.
                            using (SKBitmap imgWithText = SKBitmap.Decode(msImg))
                            {
                                if (imgWithText == null)
                                {
                                    throw new InvalidOperationException("The image passed to OCR could not be decoded.");
                                }

                                using (MemoryStream ms = new MemoryStream())
                                {
                                    // Re-encode the decoded bitmap as PNG before passing it to Tesseract.
                                    imgWithText.Encode(ms, SKEncodedImageFormat.Png, 100);
                                    byte[] imgBytes = ms.ToArray();
                                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                    {
                                        using (var page = engine.Process(img, "Serachablepdf"))
                                        {
                                            renderer.AddPage(page);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                PdfLoadOptions pl = new PdfLoadOptions();
                pl.ShowInvisibleText = true;
                pl.PreserveEmbeddedFonts = PropertyState.Disabled;
                pl.ConversionMode = PdfConversionMode.Continuous;

                byte[] returnPdf;
                // Keep the file stream open until the document has been re-saved, then
                // dispose it so the temporary file can be deleted in the finally block.
                using (FileStream pdfStream = File.OpenRead(tempPdf))
                {
                    DocumentCore dc = DocumentCore.Load(pdfStream, pl);
                    using (MemoryStream ms = new MemoryStream())
                    {
                        PdfSaveOptions ps = new PdfSaveOptions();
                        dc.Save(ms, ps);
                        returnPdf = ms.ToArray();
                    }
                }
                return returnPdf;
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Same type and message as before, but keep the original exception as InnerException.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Remove the temporary PDF so it does not accumulate in the tessdata folder.
                if (File.Exists(tempPdf))
                {
                    File.Delete(tempPdf);
                }
            }
        }
    }
}
Recognize an image using Tesseract (free OCR library) using VB.Net
Imports System
Imports System.IO
Imports SautinSoft.Document
Imports SkiaSharp
''' <summary>
''' Sample module: loads an image with OCR enabled, using Tesseract as the
''' OCR callback, and saves the recognized content as a DOCX file.
''' </summary>
Module Sample
    Sub Main()
        RecognizeImage()
    End Sub

    ''' <summary>
    ''' Loads image.png through ImageLoadOptions with OCR enabled, normalizes the
    ''' formatting of every recognized run, adjusts the page setup of the first
    ''' section, saves the result and opens it.
    ''' </summary>
    Sub RecognizeImage()
        Dim inpFile As String = "..\..\..\image.png"
        Dim outFile As String = "Result.docx"

        ' Enable OCR and register the Tesseract-based callback defined below.
        Dim lo As New ImageLoadOptions()
        lo.OCROptions.OCRMode = OCRMode.Enabled
        lo.OCROptions.Method = AddressOf PerformOCRTesseract

        Dim dc As DocumentCore = DocumentCore.Load(inpFile, lo)

        ' Give every run in the loaded document the same uniform character formatting.
        For Each r As Run In dc.GetChildElements(True, ElementType.Run)
            r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black
            r.CharacterFormat.Scaling = 100
            r.CharacterFormat.Spacing = 0
            r.CharacterFormat.Size = 12
        Next r

        ' Page setup of the first section: Letter, landscape, 5 mm margins (converted to points).
        Dim section As Section = dc.Sections(0)
        section.PageSetup.PaperType = PaperType.Letter
        section.PageSetup.Orientation = Orientation.Landscape
        Dim m As Double = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point)
        section.PageSetup.PageMargins = New PageMargins() With {
            .Top = m,
            .Left = m,
            .Right = m,
            .Bottom = m
        }

        dc.Save(outFile)

        ' Hand the saved file to the operating system shell.
        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
    End Sub

    ''' <summary>
    ''' OCR callback: runs Tesseract over the supplied image, renders the result to a
    ''' temporary PDF inside the tessdata folder, reloads that PDF with
    ''' SautinSoft.Document and returns it re-saved as PDF bytes.
    ''' </summary>
    ''' <param name="image">Encoded image bytes to recognize.</param>
    ''' <returns>The bytes of the PDF produced from the recognized page.</returns>
    ''' <exception cref="Exception">
    ''' Thrown when any step fails; the original failure is available through InnerException.
    ''' </exception>
    Public Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
        Dim tesseractLanguages As String = "rus+eng+vie"
        Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")
        Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
        ' The renderer receives tempFile without an extension; the rendered
        ' document is read back from tempFile & ".pdf" below.
        Dim tempPdf As String = tempFile & ".pdf"
        Try
            Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, True)
                Using renderer.BeginDocument("Serachablepdf")
                    Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto
                        Using msImg As New MemoryStream(image)
                            ' Dispose the decoded bitmap instead of leaving it to the finalizer.
                            Using imgWithText As SKBitmap = SKBitmap.Decode(msImg)
                                If imgWithText Is Nothing Then
                                    Throw New InvalidOperationException("The image passed to OCR could not be decoded.")
                                End If

                                Using ms As New MemoryStream()
                                    ' Re-encode the decoded bitmap as PNG before passing it to Tesseract.
                                    imgWithText.Encode(ms, SKEncodedImageFormat.Png, 100)
                                    Dim imgBytes() As Byte = ms.ToArray()
                                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                        Using page = engine.Process(img, "Serachablepdf")
                                            renderer.AddPage(page)
                                        End Using
                                    End Using
                                End Using
                            End Using
                        End Using
                    End Using
                End Using
            End Using

            Dim pl As New PdfLoadOptions()
            pl.ShowInvisibleText = True
            pl.PreserveEmbeddedFonts = PropertyState.Disabled
            pl.ConversionMode = PdfConversionMode.Continuous

            Dim returnPdf() As Byte
            ' Keep the file stream open until the document has been re-saved, then
            ' dispose it so the File.Delete call in Finally is not blocked by an open handle.
            Using pdfStream As FileStream = File.OpenRead(tempPdf)
                Dim dc As DocumentCore = DocumentCore.Load(pdfStream, pl)
                Using ms As New MemoryStream()
                    Dim ps As New PdfSaveOptions()
                    dc.Save(ms, ps)
                    returnPdf = ms.ToArray()
                End Using
            End Using
            Return returnPdf
        Catch e As Exception
            Console.WriteLine()
            Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
            Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
            Console.ReadKey()
            ' Same type and message as before, but keep the original exception as InnerException.
            Throw New Exception("Error Tesseract: " & e.Message, e)
        Finally
            ' Remove the temporary PDF so it does not accumulate in the tessdata folder.
            If File.Exists(tempPdf) Then
                File.Delete(tempPdf)
            End If
        End Try
    End Function
End Module
See Also