Represents a class that stores options for loading images.
Inheritance Hierarchy Namespace: SautinSoft.Document Assembly: SautinSoft.Document (in SautinSoft.Document.dll) Version: 2024.11.20
Syntax public class ImageLoadOptions : LoadOptions
Public Class ImageLoadOptions
Inherits LoadOptions
The ImageLoadOptions type exposes the following members.
Constructors Properties | Name | Description |
---|
| OCROptions |
Allows you to specify options for OCR (optical character recognition) and to enable it.
|
Example: See the Developer Guide: Recognize an image using Tesseract (free OCR library) and save the result as a DOCX document
Recognize an image using Tesseract (free OCR library) and save the result as DOCX document using C#
using System.IO;
using SautinSoft.Document;
using System;
using SkiaSharp;
namespace Example
{
    /// <summary>
    /// Sample: loads an image with OCR enabled (Tesseract does the recognition),
    /// normalizes the recognized text and saves the result as a DOCX document.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            RecognizeImage();
        }

        /// <summary>
        /// Loads the input image through <see cref="ImageLoadOptions"/> with OCR enabled,
        /// resets character formatting on every run, applies a Letter/Landscape page
        /// setup with 5 mm margins, saves the document as DOCX and opens it.
        /// </summary>
        static void RecognizeImage()
        {
            string inpFile = @"..\..\..\image.png";
            string outFile = "Result1.docx";

            // Enable OCR and plug in our Tesseract-based callback.
            ImageLoadOptions lo = new ImageLoadOptions();
            lo.OCROptions.OCRMode = OCRMode.Enabled;
            lo.OCROptions.Method = PerformOCRTesseract;

            DocumentCore dc = DocumentCore.Load(inpFile, lo);

            // Give every recognized run the same plain formatting.
            foreach (Run r in dc.GetChildElements(true, ElementType.Run))
            {
                r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black;
                r.CharacterFormat.Scaling = 100;
                r.CharacterFormat.Spacing = 0;
                r.CharacterFormat.Size = 12;
            }

            // Page setup of the first section: Letter, landscape, 5 mm margins (converted to points).
            Section section = dc.Sections[0];
            section.PageSetup.PaperType = PaperType.Letter;
            section.PageSetup.Orientation = Orientation.Landscape;
            double m = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point);
            section.PageSetup.PageMargins = new PageMargins() { Top = m, Left = m, Right = m, Bottom = m };

            dc.Save(outFile);

            // Open the result with the shell's default application.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// OCR callback: runs Tesseract over the supplied image, renders the result to a
        /// temporary PDF inside the "tessdata" folder, reloads that PDF with
        /// SautinSoft.Document and returns it re-saved as PDF bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes to recognize.</param>
        /// <returns>The bytes of a PDF document containing the recognized text.</returns>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown as an <see cref="Exception"/>
        /// whose message starts with "Error Tesseract: "; the original exception is
        /// available through <see cref="Exception.InnerException"/>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            string tesseractLanguages = "rus+eng+vie";
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
            // The code below reads the renderer's output from tempFile + ".pdf".
            string tempPdf = tempFile + @".pdf";

            try
            {
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, true))
                {
                    using (renderer.BeginDocument("Serachablepdf"))
                    {
                        using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                        {
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                            using (MemoryStream msImg = new MemoryStream(image))
                            // The bitmap is disposable; release it as soon as it has been re-encoded.
                            using (SKBitmap imgWithText = SKBitmap.Decode(msImg))
                            using (MemoryStream msPng = new MemoryStream())
                            {
                                // Re-encode as PNG before handing the bytes to Tesseract.
                                imgWithText.Encode(msPng, SKEncodedImageFormat.Png, 100);
                                byte[] imgBytes = msPng.ToArray();

                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                {
                                    renderer.AddPage(page);
                                }
                            }
                        }
                    }
                }

                PdfLoadOptions pl = new PdfLoadOptions();
                pl.ShowInvisibleText = true;
                pl.PreserveEmbeddedFonts = PropertyState.Disabled;
                pl.ConversionMode = PdfConversionMode.Continuous;

                // Keep the source stream open until the document has been saved, then
                // dispose it so the temporary file can be deleted in the finally block.
                using (FileStream pdfStream = File.OpenRead(tempPdf))
                using (MemoryStream msPdf = new MemoryStream())
                {
                    DocumentCore dc = DocumentCore.Load(pdfStream, pl);
                    PdfSaveOptions ps = new PdfSaveOptions();
                    dc.Save(msPdf, ps);
                    return msPdf.ToArray();
                }
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Preserve the original exception (and its stack trace) as InnerException.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Do not leave the temporary PDF behind in the tessdata folder.
                if (File.Exists(tempPdf))
                {
                    File.Delete(tempPdf);
                }
            }
        }
    }
}
Recognize an image using Tesseract (free OCR library) and save the result as DOCX document using VB.Net
Imports System
Imports System.IO
Imports SautinSoft.Document
Imports SkiaSharp
''' <summary>
''' Sample: loads an image with OCR enabled (Tesseract does the recognition),
''' normalizes the recognized text and saves the result as a DOCX document.
''' </summary>
Module Sample
    Sub Main()
        RecognizeImage()
    End Sub

    ''' <summary>
    ''' Loads the input image through ImageLoadOptions with OCR enabled, resets
    ''' character formatting on every run, applies a Letter/Landscape page setup
    ''' with 5 mm margins, saves the document as DOCX and opens it.
    ''' </summary>
    Sub RecognizeImage()
        Dim inpFile As String = "..\..\..\image.png"
        Dim outFile As String = "Result.docx"

        ' Enable OCR and plug in our Tesseract-based callback.
        Dim lo As New ImageLoadOptions()
        lo.OCROptions.OCRMode = OCRMode.Enabled
        lo.OCROptions.Method = AddressOf PerformOCRTesseract

        Dim dc As DocumentCore = DocumentCore.Load(inpFile, lo)

        ' Give every recognized run the same plain formatting.
        For Each r As Run In dc.GetChildElements(True, ElementType.Run)
            r.CharacterFormat.FontColor = SautinSoft.Document.Color.Black
            r.CharacterFormat.Scaling = 100
            r.CharacterFormat.Spacing = 0
            r.CharacterFormat.Size = 12
        Next r

        ' Page setup of the first section: Letter, landscape, 5 mm margins (converted to points).
        Dim section As Section = dc.Sections(0)
        section.PageSetup.PaperType = PaperType.Letter
        section.PageSetup.Orientation = Orientation.Landscape
        Dim m As Double = LengthUnitConverter.Convert(5, LengthUnit.Millimeter, LengthUnit.Point)
        section.PageSetup.PageMargins = New PageMargins() With {
            .Top = m,
            .Left = m,
            .Right = m,
            .Bottom = m
        }

        dc.Save(outFile)

        ' Open the result with the shell's default application.
        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
    End Sub

    ''' <summary>
    ''' OCR callback: runs Tesseract over the supplied image, renders the result to a
    ''' temporary PDF inside the "tessdata" folder, reloads that PDF with
    ''' SautinSoft.Document and returns it re-saved as PDF bytes.
    ''' </summary>
    ''' <param name="image">Encoded image bytes to recognize.</param>
    ''' <returns>The bytes of a PDF document containing the recognized text.</returns>
    ''' <exception cref="Exception">
    ''' Any failure is reported on the console and rethrown as an Exception whose
    ''' message starts with "Error Tesseract: "; the original exception is available
    ''' through InnerException.
    ''' </exception>
    Public Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
        Dim tesseractLanguages As String = "rus+eng+vie"
        Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")
        Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
        ' The code below reads the renderer's output from tempFile & ".pdf".
        Dim tempPdf As String = tempFile & ".pdf"

        Try
            Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, True)
                Using renderer.BeginDocument("Serachablepdf")
                    Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                        Using msImg As New MemoryStream(image)
                            ' The bitmap is disposable; release it as soon as it has been re-encoded.
                            Using imgWithText As SKBitmap = SKBitmap.Decode(msImg)
                                Using msPng As New MemoryStream()
                                    ' Re-encode as PNG before handing the bytes to Tesseract.
                                    imgWithText.Encode(msPng, SKEncodedImageFormat.Png, 100)
                                    Dim imgBytes() As Byte = msPng.ToArray()

                                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                        Using page = engine.Process(img, "Serachablepdf")
                                            renderer.AddPage(page)
                                        End Using
                                    End Using
                                End Using
                            End Using
                        End Using
                    End Using
                End Using
            End Using

            Dim pl As New PdfLoadOptions()
            pl.ShowInvisibleText = True
            pl.PreserveEmbeddedFonts = PropertyState.Disabled
            pl.ConversionMode = PdfConversionMode.Continuous

            ' Keep the source stream open until the document has been saved, then
            ' dispose it so the Finally block can delete the temporary file
            ' (deleting a file that is still open fails on Windows).
            Dim returnPdf() As Byte
            Using pdfStream As FileStream = File.OpenRead(tempPdf)
                Using msPdf As New MemoryStream()
                    Dim dc As DocumentCore = DocumentCore.Load(pdfStream, pl)
                    Dim ps As New PdfSaveOptions()
                    dc.Save(msPdf, ps)
                    returnPdf = msPdf.ToArray()
                End Using
            End Using
            Return returnPdf
        Catch e As Exception
            Console.WriteLine()
            Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
            Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
            Console.ReadKey()
            ' Preserve the original exception (and its stack trace) as InnerException.
            Throw New Exception("Error Tesseract: " & e.Message, e)
        Finally
            ' Do not leave the temporary PDF behind in the tessdata folder.
            If File.Exists(tempPdf) Then
                File.Delete(tempPdf)
            End If
        End Try
    End Function
End Module
See Also