OCR PDF with Vector Text in C# and .NET

OCR (оптическое распознавание символов) — это технология, которая позволяет преобразовывать различные типы документов, такие как отсканированные бумажные документы, PDF-файлы или изображения, снятые камерой, в редактируемые и доступные для поиска данные.

В этой статье мы рассмотрим, как на C# и .NET с помощью библиотеки SautinSoft.Pdf распознать текст в PDF-документе, в котором текст представлен в виде векторной графики.

Пошаговое руководство:

Добавить пакет SautinSoft.Pdf из NuGet.
Загрузить PDF-документ.
Сохранить векторизованное содержимое в виде объектов изображения.
Выполнить распознавание текста.
Сохранить документ в формате DOCX.

Входной файл: simple text.pdf

Выходной результат:

Полный код

C#
VB.Net

GitHub

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// OCR a PDF document containing text as vector graphics.
        /// </summary>
        /// <remarks>
        /// Loads "Vectorized text.pdf", saves it into an in-memory image, runs Tesseract over that image,
        /// draws every recognized text line onto a page of a new document, saves the result as "text.docx"
        /// and opens it with the shell.
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is rethrown with the prefix "Error Tesseract: "; the original exception is
        /// attached as <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                PdfDocument pdfDocument = PdfDocument.Load(@"..\..\..\Vectorized text.pdf");

                // Save the source document as an image; the stream is only needed
                // long enough to copy out the encoded bytes, so dispose it right away.
                byte[] imgBytes;
                using (MemoryStream ms = new MemoryStream())
                {
                    pdfDocument.Save(ms, new ImageSaveOptions());
                    imgBytes = ms.ToArray();
                }

                // From here on pdfDocument is the new, empty output document.
                pdfDocument = new PdfDocument();
                PdfFormattedText text = new PdfFormattedText();

                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    var pdfPage = pdfDocument.Pages.Add();
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                    using (var page = engine.Process(img, "Serachablepdf"))
                    {
                        // Uniform factor that fits the recognized image region into the output page.
                        double scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height);

                        using (var iter = page.GetIterator())
                        {
                            iter.Begin();

                            // Walk block -> paragraph -> text line.
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        // Skip lines for which Tesseract reports no bounding box instead of
                                        // drawing them with a default (empty) rectangle.
                                        if (iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                        {
                                            text.FontSize = liRect.Height * scale;
                                            // NOTE(review): the original sample had "text.Opacity = 0" disabled here —
                                            // presumably meant for an invisible text layer; confirm before enabling.
                                            text.Append(iter.GetText(PageIteratorLevel.TextLine));
                                            // The Y coordinate is flipped relative to the page height — presumably because
                                            // the image box is measured from the top and the page from the bottom.
                                            pdfPage.Content.DrawText(text, new PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height));
                                            text.Clear();
                                        }
                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                            } while (iter.Next(PageIteratorLevel.Block));
                        }
                    }
                }

                pdfDocument.Save(@"text.docx");
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(@"text.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}

Download

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
    Friend Class OCR
        ''' <summary>
        ''' OCR a PDF document containing text as vector graphics.
        ''' </summary>
        ''' <remarks>
        ''' Loads "Vectorized text.pdf", saves it into an in-memory image, runs Tesseract over that image,
        ''' draws every recognized text line onto a page of a new document, saves the result as "text.docx"
        ''' and opens it with the shell.
        ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
        ''' </remarks>
        ''' <exception cref="Exception">
        ''' Any failure is rethrown with the prefix "Error Tesseract: "; the original exception is
        ''' attached as InnerException.
        ''' </exception>
        Public Shared Sub Main()
            Try
                Dim tesseractLanguages = "eng"
                Dim tesseractData = Path.GetFullPath(".\tessdata")

                Dim pdfDocument As PdfDocument = PdfDocument.Load("..\..\..\Vectorized text.pdf")

                ' Save the source document as an image; the stream is only needed
                ' long enough to copy out the encoded bytes, so dispose it right away.
                Dim imgBytes As Byte()
                Using ms As New MemoryStream()
                    pdfDocument.Save(ms, New ImageSaveOptions())
                    imgBytes = ms.ToArray()
                End Using

                ' From here on pdfDocument is the new, empty output document.
                pdfDocument = New PdfDocument()
                Dim text As PdfFormattedText = New PdfFormattedText()

                Using engine As TesseractEngine = New TesseractEngine(tesseractData, tesseractLanguages, EngineMode.Default)
                    Dim pdfPage = pdfDocument.Pages.Add()
                    engine.DefaultPageSegMode = PageSegMode.Auto

                    Using img = Pix.LoadFromMemory(imgBytes)
                        Using page = engine.Process(img, "Serachablepdf")
                            ' Uniform factor that fits the recognized image region into the output page.
                            Dim scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height)

                            Using iter = page.GetIterator()
                                iter.Begin()

                                ' Walk block -> paragraph -> text line.
                                Do
                                    Do
                                        Do
                                            Dim liRect As Rect = Nothing
                                            ' Skip lines for which Tesseract reports no bounding box instead of
                                            ' drawing them with a default (empty) rectangle.
                                            If iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
                                                text.FontSize = liRect.Height * scale
                                                ' NOTE(review): the original sample had "text.Opacity = 0" disabled here -
                                                ' presumably meant for an invisible text layer; confirm before enabling.
                                                text.Append(iter.GetText(PageIteratorLevel.TextLine))
                                                ' The Y coordinate is flipped relative to the page height - presumably because
                                                ' the image box is measured from the top and the page from the bottom.
                                                pdfPage.Content.DrawText(text, New PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height))
                                                text.Clear()
                                            End If
                                        Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                                    Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
                                Loop While iter.Next(PageIteratorLevel.Block)
                            End Using
                        End Using
                    End Using
                End Using

                pdfDocument.Save("text.docx")
                ' Fully qualified: this file has no "Imports System.Diagnostics".
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("text.docx") With {
                    .UseShellExecute = True
                })
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Keep the original exception as InnerException so its type and stack trace are not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            End Try
        End Sub
    End Class
End Namespace

Download

Если вам нужен пример кода или у вас есть вопрос, напишите нам по адресу support@sautinsoft.ru, спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже:

Имя (необязательно):

Email:

Сообщение:

Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .NET с 2002 года. Мы хорошо знаем форматы PDF, DOCX, RTF, HTML, XLSX и форматы изображений. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.

OCR PDF with Vector Text in C# and .NET

Цифры введены неправильно, пожалуйста, попробуйте ещё раз.

Вопросы и предложения всегда приветствуются!