Как конвертировать документ PDF с отсканированными изображениями в Word на C# и .NET


В этом примере показано, как извлечь только текст из отсканированного PDF-файла.

Предположим, у нас есть заполненный договор аренды автомобиля в файле Scanned.pdf. В этом файле также есть скрытый текст, который дублирует содержимое изображения.

Задача состоит в том, чтобы создать соответствующий документ Word, сделав текст видимым и удалив все изображения.

Загрузите полученный файл: Result-Scanned PDF для Word.docx

Полный код

using SautinSoft.Document;

namespace Example
{
    /// <summary>
    /// Console sample that converts a scanned PDF carrying a hidden text layer into a Word document.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point; runs the conversion sample.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Get your free 100-day key here:
            // https://sautinsoft.com/start-for-free/

            ScannedPdfToWord();
        }

        /// <summary>
        /// Converts a PDF document with scanned images to Word.
        /// It works only if the PDF document contains hidden text on top of the images.
        /// </summary>
        /// <remarks>
        /// Details: https://www.sautinsoft.com/products/document/help/net/developer-guide/from-customers-scanned-pdf-to-word-in-csharp-vb-net.php
        /// </remarks>
        static void ScannedPdfToWord()
        {
            // Many PDF documents look as if they were produced by a scanner,
            // yet they also contain hidden text on top of the page contents.
            // This hidden text duplicates the content of the scanned images
            // and exists so that the 'find' operation works.

            // Our steps:
            // 1. Load the PDF with these settings:
            //    - show hidden text;
            //    - skip all images during loading.
            // 2. Change the font color to 'Black' for all of the text.
            // 3. Save the document as DOCX.
            string inpFile = @"..\..\..\Scanned.pdf";
            string outFile = @"Result.docx";

            PdfLoadOptions pdfLO = new PdfLoadOptions()
            {
                // 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
                // 'Enabled' - Always load embedded fonts in PDF.
                // 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
                PreserveEmbeddedFonts = PropertyState.Enabled,
                PreserveImages = false,
                ShowInvisibleText = true,
            };

            DocumentCore dc = DocumentCore.Load(inpFile, pdfLO);

            // Set the document default first, then override every run and paragraph mark explicitly.
            dc.DefaultCharacterFormat.FontColor = Color.Black;
            foreach (Element element in dc.GetChildElements(true, ElementType.Paragraph))
            {
                // Type patterns replace the unchecked 'as' casts, so an unexpected
                // element type is skipped instead of causing a NullReferenceException.
                if (element is Paragraph paragraph)
                {
                    foreach (Inline inline in paragraph.Inlines)
                    {
                        if (inline is Run run)
                        {
                            run.CharacterFormat.FontColor = Color.Black;
                        }
                    }

                    paragraph.CharacterFormatForParagraphMark.FontColor = Color.Black;
                }
            }

            dc.Save(outFile);

            // Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }
    }
}

Download

Imports SautinSoft.Document

Namespace Example
	''' <summary>
	''' Console sample that converts a scanned PDF carrying a hidden text layer into a Word document.
	''' </summary>
	Friend Class Program
		''' <summary>
		''' Entry point; runs the conversion sample.
		''' </summary>
		''' <param name="args">Command-line arguments (not used).</param>
		Shared Sub Main(ByVal args() As String)
			' Get your free 100-day key here:
			' https://sautinsoft.com/start-for-free/

			ScannedPdfToWord()
		End Sub

		''' <summary>
		''' Converts a PDF document with scanned images to Word.
		''' It works only if the PDF document contains hidden text on top of the images.
		''' </summary>
		''' <remarks>
		''' Details: https://www.sautinsoft.com/products/document/help/net/developer-guide/from-customers-scanned-pdf-to-word-in-csharp-vb-net.php
		''' </remarks>
		Private Shared Sub ScannedPdfToWord()
			' Many PDF documents look as if they were produced by a scanner,
			' yet they also contain hidden text on top of the page contents.
			' This hidden text duplicates the content of the scanned images
			' and exists so that the 'find' operation works.

			' Our steps:
			' 1. Load the PDF with these settings:
			'    - show hidden text;
			'    - skip all images during loading.
			' 2. Change the font color to 'Black' for all of the text.
			' 3. Save the document as DOCX.
			Dim inpFile As String = "..\..\..\Scanned.pdf"
			Dim outFile As String = "Result.docx"

			Dim pdfLO As New PdfLoadOptions
			With pdfLO
				' 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
				' 'Enabled' - Always load embedded fonts in PDF.
				' 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
				.PreserveEmbeddedFonts = PropertyState.Enabled
				.PreserveImages = False
				.ShowInvisibleText = True
			End With

			Dim dc As DocumentCore = DocumentCore.Load(inpFile, pdfLO)

			' Set the document default first, then override every run and paragraph mark explicitly.
			dc.DefaultCharacterFormat.FontColor = Color.Black
			For Each element As Element In dc.GetChildElements(True, ElementType.Paragraph)
				' Cast once and check the result, so an unexpected element type is
				' skipped instead of causing a NullReferenceException.
				Dim para As Paragraph = TryCast(element, Paragraph)
				If para Is Nothing Then
					Continue For
				End If

				For Each inline As Inline In para.Inlines
					Dim textRun As Run = TryCast(inline, Run)
					If textRun IsNot Nothing Then
						textRun.CharacterFormat.FontColor = Color.Black
					End If
				Next inline

				para.CharacterFormatForParagraphMark.FontColor = Color.Black
			Next element

			dc.Save(outFile)

			' Open the result for demonstration purposes.
			System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
		End Sub
	End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос, напишите нам по адресу support@sautinsoft.ru, спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .NET с 2002 года. Мы знаем форматы PDF, DOCX, RTF, HTML, XLSX и форматы изображений. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.