Как конвертировать PDF в HTML на C# и .NET

  1. Добавьте SautinSoft.Document из Nuget.
  2. Загрузите входной документ.
  3. Сохраните в формате HTML.

SautinSoft.Document может помочь Вашему приложению преобразовать документ из одного формата в другой.
Вам нужно будет только Load() документ и Save() в нужном формате:


            DocumentCore dc = DocumentCore.Load("...");
            dc.Save("....");

SautinSoft.Document поддерживает форматы:

PDF DOCX RTF HTML Текст Изображения
Create/Read/Write Create/Read/Write Create/Read/Write Create/Read/Write Create/Read/Write Create/Read(OCR)/Write

Полный код

using System.IO;
using SautinSoft.Document;

namespace Example
{
    class Program
    {
        static void Main(string[] args)
        {
            // Get your free 100-day key here:   
            // https://sautinsoft.com/start-for-free/

            ConvertFromFile();
            ConvertFromStream();
        }

        /// <summary>
        /// Convert PDF to HTML (file to file).
        /// </summary>
		/// <remarks>
        /// Details: https://sautinsoft.com/products/document/help/net/developer-guide/convert-pdf-to-html-in-csharp-vb.php
        /// </remarks>
        static void ConvertFromFile()
        {
            string inpFile = @"..\..\..\example.pdf";
            string outFile = @"Result.html";

            // Specifying PdfLoadOptions we explicitly set that a loadable document is PDF.
            PdfLoadOptions pdfLO = new PdfLoadOptions()
            {
                // 'false' - means to load vector graphics as is. Don't transform it to raster images.
                RasterizeVectorGraphics = false,

                // The PDF format doesn't have real tables, in fact it's a set of orthogonal graphic lines.
                // In case of 'true' the component will detect and recreate tables from graphic lines.
                DetectTables = false,                
                // 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
				// 'Enabled' - Always load embedded fonts in PDF.
				// 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.				
                PreserveEmbeddedFonts = PropertyState.Auto
            };

            DocumentCore dc = DocumentCore.Load(inpFile, pdfLO);
            dc.Save(outFile, new HtmlFixedSaveOptions());
			
			// Important for Linux: Install MS Fonts
			// sudo apt install ttf-mscorefonts-installer -y

            // Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// Convert PDF to HTML (using Stream).
        /// </summary>
		/// <remarks>
        /// Details: https://sautinsoft.com/products/document/help/net/developer-guide/convert-pdf-to-html-in-csharp-vb.php
        /// </remarks>
        static void ConvertFromStream()
        {

            // We need files only for demonstration purposes.
            // The conversion process will be done completely in memory.
            string inpFile = @"..\..\..\example.pdf";
            string outFile = @"ResultStream.html";
            byte[] inpData = File.ReadAllBytes(inpFile);
            byte[] outData = null;

            using (MemoryStream msInp = new MemoryStream(inpData))
            {
                // Specifying PdfLoadOptions we explicitly set that a loadable document is PDF.
                PdfLoadOptions pdfLO = new PdfLoadOptions()
                {
                    // 'false' - means to load vector graphics as is. Don't transform it to raster images.
                    RasterizeVectorGraphics = false,

                    // The PDF format doesn't have real tables, in fact it's a set of orthogonal graphic lines.
                    // In case of 'true' the component will detect and recreate tables from graphic lines.
                    DetectTables = false,
                    // 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
					// 'Enabled' - Always load embedded fonts in PDF.
					// 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.					
                    PreserveEmbeddedFonts = PropertyState.Auto
                };

                // Load a document.
                DocumentCore dc = DocumentCore.Load(msInp, pdfLO);

                // Save the document to HTML format.
                using (MemoryStream outMs = new MemoryStream())
                {
                    dc.Save(outMs, new HtmlFixedSaveOptions() );
                    outData = outMs.ToArray(); 

			// Important for Linux: Install MS Fonts
			// sudo apt install ttf-mscorefonts-installer -y					
                }
                // Show the result for demonstration purposes.
                if (outData != null)
                {
                    File.WriteAllBytes(outFile, outData);
                    System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
                }
            }
        }
    }
}

Download

Imports System.IO
Imports SautinSoft.Document

Namespace Example
    Friend Class Program
        Shared Sub Main(ByVal args() As String)
            ConvertFromFile()
            ConvertFromStream()
        End Sub
        ''' Get your free 100-day key here:   
        ''' https://sautinsoft.com/start-for-free/
        ''' <summary>
        ''' Convert PDF to HTML (file to file).
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/document/help/net/developer-guide/convert-pdf-to-html-in-csharp-vb.php
        ''' </remarks>
        Private Shared Sub ConvertFromFile()
            Dim inpFile As String = "..\..\..\example.pdf"
            Dim outFile As String = "Result.html"

            ' Specifying PdfLoadOptions we explicitly set that a loadable document is PDF.
            Dim pdfLO As New PdfLoadOptions()
            With pdfLO
                .RasterizeVectorGraphics = False
                .DetectTables = False
                ' 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
                ' 'Enabled' - Always load embedded fonts in PDF.
                ' 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
                .PreserveEmbeddedFonts = PropertyState.Auto
            End With

            ' RasterizeVectorGraphics = False
            ' This means to load vector graphics as is. Don't transform it to raster images.

            ' DetectTables = False
            ' This means don't detect tables.
            ' The PDF format doesn't have real tables, in fact it's a set of orthogonal graphic lines.
            ' Set it to 'True' and the component will detect and recreate tables from graphic lines.

            Dim dc As DocumentCore = DocumentCore.Load(inpFile, pdfLO)
            dc.Save(outFile, New HtmlFixedSaveOptions())
			
			' Important for Linux: Install MS Fonts
			' sudo apt install ttf-mscorefonts-installer -y

            ' Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
        End Sub

        ''' <summary>
        ''' Convert PDF to HTML (using Stream).
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/document/help/net/developer-guide/convert-pdf-to-html-in-csharp-vb.php
        ''' </remarks>
        Private Shared Sub ConvertFromStream()

            ' We need files only for demonstration purposes.
            ' The conversion process will be done completely in memory.
            Dim inpFile As String = "..\..\..\example.pdf"
            Dim outFile As String = "ResultStream.html"
            Dim inpData() As Byte = File.ReadAllBytes(inpFile)
            Dim outData() As Byte = Nothing

            Using msInp As New MemoryStream(inpData)

                ' Specifying PdfLoadOptions we explicitly set that a loadable document is PDF.
                Dim pdfLO As New PdfLoadOptions()
                With pdfLO
                    .RasterizeVectorGraphics = False
                    .DetectTables = False
                    ' 'Disabled' - Never load embedded fonts in PDF. Use the fonts with the same name installed at the system or similar by font metrics.
                    ' 'Enabled' - Always load embedded fonts in PDF.
                    ' 'Auto' - Load only embedded fonts missing in the system. In other case, use the system fonts.
                    .PreserveEmbeddedFonts = PropertyState.Auto
                End With


                ' RasterizeVectorGraphics = False
                ' This means to load vector graphics as is. Don't transform it to raster images.

                ' DetectTables = False
                ' This means don't detect tables.
                ' The PDF format doesn't have real tables, in fact it's a set of orthogonal graphic lines.
                ' Set it to 'True' and the component will detect and recreate tables from graphic lines.

                ' Load a document.
                Dim dc As DocumentCore = DocumentCore.Load(msInp, pdfLO)

                ' Save the document to HTML format.
                Using outMs As New MemoryStream()
                    dc.Save(outMs, New HtmlFixedSaveOptions())
                    outData = outMs.ToArray()
                End Using
                ' Show the result for demonstration purposes.
                If outData IsNot Nothing Then
                    File.WriteAllBytes(outFile, outData)
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
                End If
            End Using
        End Sub
    End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос: напишите нам по адресу support@sautinsoft.com или спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .Net с 2002 года. Мы знаем форматы PDF, DOCX, RTF, HTML, XLSX и Images. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.