Как преобразовать PDF в HTML в памяти на C# и .NET


Полный код

using System;
using System.IO;

namespace Sample
{
    class Sample
    {
        static void Main(string[] args)
        {
            ConvertPdfBytesToHtml();
            //ConvertPdfStreamToHtml();
        }

        private static void ConvertPdfBytesToHtml()
        {
            // We need files only for demonstration purposes.
            // The whole conversion process will be done in memory.
            string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");
            string htmlFile = "Result.html";

                                  // Get your free 30-day key here:   
			 // https://sautinsoft.com/start-for-free/
			
            // Convert PDF to HTML in memory
            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();


            // Let's force the component to store images inside HTML document
            // using base-64 encoding.
            // Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = true;
            f.HtmlOptions.Title = "Simple text";            

            // Read a PDF document to byte array
            // Assume that we already have the  PDF as array of bytes.
            byte[] pdf = File.ReadAllBytes(pdfFile);

            f.OpenPdf(pdf);

            if (f.PageCount > 0)
            {
                // Convert PDF to HTML in memory
                string html = f.ToHtml();

                // Save HTML to the file only for demonstration purpose.
                if (html != null)
                {
                    File.WriteAllText(htmlFile, html);
                    // Open the result for demonstration purposes.
                        System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });

                }
            }
        }
        private static void ConvertPdfStreamToHtml()
        {
            // We need files only for demonstration purposes.
            // The whole conversion process will be done in memory.
            string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");
            string htmlFile = "Result.html";

                                  // Get your free 30-day key here:   
			 // https://sautinsoft.com/start-for-free/
			
            // Convert PDF to HTML in memory
            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();


            // Let's force the component to store images inside HTML document
            // using base-64 encoding.
            // Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = true;
            f.HtmlOptions.Title = "Simple text";

            // Assume that we have a PDF document as Stream.
            using (FileStream fs = File.OpenRead(pdfFile))
            {
                f.OpenPdf(fs);

                if (f.PageCount > 0)
                {
                    // Convert PDF to HTML to a MemoryStream.
                    using (MemoryStream msHtml = new MemoryStream())
                    {
                        int res = f.ToHtml(msHtml);
                        // Open the result for demonstration purposes.
                        if (res == 0)
                        {
                            File.WriteAllBytes(htmlFile, msHtml.ToArray());
                            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
                        }
                    }
                }
            }
        }
    }
}

Download

Imports System
Imports System.IO

Namespace Sample
    Friend Class Sample
        Shared Sub Main(ByVal args() As String)
            ConvertPdfBytesToHtml()
            'ConvertPdfStreamToHtml()
        End Sub

        Private Shared Sub ConvertPdfBytesToHtml()
            ' We need files only for demonstration purposes.
            ' The whole conversion process will be done in memory.
            Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")
            Dim htmlFile As String = "Result.html"
	                                ' Get your free 30-day key here: 
	                                ' https://sautinsoft.com/start-for-free/
		
            ' Convert PDF to HTML in memory
            Dim f As New SautinSoft.PdfFocus()

            ' This property is necessary only for licensed version.
            'f.Serial = "XXXXXXXXXXX"

            ' Let's force the component to store images inside HTML document
            ' using base-64 encoding.
            ' Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = True
            f.HtmlOptions.Title = "Simple text"

            ' Read a PDF document to byte array
            ' Assume that we already have the  PDF as array of bytes.
            Dim pdf() As Byte = File.ReadAllBytes(pdfFile)

            f.OpenPdf(pdf)

            If f.PageCount > 0 Then
                ' Convert PDF to HTML in memory
                Dim html As String = f.ToHtml()

                ' Save HTML to the file only for demonstration purpose.
                If html IsNot Nothing Then
                    File.WriteAllText(htmlFile, html)
                    ' Open the result for demonstration purposes.
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})

                End If
            End If
        End Sub
        Private Shared Sub ConvertPdfStreamToHtml()
            ' We need files only for demonstration purposes.
            ' The whole conversion process will be done in memory.
            Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")
            Dim htmlFile As String = "Result.html"
	                                ' Get your free 30-day key here: 
	                                ' https://sautinsoft.com/start-for-free/
		
            ' Convert PDF to HTML in memory
            Dim f As New SautinSoft.PdfFocus()

            ' Let's force the component to store images inside HTML document
            ' using base-64 encoding.
            ' Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = True
            f.HtmlOptions.Title = "Simple text"

            ' Assume that we have a PDF document as Stream.
            Using fs As FileStream = File.OpenRead(pdfFile)
                f.OpenPdf(fs)

                If f.PageCount > 0 Then
                    ' Convert PDF to HTML to a MemoryStream.
                    Using msHtml As New MemoryStream()
                        Dim res As Integer = f.ToHtml(msHtml)
                        ' Open the result for demonstration purposes.
                        If res = 0 Then
                            File.WriteAllBytes(htmlFile, msHtml.ToArray())
                            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
                        End If
                    End Using
                End If
            End Using
        End Sub
    End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос: напишите нам по адресу [email protected] или спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .Net с 2002 года. Мы знаем форматы PDF, DOCX, RTF, HTML, XLSX и Images. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.