Преобразование PDF в HTML в памяти с получением списка изображений на C# и .NET


Полный код

using System;
using System.IO;
using System.Collections.Generic;
using System.Drawing;

namespace Sample
{
    class Sample
    {
        static void Main(string[] args)
        {
            ConvertPdfBytesToHtml();
        }

        private static void ConvertPdfBytesToHtml()
        {
            // We need files only for demonstration purposes.
            // The whole conversion process will be done in memory.
            string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");
            string htmlFile = "Result.htm";

            // This is the list with extracted images.
            // It will be filled by images after the conversion.
            List<Image> imgCollection = new List<Image>();
			
                                  // Get your free 30-day key here:   
			 // https://sautinsoft.com/start-for-free/
			
            // Convert PDF to HTML in memory
            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();


            // Let's force the component to store images inside HTML document
            // using base-64 encoding.
            // Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = true;
            f.HtmlOptions.Title = "Simple text";
            

            // Read a PDF document to byte array.
            // Assume that we already have the  PDF as array of bytes.
            byte[] pdf = File.ReadAllBytes(pdfFile);

            f.OpenPdf(pdf);

            if (f.PageCount > 0)
            {
                // Convert PDF to HTML in memory
                string htmlString = f.ToHtml(1, f.PageCount, imgCollection);

                // Save HTML to a file only for the demonstration purpose.
                if (htmlString != null)
                {
                    // Show info about images and save them
                    Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count);
                    DirectoryInfo imgDir = new DirectoryInfo("Extracted Images");
                    if (!imgDir.Exists)
                        imgDir.Create();

                    int count = 1;
                    foreach (Image img in imgCollection)
                    {
                        Console.WriteLine("\t {0,4} x {1,4} px", img.Width, img.Height);
                        string imageFileName = Path.Combine(imgDir.FullName, String.Format($"pict{count}.jpg"));
                        img.Save(imageFileName, System.Drawing.Imaging.ImageFormat.Jpeg);
                        count++;
                    }
                    // Open the result for demonstration purposes.
                    File.WriteAllText(htmlFile, htmlString);
                    System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
                    System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(imgDir.FullName) { UseShellExecute = true });
                }
            }
        }
    }
}

Download

Imports Microsoft.VisualBasic
Imports System
Imports System.IO
Imports System.Collections.Generic
Imports System.Drawing

Namespace Sample
    Friend Class Sample
        Shared Sub Main(ByVal args() As String)
            ConvertPdfBytesToHtml()
        End Sub

        Private Shared Sub ConvertPdfBytesToHtml()
            ' We need files only for demonstration purposes.
            ' The whole conversion process will be done in memory.
            Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")
            Dim htmlFile As String = "Result.htm"

            ' This is the list with extracted images.
            ' It will be filled by images after the conversion.
            Dim imgCollection As New List(Of Image)()
	                                ' Get your free 30-day key here: 
	                                ' https://sautinsoft.com/start-for-free/
		
            ' Convert PDF to HTML in memory
            Dim f As New SautinSoft.PdfFocus()
            ' Let's force the component to store images inside HTML document
            ' using base-64 encoding.
            ' Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = True
            f.HtmlOptions.Title = "Simple text"

            ' Read a PDF document to byte array.
            ' Assume that we already have the  PDF as array of bytes.
            Dim pdf() As Byte = File.ReadAllBytes(pdfFile)

            f.OpenPdf(pdf)

            If f.PageCount > 0 Then
                ' Convert PDF to HTML in memory
                Dim htmlString As String = f.ToHtml(1, f.PageCount, imgCollection)

                ' Save HTML to a file only for the demonstration purpose.
                If htmlString IsNot Nothing Then
                    ' Show info about images and save them
                    Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count)
                    Dim imgDir As New DirectoryInfo("Extracted Images")
                    If Not imgDir.Exists Then
                        imgDir.Create()
                    End If

                    Dim count As Integer = 1
                    For Each img As Image In imgCollection
                        Console.WriteLine(vbTab & " {0,4} x {1,4} px", img.Width, img.Height)
                        Dim imageFileName As String = Path.Combine(imgDir.FullName, String.Format($"pict{count}.jpg"))
                        img.Save(imageFileName, System.Drawing.Imaging.ImageFormat.Jpeg)
                        count += 1
                    Next img
                    ' Open the result for demonstration purposes.
                    File.WriteAllText(htmlFile, htmlString)
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(imgDir.FullName) With {.UseShellExecute = True})
                End If
            End If
        End Sub
    End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос: напишите нам по адресу [email protected] или спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .Net с 2002 года. Мы знаем форматы PDF, DOCX, RTF, HTML, XLSX и Images. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.