Как преобразовать несколько PDF-файлов в HTML на C# и .NET
Полный код
using System;
using System.IO;
using System.Linq;
using System.Text;
using SautinSoft;
namespace Sample
{
class Sample
{
static void Main(string[] args)
{
// Before starting, we recommend to get a free 100-day key:
// https://sautinsoft.com/start-for-free/
// Apply the key here:
// SautinSoft.PdfFocus.SetLicense("...");
ConvertMultiplePdfToHtmls();
//ConvertMultiplePdfToSingleHtml();
}
/// <summary>
/// Converts multiple PDF files to HTML files.
/// </summary>
static void ConvertMultiplePdfToHtmls()
{
// Directory with *.pdf files.
string pdfDirectory = Path.GetFullPath(@"..\..\..\");
string[] pdfFiles = Directory.GetFiles(pdfDirectory, "*.pdf");
DirectoryInfo htmlDirectory = new DirectoryInfo(@"htmls");
if (!htmlDirectory.Exists)
htmlDirectory.Create();
PdfFocus f = new PdfFocus();
int success = 0;
int total = 0;
foreach (string pdfFile in pdfFiles)
{
Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile));
f.OpenPdf(pdfFile);
total++;
if (f.PageCount > 0)
{
// Path (must exist) to a directory to store images after converting. Notice also to the property "ImageSubFolder".
f.HtmlOptions.ImageFolder = htmlDirectory.FullName;
// A folder (will be created by the component) without any drive letters, only the folder as "myfolder".
f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile));
// A template name for images
f.HtmlOptions.ImageFileName = "picture";
// Auto - the same image format as in the source PDF;
// 'Jpeg' to make the document size less;
// 'PNG' to keep the highest quality, but the highest size too.
f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto;
// How to store images: Inside HTML document as base64 images or as linked separate image files.
f.HtmlOptions.IncludeImageInHtml = false;
string htmlFile = Path.GetFileNameWithoutExtension(pdfFile) + ".html";
string htmlFilePath = Path.Combine(htmlDirectory.FullName, htmlFile);
if (f.ToHtml(htmlFilePath) == 0)
{
success++;
}
}
}
// Show results:
Console.WriteLine("{0} of {1} files converted successfully!", success, total);
// Open folder with HTML files after converting.
// Open the result for demonstration purposes.
System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlDirectory.FullName) { UseShellExecute = true });
}
/// <summary>
/// Converts multiple PDF files into a single HTML document.
/// </summary>
static void ConvertMultiplePdfToSingleHtml()
{
// Directory with *.pdf files.
string pdfDirectory = Path.GetFullPath(@"..\..\..\");
string htmlFile = "Result.html";
string[] pdfFiles = Directory.GetFiles(pdfDirectory, "*.pdf");
// Here we'll keep our Html document.
StringBuilder singleHtml = new StringBuilder();
singleHtml.Append("<html>\r\n<head>\r\n");
singleHtml.Append(@"<meta http-equiv = ""Content-Type"" content=""text/html; charset=utf-8"" />");
singleHtml.Append("\r\n</head>\r\n<body>");
PdfFocus f = new PdfFocus();
int success = 0;
int total = 0;
foreach (string pdfFile in pdfFiles)
{
Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile));
f.OpenPdf(pdfFile);
total++;
if (f.PageCount > 0)
{
// How to store images: Inside HTML document as base64 images or as linked separate image files.
f.HtmlOptions.IncludeImageInHtml = false;
// Create own subfolder for each converted file to store images separately and don't mix up them.
f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile));
// A template name for images
f.HtmlOptions.ImageFileName = "picture";
// Auto - the same image format as in the source PDF;
// 'Jpeg' to make the document size less;
// 'PNG' to keep the highest quality, but the highest size too.
f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto;
// Let's make our CSS inline to be able merge HTML documents without any problems.
f.HtmlOptions.InlineCSS = true;
// We need only contents of <body>...</body>.
f.HtmlOptions.ProduceOnlyHtmlBody = true;
string tempHtml = f.ToHtml();
if (!String.IsNullOrEmpty(tempHtml))
{
success++;
// Add tempHtml into a single HTML.
singleHtml.Append(tempHtml);
}
}
}
singleHtml.Append("</body></html>");
// Show results:
File.WriteAllText(htmlFile, singleHtml.ToString());
Console.WriteLine("{0} of {1} files converted and merged into {2}!", success, total, Path.GetFileName(htmlFile));
// Open the result for demonstration purposes.
System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
}
}
}
Imports Microsoft.VisualBasic
Imports System
Imports System.IO
Imports System.Linq
Imports System.Text
Imports SautinSoft
Namespace Sample
Friend Class Sample
Shared Sub Main(ByVal args() As String)
' Before starting, we recommend to get a free 100-day key:
' https://sautinsoft.com/start-for-free/
' Apply the key here
' SautinSoft.PdfFocus.SetLicense("...");
'ConvertMultiplePdfToHtmls()
ConvertMultiplePdfToSingleHtml()
End Sub
''' <summary>
''' Converts multiple PDF files to HTML files.
''' </summary>
Private Shared Sub ConvertMultiplePdfToHtmls()
' Directory with *.pdf files.
Dim pdfDirectory As String = Path.GetFullPath("..\..\..\")
Dim pdfFiles() As String = Directory.GetFiles(pdfDirectory, "*.pdf")
Dim htmlDirectory As New DirectoryInfo("htmls")
If Not htmlDirectory.Exists Then
htmlDirectory.Create()
End If
Dim f As New PdfFocus()
Dim success As Integer = 0
Dim total As Integer = 0
For Each pdfFile As String In pdfFiles
Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile))
f.OpenPdf(pdfFile)
total += 1
If f.PageCount > 0 Then
' Path (must exist) to a directory to store images after converting. Notice also to the property "ImageSubFolder".
f.HtmlOptions.ImageFolder = htmlDirectory.FullName
' A folder (will be created by the component) without any drive letters, only the folder as "myfolder".
f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile))
' A template name for images
f.HtmlOptions.ImageFileName = "picture"
' Auto - the same image format as in the source PDF;
' 'Jpeg' to make the document size less;
' 'PNG' to keep the highest quality, but the highest size too.
f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto
' How to store images: Inside HTML document as base64 images or as linked separate image files.
f.HtmlOptions.IncludeImageInHtml = False
Dim htmlFile As String = Path.GetFileNameWithoutExtension(pdfFile) & ".html"
Dim htmlFilePath As String = Path.Combine(htmlDirectory.FullName, htmlFile)
If f.ToHtml(htmlFilePath) = 0 Then
success += 1
End If
End If
Next pdfFile
' Show results:
Console.WriteLine("{0} of {1} files converted successfully!", success, total)
' Open folder with HTML files after converting.
' Open the result for demonstration purposes.
System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlDirectory.FullName) With {.UseShellExecute = True})
End Sub
''' <summary>
''' Converts multiple PDF files into a single HTML document.
''' </summary>
Private Shared Sub ConvertMultiplePdfToSingleHtml()
' Directory with *.pdf files.
Dim pdfDirectory As String = Path.GetFullPath("..\")
Dim htmlFile As String = "Result.html"
Dim pdfFiles() As String = Directory.GetFiles(pdfDirectory, "*.pdf")
' Here we'll keep our Html document.
Dim singleHtml As New StringBuilder()
singleHtml.Append("<html>" & vbCrLf & "<head>" & vbCrLf)
singleHtml.Append("<meta http-equiv = ""Content-Type"" content=""text/html; charset=utf-8"" />")
singleHtml.Append(vbCrLf & "</head>" & vbCrLf & "<body>")
Dim f As New PdfFocus()
Dim success As Integer = 0
Dim total As Integer = 0
For Each pdfFile As String In pdfFiles
Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile))
f.OpenPdf(pdfFile)
total += 1
If f.PageCount > 0 Then
' How to store images: Inside HTML document as base64 images or as linked separate image files.
f.HtmlOptions.IncludeImageInHtml = False
' Create own subfolder for each converted file to store images separately and don't mix up them.
f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile))
' A template name for images
f.HtmlOptions.ImageFileName = "picture"
' Auto - the same image format as in the source PDF;
' 'Jpeg' to make the document size less;
' 'PNG' to keep the highest quality, but the highest size too.
f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto
' Let's make our CSS inline to be able merge HTML documents without any problems.
f.HtmlOptions.InlineCSS = True
' We need only contents of <body>...</body>.
f.HtmlOptions.ProduceOnlyHtmlBody = True
Dim tempHtml As String = f.ToHtml()
If Not String.IsNullOrEmpty(tempHtml) Then
success += 1
' Add tempHtml into a single HTML.
singleHtml.Append(tempHtml)
End If
End If
Next pdfFile
singleHtml.Append("</body></html>")
' Show results:
File.WriteAllText(htmlFile, singleHtml.ToString())
Console.WriteLine("{0} of {1} files converted and merged into {2}!", success, total, Path.GetFileName(htmlFile))
' Open the result for demonstration purposes.
System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
End Sub
End Class
End Namespace
Если вам нужен пример кода или у вас есть вопрос: напишите нам по адресу support@sautinsoft.ru или спросите в онлайн-чате (правый нижний угол этой страницы) или используйте форму ниже: