Как захватить текст и изображения из существующего PDF, DOCX или любого другого документа по заданным координатам (x, y) на C# и .NET

  1. Добавьте SautinSoft.Document из NuGet.
  2. Загрузите PDF-документ.
  3. Выполните итерацию по страницам документа.
  4. Захватите элементы по (X,Y) и (ширина, высота).

Полный код

using System;
using System.IO;
using System.Linq;
using SautinSoft.Document;
using SautinSoft.Document.Drawing;
using SautinSoft.Document.Tables;

namespace Example
{
    class Program
    {
        static void Main(string[] args)
        {
            // Get your free 100-day key here:
            // https://sautinsoft.com/start-for-free/

            CaptureTextZoneByXY();
        }

        /// <summary>
        /// Loads the first page of a PDF, picks the paragraphs and tables whose
        /// top-left corner lies inside a rectangular zone given in millimetres,
        /// copies them into a new document, saves it as "Result.docx" and opens it.
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/document/help/net/developer-guide/capture-text-images-in-pdf-docx-document-by-specific-zone-x-y-coordinates-net-csharp-vb.php
        /// </remarks>
        static void CaptureTextZoneByXY()
        {
            // Capture zone, in millimetres:
            // Left-Top: (0, 50) mm,
            var mmXY = (0f, 50f);
            // Width: 250 mm, Height: 150 mm.
            var mmWH = (250f, 150f);

            // Zero-based page indexes to load, e.g. page 1 has index 0.
            int[] pageCollection = new int[1] { 0 };

            // Convert the zone from millimetres to points; the frame bounds below
            // are compared against these values.
            double leftX = LengthUnitConverter.Convert(mmXY.Item1, LengthUnit.Millimeter, LengthUnit.Point);
            double topY = LengthUnitConverter.Convert(mmXY.Item2, LengthUnit.Millimeter, LengthUnit.Point);
            double width = LengthUnitConverter.Convert(mmWH.Item1, LengthUnit.Millimeter, LengthUnit.Point);
            double height = LengthUnitConverter.Convert(mmWH.Item2, LengthUnit.Millimeter, LengthUnit.Point);

            // The zone edges do not change per element, so compute them once.
            double rightX = leftX + width;
            double bottomY = topY + height;

            string inpFile = Path.GetFullPath(@"..\..\..\Potato Beetle.pdf");
            string outFile = "Result.docx";

            // 1. Load an existing document, load only specific pages.
            PdfLoadOptions opt = new PdfLoadOptions()
            {
                SelectedPages = pageCollection,
                DetectTables = true,
                ConversionMode = PdfConversionMode.Flowing,
            };
            DocumentCore dc = DocumentCore.Load(inpFile, opt);

            // 2. Create a new document to store the captured data.
            DocumentCore dcCaptured = new DocumentCore();
            // One import session is shared by every Import call below.
            ImportSession session = new ImportSession(dc, dcCaptured, StyleImportingMode.KeepSourceFormatting);

            // 3. Iterate through the document pages
            // and capture elements by (X,Y) and (width, height).
            var paginator = dc.GetPaginator(new PaginatorOptions() { UpdateFields = true });
            int pageIndex = 0;
            foreach (var page in paginator.Pages)
            {
                // Created lazily, so a page without captured elements adds no section.
                Section importedSection = null;

                foreach (var elementFrame in page.GetElementFrames(ElementType.Paragraph, ElementType.Table))
                {
                    // Only the top-left corner of the frame is tested against the zone.
                    var bounds = elementFrame.Bounds;
                    bool startsInsideZone =
                        bounds.Left >= leftX && bounds.Left <= rightX &&
                        bounds.Top >= topY && bounds.Top <= bottomY;
                    if (!startsInsideZone)
                    {
                        continue;
                    }

                    if (importedSection == null)
                    {
                        // The page index is used as a section index, which presumes one
                        // section per page. Clamp it so that a section spanning several
                        // pages does not push the index past the end of dc.Sections.
                        // NOTE(review): the clamped section is only an approximation of the
                        // section that really owns the page - confirm if page setup matters.
                        int sectionIndex = Math.Min(pageIndex, dc.Sections.Count - 1);

                        // 'false' is passed as the second Import argument here, while the
                        // paragraphs and tables below are imported with 'true'.
                        importedSection = dcCaptured.Import<Section>(dc.Sections[sectionIndex], false, session);
                        dcCaptured.Sections.Add(importedSection);
                    }

                    switch (elementFrame.Element)
                    {
                        case Paragraph par:
                            importedSection.Blocks.Add(dcCaptured.Import<Paragraph>(par, true, session));
                            break;

                        case Table table:
                            importedSection.Blocks.Add(dcCaptured.Import<Table>(table, true, session));
                            break;
                    }
                }
                pageIndex++;
            }

            dcCaptured.Save(outFile);
            // Open the result with the shell's default application for .docx files.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }
    }
}

Download

Imports System
Imports System.IO
Imports System.Linq
Imports SautinSoft.Document
Imports SautinSoft.Document.Drawing
Imports SautinSoft.Document.Tables

Namespace Example
	Friend Class Program
		Shared Sub Main(ByVal args() As String)
			' Get your free 100-day key here:
			' https://sautinsoft.com/start-for-free/

			CaptureTextZoneByXY()
		End Sub

		''' <summary>
		''' Loads the first page of a PDF, picks the paragraphs and tables whose
		''' top-left corner lies inside a rectangular zone given in millimetres,
		''' copies them into a new document, saves it as "Result.docx" and opens it.
		''' </summary>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/document/help/net/developer-guide/capture-text-images-in-pdf-docx-document-by-specific-zone-x-y-coordinates-net-csharp-vb.php
		''' </remarks>
		Private Shared Sub CaptureTextZoneByXY()
			' Capture zone, in millimetres:
			' Left-Top: (0, 50) mm,
			Dim mmXY = (0F, 50.0F)
			' Width: 250 mm, Height: 150 mm.
			Dim mmWH = (250.0F, 150.0F)

			' Zero-based page indexes to load, e.g. page 1 has index 0.
			Dim pageCollection() As Integer = {0}

			' Convert the zone from millimetres to points; the frame bounds below
			' are compared against these values.
			Dim leftX As Double = LengthUnitConverter.Convert(mmXY.Item1, LengthUnit.Millimeter, LengthUnit.Point)
			Dim topY As Double = LengthUnitConverter.Convert(mmXY.Item2, LengthUnit.Millimeter, LengthUnit.Point)
			Dim width As Double = LengthUnitConverter.Convert(mmWH.Item1, LengthUnit.Millimeter, LengthUnit.Point)
			Dim height As Double = LengthUnitConverter.Convert(mmWH.Item2, LengthUnit.Millimeter, LengthUnit.Point)

			' The zone edges do not change per element, so compute them once.
			Dim rightX As Double = leftX + width
			Dim bottomY As Double = topY + height

			Dim inpFile As String = Path.GetFullPath("..\..\..\Potato Beetle.pdf")
			Dim outFile As String = "Result.docx"

			' 1. Load an existing document, load only specific pages.
			Dim opt As New PdfLoadOptions() With {
				.SelectedPages = pageCollection,
				.DetectTables = True,
				.ConversionMode = PdfConversionMode.Flowing
			}
			Dim dc As DocumentCore = DocumentCore.Load(inpFile, opt)

			' 2. Create a new document to store the captured data.
			Dim dcCaptured As New DocumentCore()
			' One import session is shared by every Import call below.
			Dim session As New ImportSession(dc, dcCaptured, StyleImportingMode.KeepSourceFormatting)

			' 3. Iterate through the document pages
			' and capture elements by (X,Y) and (width, height).
			Dim paginator = dc.GetPaginator(New PaginatorOptions() With {.UpdateFields = True})
			Dim pageIndex As Integer = 0
			For Each page In paginator.Pages
				' Created lazily, so a page without captured elements adds no section.
				Dim importedSection As Section = Nothing

				For Each elementFrame In page.GetElementFrames(ElementType.Paragraph, ElementType.Table)
					' Only the top-left corner of the frame is tested against the zone.
					Dim bounds = elementFrame.Bounds
					Dim startsInsideZone As Boolean =
						bounds.Left >= leftX AndAlso bounds.Left <= rightX AndAlso
						bounds.Top >= topY AndAlso bounds.Top <= bottomY
					If Not startsInsideZone Then
						Continue For
					End If

					If importedSection Is Nothing Then
						' The page index is used as a section index, which presumes one
						' section per page. Clamp it so that a section spanning several
						' pages does not push the index past the end of dc.Sections.
						' NOTE(review): the clamped section is only an approximation of the
						' section that really owns the page - confirm if page setup matters.
						Dim sectionIndex As Integer = Math.Min(pageIndex, dc.Sections.Count - 1)

						' False is passed as the second Import argument here, while the
						' paragraphs and tables below are imported with True.
						importedSection = dcCaptured.Import(Of Section)(dc.Sections(sectionIndex), False, session)
						dcCaptured.Sections.Add(importedSection)
					End If

					If TypeOf elementFrame.Element Is Paragraph Then
						Dim par As Paragraph = DirectCast(elementFrame.Element, Paragraph)
						importedSection.Blocks.Add(dcCaptured.Import(Of Paragraph)(par, True, session))
					ElseIf TypeOf elementFrame.Element Is Table Then
						Dim table As Table = DirectCast(elementFrame.Element, Table)
						importedSection.Blocks.Add(dcCaptured.Import(Of Table)(table, True, session))
					End If
				Next elementFrame
				pageIndex += 1
			Next page

			dcCaptured.Save(outFile)
			' Open the result with the shell's default application for .docx files.
			System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
		End Sub
	End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос, напишите нам по адресу support@sautinsoft.com, задайте вопрос в онлайн-чате (в правом нижнем углу этой страницы) или воспользуйтесь формой ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .NET с 2002 года. Мы хорошо знаем форматы PDF, DOCX, RTF, HTML, XLSX и форматы изображений. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.