In document processing, especially at scale, extracting specific text from a PDF document can be an important task. Whether the goal is data extraction, form field analysis, or content filtering, the ability to accurately identify and extract text from a specific rectangular area of a PDF file can simplify many workflows. In this article, you will learn how to extract text from a custom rectangle using C# and .NET with the SautinSoft.Pdf library.
Step-by-step guide:
Complete code
using System;
using System.IO;
using SautinSoft;
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
class Program
{
    /// <summary>
    /// Loads a PDF, walks the content elements of its first page and writes to the
    /// console the text of every text element whose bounds fall strictly inside a
    /// fixed rectangular area.
    /// </summary>
    /// <remarks>
    /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/reading-text-from-specific-rectangular-area.php
    /// </remarks>
    static void Main()
    {
        // Before starting this example, please get a free 100-day trial key:
        // https://sautinsoft.com/start-for-free/
        // Apply the key here:
        // PdfDocument.SetLicense("...");

        // Input document, resolved relative to the current working directory.
        string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");

        // Zero-based index of the page to scan.
        int pageIndex = 0;

        // Edges of the rectangle that text elements must lie within.
        double areaLeft = 200;
        double areaRight = 520;
        double areaBottom = 510;
        double areaTop = 720;

        using (var document = PdfDocument.Load(pdfFile))
        {
            var page = document.Pages[pageIndex];

            // Enumerate every content element of the page, starting from the page transform.
            var elements = page.Content.Elements.All(page.Transform).GetEnumerator();
            while (elements.MoveNext())
            {
                // Only text elements are of interest; skip everything else.
                if (elements.Current.ElementType != PdfContentElementType.Text)
                {
                    continue;
                }

                var textElement = (PdfTextContent)elements.Current;
                var bounds = textElement.Bounds;

                // NOTE(review): if Bounds is a value type and Transform does not take it
                // by reference, the transformed result is discarded here - confirm against
                // the SautinSoft.Pdf API.
                elements.Transform.Transform(bounds);

                // Strict comparisons: an element touching an edge is treated as outside.
                bool insideArea =
                    bounds.Left > areaLeft &&
                    bounds.Right < areaRight &&
                    bounds.Bottom > areaBottom &&
                    bounds.Top < areaTop;

                if (insideArea)
                {
                    // Emit the text of the element located in the given area.
                    Console.Write(textElement.ToString());
                }
            }
        }
    }
}
Option Infer On
Imports System
Imports System.IO
Imports SautinSoft
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Friend Class Program
    ''' <summary>
    ''' Loads a PDF, walks the content elements of its first page and writes to the
    ''' console the text of every text element whose bounds fall strictly inside a
    ''' fixed rectangular area.
    ''' </summary>
    ''' <remarks>
    ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/reading-text-from-specific-rectangular-area.php
    ''' </remarks>
    Shared Sub Main()
        ' Before starting this example, please get a free license:
        ' https://sautinsoft.com/start-for-free/
        ' Apply the key here:
        ' PdfDocument.SetLicense("...")

        ' Input document, resolved relative to the current working directory.
        Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")

        ' Zero-based index of the page to scan.
        Dim pageIndex As Integer = 0

        ' Edges of the rectangle that text elements must lie within.
        Dim areaLeft As Double = 200
        Dim areaRight As Double = 520
        Dim areaBottom As Double = 510
        Dim areaTop As Double = 720

        Using document = PdfDocument.Load(pdfFile)
            Dim page = document.Pages(pageIndex)

            ' Enumerate every content element of the page, starting from the page transform.
            Dim elements = page.Content.Elements.All(page.Transform).GetEnumerator()
            Do While elements.MoveNext()
                ' Only text elements are of interest; skip everything else.
                If elements.Current.ElementType <> PdfContentElementType.Text Then
                    Continue Do
                End If

                Dim textElement = CType(elements.Current, PdfTextContent)
                Dim bounds = textElement.Bounds

                ' NOTE(review): if Bounds is a value type and Transform takes it ByVal,
                ' the transformed result is discarded here - confirm against the
                ' SautinSoft.Pdf API.
                elements.Transform.Transform(bounds)

                ' Strict comparisons: an element touching an edge is treated as outside.
                Dim insideArea As Boolean =
                    bounds.Left > areaLeft AndAlso
                    bounds.Right < areaRight AndAlso
                    bounds.Bottom > areaBottom AndAlso
                    bounds.Top < areaTop

                If insideArea Then
                    ' Emit the text of the element located in the given area.
                    Console.Write(textElement.ToString())
                End If
            Loop
        End Using
    End Sub
End Class
If you need a code example or have a question, email us at support@sautinsoft.ru, ask in the online chat (bottom-right corner of this page), or use the form below: