Extracting text from PDF documents is an important task for many applications, whether it's data analysis, business process automation, or reporting. In this article, we will look at how to extract text from a PDF document along specified boundaries (coordinates) using C# and .NET.
Step-by-step guide:
Complete code
using System;
using System.IO;
using SautinSoft;
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
class Program
{
    /// <summary>
    /// How to extract text by given bounds.
    /// </summary>
    /// <remarks>
    /// Loads "extract-text.pdf" (resolved three directory levels above the current
    /// directory), extracts the text of the first page that falls inside a rectangle
    /// near the top of the page's crop box, and prints the text together with the
    /// bounds reported for it. Waits for a key press before exiting.
    /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/extract-text-from-pdf-by-given-bounds.php
    /// </remarks>
    static void Main()
    {
        // Before starting this example, please get a free 100-day trial key:
        // https://sautinsoft.com/start-for-free/
        // Apply the key here:
        // PdfDocument.SetLicense("...");

        // Offsets of the extraction rectangle, measured from the crop box edges.
        const int LeftOffset = 20;
        const int TopOffset = 20;
        const int BottomOffset = 120;

        // Build the relative path with Path.Combine so the platform's own directory
        // separator is used; a hard-coded '\' is not a separator outside Windows.
        string inpFile = Path.GetFullPath(Path.Combine("..", "..", "..", "extract-text.pdf"));

        using (var document = PdfDocument.Load(inpFile))
        {
            // Get the page from which we want to make the extraction.
            var page = document.Pages[0];

            // NOTE: In PDF, location (0, 0) is at the bottom-left corner of the page
            // and the positive y axis extends vertically upward.
            var pageBounds = page.CropBox;

            // The rectangle spans from LeftOffset to the right edge of the crop box,
            // between TopOffset and BottomOffset units below its top edge.
            // Corners are listed top-left, top-right, bottom-right, bottom-left.
            var extractionArea = new PdfQuad(
                new PdfPoint(LeftOffset, pageBounds.Top - TopOffset),
                new PdfPoint(pageBounds.Right, pageBounds.Top - TopOffset),
                new PdfPoint(pageBounds.Right, pageBounds.Top - BottomOffset),
                new PdfPoint(LeftOffset, pageBounds.Top - BottomOffset));

            // Extract text content from the given bounds.
            var text = page.Content.GetText(new PdfTextOptions
            {
                Bounds = extractionArea,
                Order = PdfTextOrder.Reading
            });

            // Write the extracted text and the bounds reported for it.
            Console.WriteLine($"Result: {text}");
            Console.WriteLine($"Text position: " +
                $"(X: {text.Bounds.Left:0.##}, " +
                $"Y: {text.Bounds.Bottom:0.##}), " +
                $"Width: {text.Bounds.Width:0.##}, " +
                $"Height: {text.Bounds.Height:0.##}.");

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
    }
}
Option Infer On
Imports System
Imports System.IO
Imports SautinSoft
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Friend Class Program
    ''' <summary>
    ''' How to extract text by given bounds.
    ''' </summary>
    ''' <remarks>
    ''' Loads "extract-text.pdf" (resolved three directory levels above the current
    ''' directory), extracts the text of the first page that falls inside a rectangle
    ''' near the top of the page's crop box, and prints the text together with the
    ''' bounds reported for it. Waits for a key press before exiting.
    ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/extract-text-from-pdf-by-given-bounds.php
    ''' </remarks>
    Shared Sub Main()
        ' Before starting this example, please get a free license:
        ' https://sautinsoft.com/start-for-free/
        ' Apply the key here:
        ' PdfDocument.SetLicense("...")

        ' Offsets of the extraction rectangle, measured from the crop box edges.
        Const LeftOffset As Integer = 20
        Const TopOffset As Integer = 20
        Const BottomOffset As Integer = 120

        ' Build the relative path with Path.Combine so the platform's own directory
        ' separator is used; a hard-coded "\" is not a separator outside Windows.
        Dim inpFile As String = Path.GetFullPath(Path.Combine("..", "..", "..", "extract-text.pdf"))

        Using document = PdfDocument.Load(inpFile)
            ' Get the page from which we want to make the extraction.
            Dim page = document.Pages(0)

            ' NOTE: In PDF, location (0, 0) is at the bottom-left corner of the page
            ' and the positive y axis extends vertically upward.
            Dim pageBounds = page.CropBox

            ' The rectangle spans from LeftOffset to the right edge of the crop box,
            ' between TopOffset and BottomOffset units below its top edge.
            ' Corners are listed top-left, top-right, bottom-right, bottom-left.
            Dim extractionArea = New PdfQuad(
                New PdfPoint(LeftOffset, pageBounds.Top - TopOffset),
                New PdfPoint(pageBounds.Right, pageBounds.Top - TopOffset),
                New PdfPoint(pageBounds.Right, pageBounds.Top - BottomOffset),
                New PdfPoint(LeftOffset, pageBounds.Top - BottomOffset))

            ' Extract text content from the given bounds.
            Dim text = page.Content.GetText(New PdfTextOptions With {
                .Bounds = extractionArea,
                .Order = PdfTextOrder.Reading
            })

            ' Write the extracted text and the bounds reported for it.
            Console.WriteLine($"Result: {text}")
            Console.WriteLine($"Text position: " &
                $"(X: {text.Bounds.Left:0.##}, " &
                $"Y: {text.Bounds.Bottom:0.##}), " &
                $"Width: {text.Bounds.Width:0.##}, " &
                $"Height: {text.Bounds.Height:0.##}.")

            ' Keep the console window open until a key is pressed.
            Console.ReadKey()
        End Using
    End Sub
End Class
If you need a code sample or have a question, email us at support@sautinsoft.ru, ask in the online chat (bottom-right corner of this page), or use the form below: