Как запустить полнотекстовый поиск в файлах PDF, DOCX, RTF и HTML с помощью C# и .NET

  1. Добавьте пакет SautinSoft.Document из NuGet.
  2. Загрузите все документы из папки.
  3. Найдите определенный текст в каждом документе.
  4. Подсчитайте все совпадения.

Здесь мы покажем вам, как использовать полнотекстовый поиск в определенном каталоге, включая подкаталоги.
Используя регулярные выражения, мы найдем слово "with" без учета регистра (with, WITH, WiTh и т. д.) во всех файлах (DOCX, RTF, PDF и HTML) внутри указанного каталога и выведем результаты на консоль.

Полный код

using System;
using System.IO;
using System.Collections.Generic;
using SautinSoft.Document;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Text.RegularExpressions;



namespace Sample
{
    class Sample
    {
        /// <summary>
        /// File extensions that are searched; the comparison ignores case,
        /// so "REPORT.PDF" and "report.pdf" are both accepted.
        /// </summary>
        private static readonly HashSet<string> SupportedExtensions =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase) { ".docx", ".pdf", ".html", ".rtf" };

        static void Main(string[] args)
        {
            // Get your free 100-day key here:   
            // https://sautinsoft.com/start-for-free/

            string searchDir = Path.GetFullPath(@"..\..\..\searching\");
            string searchText = "with";
            FullTextSearching(searchDir, searchText);
        }

        /// <summary>
        /// Searches every *.docx, *.rtf, *.pdf and *.html file below <paramref name="searchPath"/>
        /// (subdirectories included) for <paramref name="searchText"/> as a whole word,
        /// ignoring case, and prints the number of matches per file and in total to the console.
        /// </summary>
        /// <param name="searchPath">Directory to search; may be relative and may or may not end with a separator.</param>
        /// <param name="searchText">Literal text to look for. It is escaped, so regex metacharacters are matched verbatim.</param>
        /// <exception cref="ArgumentNullException"><paramref name="searchPath"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="searchText"/> is null or empty.</exception>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/document/help/net/developer-guide/full-text-searching-in-documents-net-csharp-vb.php
        /// </remarks>
        public static void FullTextSearching(string searchPath, string searchText)
        {
            if (searchPath == null)
            {
                throw new ArgumentNullException(nameof(searchPath));
            }

            // An empty pattern would degrade to "\b()\b", which matches at every word boundary.
            if (string.IsNullOrEmpty(searchText))
            {
                throw new ArgumentException("The search text must not be empty.", nameof(searchText));
            }

            DirectoryInfo searchDir = new DirectoryInfo(searchPath);

            // 1. Find the files to search in.
            // Only *.docx, *.rtf, *.pdf and *.html files are taken, including those in subdirectories.
            List<string> supportedFiles = Directory
                .EnumerateFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories)
                .Where(file => SupportedExtensions.Contains(Path.GetExtension(file)))
                .ToList();

            // 2. Search each file and count how many times it contains the search text.
            Console.WriteLine($"The results for \"{searchText}\":");

            // The pattern does not depend on the file, so it is built once.
            // Regex.Escape keeps characters such as '+', '(' or '.' from being read as regex syntax.
            Regex regex = new Regex($"\\b({Regex.Escape(searchText)})\\b", RegexOptions.IgnoreCase);

            // Files are enumerated from searchDir.FullName, so that (not the raw searchPath argument)
            // is the prefix to strip when building the name shown to the user.
            char[] separators = { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };
            string root = searchDir.FullName.TrimEnd(separators);

            int totalFiles = 0, totalMatches = 0;
            foreach (string file in supportedFiles)
            {
                DocumentCore dc = DocumentCore.Load(file);
                totalFiles++;

                // Path relative to the search root: just the file name for files in the root,
                // "subfolder/file" for nested ones.
                string fileName = file.StartsWith(root, StringComparison.Ordinal)
                    ? file.Substring(root.Length).TrimStart(separators)
                    : Path.GetFileName(file);

                int matches = dc.Content.Find(regex).Count();
                totalMatches += matches;

                Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.");
            }
            Console.WriteLine($"\nSearching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.");
            Console.WriteLine("Press any key ...");
            Console.ReadKey();
        }
    }
}

Download

Imports System
Imports System.IO
Imports System.Collections.Generic
Imports SautinSoft.Document
Imports System.Drawing
Imports System.Linq
Imports System.Text.RegularExpressions



Namespace Sample
    Friend Class Sample

        ''' <summary>
        ''' File extensions that are searched; the comparison ignores case,
        ''' so "REPORT.PDF" and "report.pdf" are both accepted.
        ''' </summary>
        Private Shared ReadOnly SupportedExtensions As New HashSet(Of String)(StringComparer.OrdinalIgnoreCase) From {".docx", ".pdf", ".html", ".rtf"}

        Shared Sub Main(ByVal args() As String)
            ' Get your free 100-day key here:
            ' https://sautinsoft.com/start-for-free/

            Dim searchDir As String = Path.GetFullPath("..\..\..\searching\")
            Dim searchText As String = "with"
            FullTextSearching(searchDir, searchText)
        End Sub

        ''' <summary>
        ''' Searches every *.docx, *.rtf, *.pdf and *.html file below <paramref name="searchPath"/>
        ''' (subdirectories included) for <paramref name="searchText"/> as a whole word,
        ''' ignoring case, and prints the number of matches per file and in total to the console.
        ''' </summary>
        ''' <param name="searchPath">Directory to search; may be relative and may or may not end with a separator.</param>
        ''' <param name="searchText">Literal text to look for. It is escaped, so regex metacharacters are matched verbatim.</param>
        ''' <exception cref="ArgumentNullException"><paramref name="searchPath"/> is Nothing.</exception>
        ''' <exception cref="ArgumentException"><paramref name="searchText"/> is Nothing or empty.</exception>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/document/help/net/developer-guide/full-text-searching-in-documents-net-csharp-vb.php
        ''' </remarks>
        Public Shared Sub FullTextSearching(ByVal searchPath As String, ByVal searchText As String)
            If searchPath Is Nothing Then
                Throw New ArgumentNullException(NameOf(searchPath))
            End If

            ' An empty pattern would degrade to "\b()\b", which matches at every word boundary.
            If String.IsNullOrEmpty(searchText) Then
                Throw New ArgumentException("The search text must not be empty.", NameOf(searchText))
            End If

            Dim searchDir As New DirectoryInfo(searchPath)

            ' 1. Find the files to search in.
            ' Only *.docx, *.rtf, *.pdf and *.html files are taken, including those in subdirectories.
            Dim supportedFiles As List(Of String) = Directory.
                EnumerateFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories).
                Where(Function(candidate) SupportedExtensions.Contains(Path.GetExtension(candidate))).
                ToList()

            ' 2. Search each file and count how many times it contains the search text.
            Console.WriteLine($"The results for ""{searchText}"":")

            ' The pattern does not depend on the file, so it is built once.
            ' Regex.Escape keeps characters such as '+', '(' or '.' from being read as regex syntax.
            Dim pattern As New Regex($"\b({Regex.Escape(searchText)})\b", RegexOptions.IgnoreCase)

            ' Files are enumerated from searchDir.FullName, so that (not the raw searchPath argument)
            ' is the prefix to strip when building the name shown to the user.
            Dim separators As Char() = {Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar}
            Dim root As String = searchDir.FullName.TrimEnd(separators)

            Dim totalFiles As Integer = 0, totalMatches As Integer = 0
            For Each file As String In supportedFiles
                Dim dc As DocumentCore = DocumentCore.Load(file)
                totalFiles += 1

                ' Path relative to the search root: just the file name for files in the root,
                ' "subfolder/file" for nested ones.
                Dim fileName As String = If(file.StartsWith(root, StringComparison.Ordinal),
                                            file.Substring(root.Length).TrimStart(separators),
                                            Path.GetFileName(file))

                Dim matches As Integer = dc.Content.Find(pattern).Count()
                totalMatches += matches

                Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.")
            Next file

            ' Blank line before the summary, matching the C# version of this sample.
            Console.WriteLine()
            Console.WriteLine($"Searching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.")
            Console.WriteLine("Press any key ...")
            Console.ReadKey()
        End Sub
    End Class
End Namespace

Download


Если вам нужен пример кода или у вас есть вопрос, напишите нам по адресу support@sautinsoft.com, спросите в онлайн-чате (правый нижний угол этой страницы) или воспользуйтесь формой ниже:



Вопросы и предложения всегда приветствуются!

Мы разрабатываем компоненты .NET с 2002 года. Мы хорошо знаем форматы PDF, DOCX, RTF, HTML, XLSX и форматы изображений. Если вам нужна помощь в создании, изменении или преобразовании документов в различных форматах, мы можем вам помочь. Мы напишем для вас любой пример кода абсолютно бесплатно.