To extract text line by line from a PDF document, use the ExtractText() method without passing any parameters. However, this method can also take a SimpleTextExtractionStrategy object as an argument, which keeps track of the current Y position of each string and inserts a line break into the output whenever the Y position changes.
Step 1: Load a sample PDF document into a PdfDocument object.
// Create an empty PdfDocument, then load "sample.pdf" into it.
PdfDocument doc = new PdfDocument(); doc.LoadFromFile("sample.pdf");
Step 2: Get the first page.
// Index 0 of the Pages collection is the document's first page.
PdfPageBase page = doc.Pages[0];
Step 3: Create an instance of the SimpleTextExtractionStrategy class.
// Default-constructed strategy; it is handed to ExtractText() in the next step.
SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
Step 4: Extract text from the given PDF page using the SimpleTextExtractionStrategy object.
// Extract the page's text; the strategy argument controls where line breaks are inserted.
string text = page.ExtractText(strategy);
Step 5: Write text to a TXT file.
// The using blocks dispose the writer and the stream when the block exits,
// which flushes the buffered text to result.txt and releases the file handle
// even if Write throws.
using (FileStream fs = new FileStream("result.txt", FileMode.Create))
using (StreamWriter sw = new StreamWriter(fs))
{
    sw.Write(text);
}
Output:
Full Code:
[C#]
using Spire.Pdf;
using Spire.Pdf.Exporting.Text;
using System.IO;

namespace ExtractText
{
    /// <summary>
    /// Console sample that extracts the text of the first page of
    /// "sample.pdf" and writes it to "result.txt".
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads the PDF, extracts the first page's text with a
        /// <see cref="SimpleTextExtractionStrategy"/>, and saves it to a text file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Step 1: load the source document.
            PdfDocument doc = new PdfDocument();
            doc.LoadFromFile("sample.pdf");

            // Step 2: take the first page (index 0).
            PdfPageBase page = doc.Pages[0];

            // Steps 3-4: extract the text using the simple strategy.
            SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string text = page.ExtractText(strategy);

            // Step 5: write the text out. The using blocks dispose the writer and
            // the stream on exit, flushing the output and releasing the file
            // handle even if Write throws.
            using (FileStream fs = new FileStream("result.txt", FileMode.Create))
            using (StreamWriter sw = new StreamWriter(fs))
            {
                sw.Write(text);
            }
        }
    }
}
[VB.NET]
Imports Spire.Pdf
Imports Spire.Pdf.Exporting.Text
Imports System.IO

Namespace ExtractText
    ''' <summary>
    ''' Console sample that extracts the text of the first page of
    ''' "sample.pdf" and writes it to "result.txt".
    ''' </summary>
    Class Program
        ''' <summary>
        ''' Loads the PDF, extracts the first page's text with a
        ''' SimpleTextExtractionStrategy, and saves it to a text file.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Private Shared Sub Main(args As String())
            ' Step 1: load the source document.
            Dim doc As New PdfDocument()
            doc.LoadFromFile("sample.pdf")

            ' Step 2: take the first page (index 0).
            Dim page As PdfPageBase = doc.Pages(0)

            ' Steps 3-4: extract the text using the simple strategy.
            Dim strategy As New SimpleTextExtractionStrategy()
            Dim text As String = page.ExtractText(strategy)

            ' Step 5: write the text out. The Using blocks dispose the writer and
            ' the stream on exit, flushing the output and releasing the file
            ' handle even if Write throws.
            Using fs As New FileStream("result.txt", FileMode.Create)
                Using sw As New StreamWriter(fs)
                    sw.Write(text)
                End Using
            End Using
        End Sub
    End Class
End Namespace