I am having trouble feeding a properly formatted PDF byte stream to Spire.Pdf: the error I get says the format is invalid.
What I am trying to do is accept a byte stream and load the PDF, then extract the text from the first page.
Here is my current code. Any advice on my goal would be appreciated!
- Code: Select all
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Web;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.Functions.Worker;
using Microsoft.Extensions.Logging;
using Microsoft.Identity.Client.Extensions.Msal;
using Spire.Pdf; // https://medium.com/@alexaae9/how-to-extract-text-from-pdf-in-c-developer-guide-a0633699bac2#9502
using Spire.Pdf.Texts;
namespace Company.Function;
/// <summary>
/// HTTP-triggered function that reads the request body as raw PDF bytes and
/// responds with the text extracted from the first page of that PDF.
/// </summary>
public class getPDFText
{
    private readonly ILogger<getPDFText> _logger;

    /// <summary>Stores the injected logger.</summary>
    /// <param name="logger">Logger supplied by the host's dependency injection.</param>
    public getPDFText(ILogger<getPDFText> logger)
    {
        _logger = logger;
    }

    /// <summary>
    /// Buffers the request body, treats it as a PDF document and returns the
    /// text of its first page.
    /// </summary>
    /// <param name="req">Incoming HTTP request whose body carries the PDF bytes.</param>
    /// <returns>
    /// 200 with the extracted text; 400 when the body is empty; 409 with the
    /// exception details when reading or parsing fails.
    /// </returns>
    [Function("getPDFText")]
    public static async Task<IActionResult> Run([HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequest req)
    {
        using var ms = new MemoryStream();
        try
        {
            await req.Body.CopyToAsync(ms);

            // ToArray() copies the whole buffer regardless of Position, so no rewind is needed.
            byte[] fileBytes = ms.ToArray();

            // An empty body (e.g. a plain GET) can never be a PDF; report that
            // explicitly rather than letting the parser raise a format error.
            if (fileBytes.Length == 0)
            {
                return new BadRequestObjectResult("Request body is empty; send the PDF file as the raw request body.");
            }

            // NOTE(review): the body is used as-is. If the caller posts
            // multipart/form-data or a base64 string, these bytes are not a raw
            // PDF and loading will fail as "invalid format" - confirm what the
            // client actually sends.
            return new OkObjectResult(GetTextExtraction(fileBytes));
        }
        catch (System.Exception ex)
        {
            // Full exception text is returned to the caller to aid debugging.
            return new ConflictObjectResult(ex.ToString());
        }
    }

    /// <summary>
    /// Loads a PDF from memory and extracts all text from its first page.
    /// </summary>
    /// <param name="pdfBytes">Complete contents of a PDF file.</param>
    /// <returns>The first page's text, or an empty string when the document has no pages.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfBytes"/> is null.</exception>
    public static string GetTextExtraction(byte[] pdfBytes)
    {
        ArgumentNullException.ThrowIfNull(pdfBytes);

        PdfDocument pdf = new PdfDocument();
        try
        {
            pdf.LoadFromBytes(pdfBytes);

            // Guard the Pages[0] access below against a document without pages.
            if (pdf.Pages.Count == 0)
            {
                return "";
            }

            PdfTextExtractor textExtractor = new PdfTextExtractor(pdf.Pages[0]);
            PdfTextExtractOptions extractOptions = new PdfTextExtractOptions();
            extractOptions.IsExtractAllText = true;
            return textExtractor.ExtractText(extractOptions);
        }
        finally
        {
            // Release the document even when loading or extraction throws.
            pdf.Close();
        }
    }
}