Spire.PDF is a professional PDF library for creating, writing, editing, handling, and reading PDF files without any external dependencies. Get free and professional technical support for Spire.PDF for .NET, Java, Android, C++, and Python.

Mon Nov 24, 2025 10:49 pm

Hello!

I think I am having trouble feeding a properly formatted PDF byte stream to Spire.Pdf. The error I am getting says the format is invalid.

What I am trying to do is accept a byte stream and load the PDF, then extract the text from the first page.

Here is my current code. Any advice on my goal would be appreciated!

Code: Select all
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Web;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.Functions.Worker;
using Microsoft.Extensions.Logging;
using Microsoft.Identity.Client.Extensions.Msal;
using Spire.Pdf; // https://medium.com/@alexaae9/how-to-extract-text-from-pdf-in-c-developer-guide-a0633699bac2#9502
using Spire.Pdf.Texts;
namespace Company.Function;

/// <summary>
/// HTTP-triggered Azure Function that receives a PDF in the request and
/// responds with the text extracted from its first page.
/// </summary>
public class getPDFText
{
    private readonly ILogger<getPDFText> _logger;

    public getPDFText(ILogger<getPDFText> logger)
    {
        _logger = logger;
    }

    /// <summary>
    /// Reads the PDF bytes from the request and returns the first page's text.
    /// </summary>
    /// <param name="req">
    /// Incoming request. The PDF may be sent either as the raw request body or
    /// as the first file of a form upload.
    /// </param>
    /// <returns>
    /// 200 with the extracted text; 400 when no bytes were received; 409 with
    /// the exception text when reading or parsing fails.
    /// </returns>
    [Function("getPDFText")]
    public static async Task<IActionResult> Run([HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequest req)
    {
        try
        {
            byte[] fileBytes = await ReadPdfBytesAsync(req);

            // A GET (or an empty POST) carries no document; answer explicitly
            // instead of handing an empty buffer to the PDF parser.
            if (fileBytes.Length == 0)
            {
                return new BadRequestObjectResult("No PDF data received.");
            }

            return new OkObjectResult(GetTextExtraction(fileBytes));
        }
        catch (System.Exception ex)
        {
            // NOTE(review): ex.ToString() exposes the full stack trace to an
            // anonymous caller. Useful while debugging; consider returning only
            // ex.Message before this endpoint is exposed publicly.
            return new ConflictObjectResult(ex.ToString());
        }
    }

    /// <summary>
    /// Copies the uploaded document into memory and returns it as a byte array.
    /// </summary>
    /// <remarks>
    /// For a form upload the raw body contains the multipart envelope
    /// (boundaries and part headers) around the file, so the file part is read
    /// through the form API rather than from <c>req.Body</c>.
    /// </remarks>
    private static async Task<byte[]> ReadPdfBytesAsync(HttpRequest req)
    {
        using var buffer = new MemoryStream();

        if (req.HasFormContentType)
        {
            IFormCollection form = await req.ReadFormAsync();
            if (form.Files.Count > 0)
            {
                using Stream fileStream = form.Files[0].OpenReadStream();
                await fileStream.CopyToAsync(buffer);
            }
        }
        else
        {
            await req.Body.CopyToAsync(buffer);
        }

        // ToArray copies the whole buffer regardless of the current position.
        return buffer.ToArray();
    }

    /// <summary>
    /// Loads a PDF from memory and extracts the text of its first page.
    /// </summary>
    /// <param name="pdfBytes">Complete contents of the PDF file.</param>
    /// <returns>
    /// The text of the first page, or an empty string when the document
    /// reports no pages.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfBytes"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="pdfBytes"/> is empty.</exception>
    public static string GetTextExtraction(byte[] pdfBytes)
    {
        if (pdfBytes is null)
        {
            throw new ArgumentNullException(nameof(pdfBytes));
        }

        if (pdfBytes.Length == 0)
        {
            throw new ArgumentException("PDF data is empty.", nameof(pdfBytes));
        }

        PdfDocument pdf = new PdfDocument();
        try
        {
            pdf.LoadFromBytes(pdfBytes);

            // Guard the Pages[0] access below.
            if (pdf.Pages.Count == 0)
            {
                return string.Empty;
            }

            PdfTextExtractor textExtractor = new PdfTextExtractor(pdf.Pages[0]);
            PdfTextExtractOptions extractOptions = new PdfTextExtractOptions
            {
                IsExtractAllText = true
            };

            return textExtractor.ExtractText(extractOptions);
        }
        finally
        {
            // Close the document even when loading or extraction throws.
            pdf.Close();
        }
    }
}

bhall_des
 
Posts: 1
Joined: Wed Nov 05, 2025 5:42 pm

Tue Nov 25, 2025 4:14 am

Hello,

Thank you for your inquiry.
When I tested with the following code, I was able to extract text from the file normally using both the free version Spire.PDF 11.11.0 and the commercial version 11.11.5, and could not reproduce the issue you encountered:
Code: Select all
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.Functions.Worker;
using Microsoft.Extensions.Logging;
using Spire.Pdf.Texts;
using Spire.Pdf;
using System.Text;

namespace AzureFunction1125
{
    public class GetPDFText
    {
        private readonly ILogger<GetPDFText> _logger;

        public GetPDFText(ILogger<GetPDFText> logger)
        {
            _logger = logger;
        }

        [Function("GetPDFText")]
        public static async Task<IActionResult> Run([HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequest req)
        {
            // Handle GET request: Return HTML upload page
            if (req.Method == "GET")
            {
                return new ContentResult
                {
                    Content = @"
                <html>
                <body>
                    <h1>PDF Text Extractor</h1>
                    <form method='post' enctype='multipart/form-data'>
                        <input type='file' name='pdfFile' accept='.pdf' required>
                        <button type='submit'>Extract Text</button>
                    </form>
                </body>
                </html>",
                    ContentType = "text/html"
                };
            }

            // Handle POST request: Process PDF file
            string strPdfTextContent = "";
            using var ms = new MemoryStream();

            try
            {
                // Check if it's a form file upload (from web browser)
                if (req.HasFormContentType && req.Form.Files.Count > 0)
                {
                    var file = req.Form.Files[0];
                    using var fileStream = file.OpenReadStream();
                    await fileStream.CopyToAsync(ms);
                }
                else
                {
                    // Handle raw byte stream (from curl/API clients)
                    await req.Body.CopyToAsync(ms);
                }

                // Reset stream position and convert to byte array
                ms.Position = 0;
                byte[] fileBytes = ms.ToArray();

                // Validate received data
                if (fileBytes.Length == 0)
                {
                    return new BadRequestObjectResult("No PDF data received. Please upload a PDF file.");
                }

                // Extract text from PDF bytes
                strPdfTextContent = GetTextExtraction(fileBytes);
            }
            catch (System.Exception ex)
            {
                return new BadRequestObjectResult($"Error: {ex.Message}");
            }

            // Return extracted text content
            return new OkObjectResult(strPdfTextContent);
        }

        public static string GetTextExtraction(byte[] pdfBytes)
        {
            PdfDocument pdf = new PdfDocument();
            string strOutputContent = "";

            pdf.LoadFromBytes(pdfBytes);

            StringBuilder extractedText = new StringBuilder();

            PdfTextExtractor textExtractor = new PdfTextExtractor(pdf.Pages[0]);

            PdfTextExtractOptions extractOptions = new PdfTextExtractOptions();

            extractOptions.IsExtractAllText = true;
            strOutputContent = textExtractor.ExtractText(extractOptions);

            pdf.Close();

            return strOutputContent;
        }
    }
}



Could you please confirm whether you are using our latest version for testing? If not, please update to that version and try again. If the issue persists, please provide your test files, a minimal demo that reproduces the problem, and the environment details where the error occurred (e.g., OS version, .NET version) for further investigation. You may upload the files as an attachment or send them to support@e-iceblue.com. Thank you in advance for your cooperation.
Sincerely,
Talia
E-iceblue support team
User avatar

talia.liu
 
Posts: 331
Joined: Mon Apr 14, 2025 3:33 am

Return to Spire.PDF