Extract Text and Images from Word in Java

This article demonstrates how to extract text and images from Word documents by using Spire.Doc for Java.

Extract Text

import com.spire.doc.Document;

import java.io.FileWriter;
import java.io.IOException;

/**
 * Extracts the text of a Word document with Spire.Doc and saves it to a plain-text file.
 */
public class ExtractText {

    /**
     * Loads the sample document, reads its text and writes that text to
     * {@code ExtractedText.txt} in the working directory.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the text file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {

        //load Word document
        Document document = new Document();
        document.loadFromFile("C:\\Users\\Administrator\\Desktop\\sample.docx");

        //get text from document as string
        String text = document.getText();

        //write string to a .txt file
        writeStringToTxt(text, "ExtractedText.txt");
    }

    /**
     * Appends {@code content} to the file named {@code txtFileName}, creating the file
     * when it does not exist yet. Because the file is opened in append mode, existing
     * content is kept and repeated calls accumulate text.
     *
     * @param content     the text to write
     * @param txtFileName path of the target text file
     * @throws IOException if the file cannot be opened, written, flushed or closed
     */
    public static void writeStringToTxt(String content, String txtFileName) throws IOException {

        // try-with-resources flushes and closes the writer on every path, including a
        // failed write, and lets the failure reach the caller instead of hiding it.
        try (FileWriter fWriter = new FileWriter(txtFileName, true)) {
            fWriter.write(content);
        }
    }
}

Extract Text and Images from Word in Java

Extract Images

import com.spire.doc.Document;
import com.spire.doc.documents.DocumentObjectType;
import com.spire.doc.fields.DocPicture;
import com.spire.doc.interfaces.ICompositeObject;
import com.spire.doc.interfaces.IDocumentObject;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

/**
 * Collects the pictures found in a Word document with Spire.Doc and saves each one
 * as a PNG file in the {@code output} directory.
 */
public class ExtractImages {

    /**
     * Loads the sample document, walks its object tree to gather every picture and
     * writes the pictures to {@code output/ExtractedImage-<index>.png}.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the output directory cannot be created or an image
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {

        //load Word document
        Document document = new Document();
        document.loadFromFile("C:\\Users\\Administrator\\Desktop\\sample.docx");

        //queue of composite objects whose children still have to be visited
        Queue<ICompositeObject> nodes = new LinkedList<>();
        nodes.add(document);

        //images collected so far, in the order they are encountered
        List<BufferedImage> images = new ArrayList<>();

        //breadth-first walk over the child objects of the document
        while (!nodes.isEmpty()) {
            ICompositeObject node = nodes.poll();
            for (int i = 0; i < node.getChildObjects().getCount(); i++) {
                IDocumentObject child = node.getChildObjects().get(i);
                if (child instanceof ICompositeObject) {
                    //composite children are queued so their own children get visited too
                    nodes.add((ICompositeObject) child);

                    //get each image and add it to the list
                    if (child.getDocumentObjectType() == DocumentObjectType.Picture) {
                        DocPicture picture = (DocPicture) child;
                        images.add(picture.getImage());
                    }
                }
            }
        }

        //make sure the target directory exists, otherwise writing the files fails
        if (!images.isEmpty()) {
            File outputDir = new File("output");
            if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
                throw new IOException("Could not create output directory: "
                        + outputDir.getAbsolutePath());
            }
        }

        //save images as .png files
        for (int i = 0; i < images.size(); i++) {
            File file = new File(String.format("output/ExtractedImage-%d.png", i));
            ImageIO.write(images.get(i), "PNG", file);
        }
    }
}

Extract Text and Images from Word in Java