Hi Team,
We need to read and extract the text, together with its formatting, from the table of contents in a DOCX file.
Please advise on the solution.
A sample DOCX file is attached; from it we want to extract the "Contents" heading as well as the list of entries beneath it.
// Open the sample document and take its first section.
Document doc = new Document();
doc.LoadFromFile(@"C:\Heading_Mismatch.docx");
Section section = doc.Sections[0];

// Scan the direct children of the section body for structure document tags.
for (int childIndex = 0; childIndex < section.Body.ChildObjects.Count; childIndex++)
{
    // Anything that is not a structure document tag is skipped.
    if (section.Body.ChildObjects[childIndex].DocumentObjectType != DocumentObjectType.StructureDocumentTag)
    {
        continue;
    }

    StructureDocumentTag tag = section.Body.ChildObjects[childIndex] as StructureDocumentTag;

    // Child objects of the tag. For the attached sample these are expected to be
    // the "Contents" heading and the table-of-contents entries.
    DocumentObjectCollection objects = tag.ChildObjects;
}
// Open the sample document and take its first section.
Document doc = new Document();
doc.LoadFromFile(@"C:\Heading_Mismatch.docx");
Section section = doc.Sections[0];

// Working variables reused across the loops below.
Paragraph paragraph;
TextRange textRange;
String text;

// Collects one line per paragraph found inside a structure document tag.
StringBuilder stringBuilder = new StringBuilder();

for (int i = 0; i < section.Body.ChildObjects.Count; i++)
{
    // Only structure document tags in the body are inspected.
    if (section.Body.ChildObjects[i].DocumentObjectType != DocumentObjectType.StructureDocumentTag)
    {
        continue;
    }

    StructureDocumentTag tag = section.Body.ChildObjects[i] as StructureDocumentTag;

    // Child objects of the tag. For the attached sample these are expected to be
    // the "Contents" heading and the table-of-contents entries.
    DocumentObjectCollection objects = tag.ChildObjects;

    for (int j = 0; j < objects.Count; j++)
    {
        // Non-paragraph children are ignored.
        if (objects[j].DocumentObjectType != DocumentObjectType.Paragraph)
        {
            continue;
        }

        paragraph = objects[j] as Paragraph;

        // Record the paragraph's text as one output line.
        text = paragraph.Text;
        stringBuilder.AppendLine(text);

        for (int k = 0; k < paragraph.ChildObjects.Count; k++)
        {
            // Formatting is read from text ranges only.
            if (paragraph.ChildObjects[k].DocumentObjectType != DocumentObjectType.TextRange)
            {
                continue;
            }

            textRange = paragraph.ChildObjects[k] as TextRange;

            // Example formatting reads; the values are not used further in this sample.
            String fontName = textRange.CharacterFormat.FontName;
            float fontSize = textRange.CharacterFormat.FontSize;
            Color textColor = textRange.CharacterFormat.TextColor;
        }
    }
}

// Write the collected paragraph text to disk.
File.WriteAllText("TOC.txt", stringBuilder.ToString());
// Fragment: handles one child of the structure document tag. It relies on
// `objects`, `j`, `paragraph`, `text` and `textRange` being declared by the
// surrounding code.
if (objects[j].DocumentObjectType == DocumentObjectType.Paragraph)
{
    paragraph = objects[j] as Paragraph;
    //Read the text of paragraph
    text = paragraph.Text;
    for (int k = 0; k < paragraph.ChildObjects.Count; k++)
    {
        if (paragraph.ChildObjects[k].DocumentObjectType == DocumentObjectType.TextRange)
        {
            // Bind textRange to the current child before reading it; without this
            // assignment the variable is unassigned (or still refers to a run from
            // an earlier iteration) and the wrong text, or none, would be read.
            textRange = paragraph.ChildObjects[k] as TextRange;

            //Read the text of this individual run
            String text1 = textRange.Text;
        }
    }
}
// Load the document from `input` and take the body of its first section.
Document doc = new Document();
doc.LoadFromFile(input);
Body body = doc.Sections[0].Body;
Paragraph paragraph;

for (int i = 0; i < body.ChildObjects.Count; i++)
{
    // Only paragraphs directly under the body are considered.
    if (body.ChildObjects[i].DocumentObjectType != DocumentObjectType.Paragraph)
    {
        continue;
    }

    paragraph = (Paragraph)body.ChildObjects[i];

    // Keep only paragraphs whose style name contains "TOC".
    if (!paragraph.StyleName.Contains("TOC"))
    {
        continue;
    }

    for (int j = 0; j < paragraph.ChildObjects.Count; j++)
    {
        // Text is read from field objects only.
        if (paragraph.ChildObjects[j].DocumentObjectType != DocumentObjectType.Field)
        {
            continue;
        }

        Field field = paragraph.ChildObjects[j] as Field;

        // Text carried by the field; not used further in this sample.
        String text = field.FieldText;
    }
}