I found that the Spire library works well for extracting numbered lists from DOC files, so I’d like to use it.
How can I extract both tables and text in the order they appear in the DOC file?
def extract_paragraph_text(paragraph):
    """Return the paragraph's text, prefixed by its list label when present.

    Args:
        paragraph: Object exposing ``ListText`` and ``Text`` attributes
            (presumably a Spire.Doc ``Paragraph`` — confirm against caller).

    Returns:
        ``"<list label> <text>"`` when ``ListText`` is non-empty, otherwise
        just the text. Both pieces and the final result are stripped of
        surrounding whitespace.
    """
    parts = []
    list_text = paragraph.ListText
    # ListText is empty/None for paragraphs that are not list items.
    if list_text:
        parts.append(list_text.strip())
    # Fall back to "" so a missing (None) Text does not raise AttributeError.
    parts.append((paragraph.Text or "").strip())
    # The outer strip removes the separator left behind when either the
    # label or the body is blank.
    return " ".join(parts).strip()
def extract_table_data(table):
    """Return the table's contents as a list of rows of cell strings.

    Args:
        table: Object exposing ``Rows``; each row exposes ``Cells`` and each
            cell exposes ``Paragraphs``. All three collections are read via
            ``.Count`` and integer indexing (presumably a Spire.Doc ``Table``
            — confirm against caller).

    Returns:
        A list with one entry per row; each entry is a list with one string
        per cell. A cell's string is the space-joined text of its paragraphs
        (as produced by ``extract_paragraph_text``), stripped of surrounding
        whitespace. Only each cell's ``Paragraphs`` collection is read.
    """
    table_data = []
    rows = table.Rows
    for i in range(rows.Count):
        # Resolve each collection once instead of re-walking
        # table.Rows[i].Cells[j] on every inner iteration.
        cells = rows[i].Cells
        row_data = []
        for j in range(cells.Count):
            paragraphs = cells[j].Paragraphs
            cell_parts = [
                extract_paragraph_text(paragraphs[k])
                for k in range(paragraphs.Count)
            ]
            row_data.append(" ".join(cell_parts).strip())
        table_data.append(row_data)
    return table_data
# NOTE(review): Document, Paragraph and Table are presumably provided by the
# Spire.Doc package (e.g. ``from spire.doc import *``); no import is visible
# in this snippet — confirm one exists before running.
doc = Document()
# (kind, payload) tuples in document order; kind is "text" or "table".
ordered_content = []
try:
    doc.LoadFromFile("TextAndTableInOrder.doc")

    # Walk every section's body children in index order so that paragraphs
    # and tables are collected in the sequence they are stored.
    for i in range(doc.Sections.Count):
        section = doc.Sections[i]
        children = section.Body.ChildObjects
        for j in range(children.Count):
            element = children[j]
            if isinstance(element, Paragraph):
                # Handle paragraph: keep it only if it has visible text.
                para_text = extract_paragraph_text(element)
                if para_text:
                    ordered_content.append(("text", para_text))
            elif isinstance(element, Table):
                # Handle table: store it as a list of rows of cell strings.
                table_data = extract_table_data(element)
                ordered_content.append(("table", table_data))

    for item in ordered_content:
        if item[0] == "text":
            print("paragraph_text:", item[1])
        elif item[0] == "table":
            print("table_data:")
            for row in item[1]:
                print("\t".join(row))
        else:
            print("unknown:", item)
finally:
    # Close the document even if loading, extraction or printing fails.
    doc.Close()