mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: track emphasized text msword (#1048)
* feat: add functionality to track emphasized text (`bold/italic` formatting) from paragraph * chore: add docstring * chore: fix lint errors * feat: ignore spaces when extracting emphasized texts from a paragraph * feat: add functionality to track emphasized text (`bold/italic` formatting) from table * test: add test case for grabbing emphasized texts from element metadata * chore: fix lint errors * chore: update changelog & version * Update ingest test fixtures (#1047)
This commit is contained in:
parent
2888c20a46
commit
b76d2ee745
@ -2,11 +2,12 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Update `partition_doc` and `partition_docx` to track emphasized texts in the output
|
||||
* Adds post processing function `filter_element_types`
|
||||
* Set the default strategy for partitioning images to `hi_res`
|
||||
* Add page break parameter section in API documentation to sync with change in Prod API
|
||||
* Update `partition_html` to track emphasized texts in the output
|
||||
* Update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag
|
||||
* Track emphasized texts in `partition_html` output
|
||||
* Add parameter `include_tail_text` to `_construct_text` to enable (skip) tail text inclusion
|
||||
|
||||
### Features
|
||||
|
BIN
example-docs/fake-doc-emphasized-text.doc
Executable file
BIN
example-docs/fake-doc-emphasized-text.doc
Executable file
Binary file not shown.
BIN
example-docs/fake-doc-emphasized-text.docx
Executable file
BIN
example-docs/fake-doc-emphasized-text.docx
Executable file
Binary file not shown.
@ -14,7 +14,12 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import (
|
||||
_get_emphasized_texts_from_paragraph,
|
||||
_get_emphasized_texts_from_table,
|
||||
partition_docx,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
@ -285,3 +290,74 @@ def test_partition_docx_from_file_without_metadata_date(
|
||||
elements = partition_docx(file=sf)
|
||||
|
||||
assert elements[0].metadata.last_modified is None
|
||||
|
||||
|
||||
def test_get_emphasized_texts_from_paragraph(
    filename="example-docs/fake-doc-emphasized-text.docx",
):
    """Check emphasized-text extraction on three paragraphs of the fixture document.

    Paragraph 1 mixes bold, italic and bold-italic runs; paragraph 2 is empty;
    paragraph 3 carries plain text only. The latter two must yield no entries.
    """
    mixed_formatting = [
        {"text": "bold", "tag": "b"},
        {"text": "italic", "tag": "i"},
        # A bold-italic run is reported twice, once per tag.
        {"text": "bold-italic", "tag": "b"},
        {"text": "bold-italic", "tag": "i"},
    ]
    # (paragraph index, expected paragraph text, expected emphasized texts)
    cases = (
        (1, "I am a bold italic bold-italic text.", mixed_formatting),
        (2, "", []),
        (3, "I am a normal text.", []),
    )

    document = docx.Document(filename)
    for index, expected_text, expected_emphasis in cases:
        paragraph = document.paragraphs[index]
        emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
        assert paragraph.text == expected_text
        assert emphasized_texts == expected_emphasis
|
||||
|
||||
|
||||
def test_get_emphasized_texts_from_table(
    filename="example-docs/fake-doc-emphasized-text.docx",
):
    """Check emphasized-text extraction on the first table of the fixture document."""
    first_table = docx.Document(filename).tables[0]

    # A bold-italic run is reported twice, once per tag.
    assert _get_emphasized_texts_from_table(first_table) == [
        {"text": "bold", "tag": "b"},
        {"text": "italic", "tag": "i"},
        {"text": "bold-italic", "tag": "b"},
        {"text": "bold-italic", "tag": "i"},
    ]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("filename", "partition_func"),
    [
        ("fake-doc-emphasized-text.docx", partition_docx),
        ("fake-doc-emphasized-text.doc", partition_doc),
    ],
)
def test_partition_docx_grabs_emphasized_texts(filename, partition_func):
    """Check that partitioning records emphasized texts in element metadata.

    The same expectations are applied to the .docx and the .doc fixture: a
    leading table and an emphasized paragraph both carry the four emphasis
    entries, while the plain paragraph carries ``None``.
    """
    # Shared by the table and the emphasized paragraph.
    expected_emphasis = [
        {"text": "bold", "tag": "b"},
        {"text": "italic", "tag": "i"},
        {"text": "bold-italic", "tag": "b"},
        {"text": "bold-italic", "tag": "i"},
    ]

    elements = partition_func(filename=f"example-docs/{filename}")

    table_element = elements[0]
    assert isinstance(table_element, Table)
    assert table_element.metadata.emphasized_texts == expected_emphasis

    emphasized_paragraph = elements[1]
    assert emphasized_paragraph == NarrativeText("I am a bold italic bold-italic text.")
    assert emphasized_paragraph.metadata.emphasized_texts == expected_emphasis

    plain_paragraph = elements[2]
    assert plain_paragraph == NarrativeText("I am a normal text.")
    assert plain_paragraph.metadata.emphasized_texts is None
|
||||
|
@ -15,7 +15,13 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "CHAPTER 1",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "CHAPTER 1"
|
||||
},
|
||||
@ -25,7 +31,13 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "INTRODUCTION",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "INTRODUCTION"
|
||||
},
|
||||
|
@ -15,7 +15,13 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "CHAPTER 1",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "CHAPTER 1"
|
||||
},
|
||||
@ -25,7 +31,13 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "INTRODUCTION",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "INTRODUCTION"
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
[
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "f8db6c6e535705336195aa2c1d23d414",
|
||||
"element_id": "c00fc0e5ac303c40f9089791e5e485b1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
@ -9,11 +9,11 @@
|
||||
"page_name": "Stanley Cups",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n"
|
||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
|
||||
},
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "20f5163a43ac6eb04a40d269d3ad0663",
|
||||
"element_id": "31421b5cd94fedb10dc82738503b4505",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
@ -21,6 +21,6 @@
|
||||
"page_name": "Stanley Cups Since 67",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n"
|
||||
"text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
|
||||
}
|
||||
]
|
@ -4,7 +4,13 @@
|
||||
"element_id": "7e8cd2056da73a7fefb6cd91f4e5d199",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "Title",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "Title"
|
||||
},
|
||||
@ -13,7 +19,13 @@
|
||||
"element_id": "9870998df89c1da4e01378d0fd085106",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "This is a good reason to continue",
|
||||
"tag": "b"
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "This is a good reason to continue"
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
[
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "b3e92c24311471ee2c4884b010dd55a0",
|
||||
"element_id": "677f7fdbfa79de9d91e157663dd559cd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
@ -10,11 +10,11 @@
|
||||
"page_name": "Example Test",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>MA</td>\n <td>What C datatypes are 8 bits? (assume i386)</td>\n <td>int</td>\n <td></td>\n <td>float</td>\n <td></td>\n <td>double</td>\n <td></td>\n <td>char</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>Bagpipes are awesome.</td>\n <td>true</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Rank the following in their order of operation.</td>\n <td>Parentheses</td>\n <td>Exponents</td>\n <td>Division</td>\n <td>Addition</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>The student activities fee is</td>\n <td>95</td>\n <td>dollars for students enrolled in</td>\n <td>19</td>\n <td>units or more,</td>\n <td></td>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Match the lower-case greek letter with its capital form.</td>\n <td>λ</td>\n <td>Λ</td>\n <td>α</td>\n <td>γ</td>\n <td>Γ</td>\n <td>φ</td>\n <td>Φ</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n MA\n What C datatypes are 8 bits? (assume i386)\n int\n \n float\n \n double\n \n char\n \n \n TF\n Bagpipes are awesome.\n true\n \n \n \n \n \n \n \n \n ESS\n How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n \n \n \n \n \n \n \n \n \n ORD\n Rank the following in their order of operation.\n Parentheses\n Exponents\n Division\n Addition\n \n \n \n \n \n FIB\n The student activities fee is\n 95\n dollars for students enrolled in\n 19\n units or more,\n \n \n \n \n \n MAT\n Match the lower-case greek letter with its capital form.\n λ\n Λ\n α\n γ\n Γ\n φ\n Φ\n \n \n"
|
||||
"text": "\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n"
|
||||
},
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "adf2eb068afa00f6dfaa4adf8195ce25",
|
||||
"element_id": "079ef3ee8c03cb36789b08765181ebc4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
@ -23,11 +23,11 @@
|
||||
"page_name": "Format Abbr.",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Question Format Abbreviations</td>\n <td></td>\n </tr>\n <tr>\n <td></td>\n <td></td>\n </tr>\n <tr>\n <td>Abbreviation</td>\n <td>Question Type</td>\n </tr>\n <tr>\n <td>MC</td>\n <td>Multiple Choice</td>\n </tr>\n <tr>\n <td>MA</td>\n <td>Multiple Answer</td>\n </tr>\n <tr>\n <td>TF</td>\n <td>True/False</td>\n </tr>\n <tr>\n <td>ESS</td>\n <td>Essay</td>\n </tr>\n <tr>\n <td>ORD</td>\n <td>Ordering</td>\n </tr>\n <tr>\n <td>MAT</td>\n <td>Matching</td>\n </tr>\n <tr>\n <td>FIB</td>\n <td>Fill in the Blank</td>\n </tr>\n <tr>\n <td>FIL</td>\n <td>File response</td>\n </tr>\n <tr>\n <td>NUM</td>\n <td>Numeric Response</td>\n </tr>\n <tr>\n <td>SR</td>\n <td>Short response</td>\n </tr>\n <tr>\n <td>OP</td>\n <td>Opinion</td>\n </tr>\n <tr>\n <td>FIB_PLUS</td>\n <td>Multiple Fill in the Blank</td>\n </tr>\n <tr>\n <td>JUMBLED_SENTENCE</td>\n <td>Jumbled Sentence</td>\n </tr>\n <tr>\n <td>QUIZ_BOWL</td>\n <td>Quiz Bowl</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n \n \n Question Format Abbreviations\n \n \n \n \n \n \n \n Abbreviation\n Question Type\n \n \n MC\n Multiple Choice\n \n \n MA\n Multiple Answer\n \n \n TF\n True/False\n \n \n ESS\n Essay\n \n \n ORD\n Ordering\n \n \n MAT\n Matching\n \n \n FIB\n Fill in the Blank\n \n \n FIL\n File response\n \n \n NUM\n Numeric Response\n \n \n SR\n Short response\n \n \n OP\n Opinion\n \n \n FIB_PLUS\n Multiple Fill in the Blank\n \n \n JUMBLED_SENTENCE\n Jumbled Sentence\n \n \n QUIZ_BOWL\n Quiz Bowl\n \n \n"
|
||||
"text": "\n\n\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\n\n\nQuestion Format Abbreviations\n\n\n\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n"
|
||||
},
|
||||
{
|
||||
"type": "Table",
|
||||
"element_id": "55c06f516945f32a0187cfd94ba7e074",
|
||||
"element_id": "c7b7d8780a970d589554c3784283b67e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "tests-example.xls",
|
||||
@ -36,6 +36,6 @@
|
||||
"page_name": "Readme",
|
||||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>File Information</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Source</td>\n </tr>\n <tr>\n <td>http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Version</td>\n </tr>\n <tr>\n <td>1.0 (January 2012)</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>Contact</td>\n </tr>\n <tr>\n <td>bb-help@andrew.cmu.edu</td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td></td>\n </tr>\n <tr>\n <td>About</td>\n </tr>\n <tr>\n <td>This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions</td>\n </tr>\n </tbody>\n</table>"
|
||||
},
|
||||
"text": "\n \n \n \n \n \n \n \n \n http://www.cmu.edu/blackboard\n \n \n \n \n \n File Information\n \n \n \n \n \n \n \n \n Source\n \n \n http://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n \n \n \n \n \n \n \n \n Version\n \n \n 1.0 (January 2012)\n \n \n \n \n \n \n \n \n Contact\n \n \n bb-help@andrew.cmu.edu\n \n \n \n \n \n \n \n \n About\n \n \n This is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n \n \n"
|
||||
"text": "\n\n\n\n\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\n\n\n\n\nFile Information\n\n\n\n\n\n\n\n\nSource\n\n\nhttp://www.cmu.edu/blackboard/files/evaluate/tests-example.xls\n\n\n\n\n\n\n\n\nVersion\n\n\n1.0 (January 2012)\n\n\n\n\n\n\n\n\nContact\n\n\nbb-help@andrew.cmu.edu\n\n\n\n\n\n\n\n\nAbout\n\n\nThis is an example and template for preparing Blackboard tests offline. See the full directions at: http://www.cmu.edu/blackboard/evaluate#manage_tests/import_questions\n\n\n"
|
||||
}
|
||||
]
|
@ -244,6 +244,8 @@ def _get_links_from_tag(tag_elem: etree.Element) -> List[Link]:
|
||||
|
||||
|
||||
def _get_emphasized_texts_from_tag(tag_elem: etree.Element) -> List[dict]:
|
||||
"""Get emphasized texts enclosed in <strong>, <em>, <span>, <b>, <i> tags
|
||||
from a tag element in HTML"""
|
||||
emphasized_texts = []
|
||||
tags_to_track = ["strong", "em", "span", "b", "i"]
|
||||
if tag_elem is None:
|
||||
|
@ -5,6 +5,7 @@ from typing import IO, BinaryIO, List, Optional, Tuple, Union, cast
|
||||
|
||||
import docx
|
||||
from docx.oxml.shared import qn
|
||||
from docx.table import Table as DocxTable
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
|
||||
@ -169,6 +170,7 @@ def partition_docx(
|
||||
for element_item in document.element.body:
|
||||
if element_item.tag.endswith("tbl"):
|
||||
table = document.tables[table_index]
|
||||
emphasized_texts = _get_emphasized_texts_from_table(table)
|
||||
html_table = convert_ms_office_table_to_text(table, as_html=True)
|
||||
text_table = convert_ms_office_table_to_text(table, as_html=False)
|
||||
element = Table(text_table)
|
||||
@ -178,6 +180,7 @@ def partition_docx(
|
||||
filename=metadata_filename,
|
||||
page_number=page_number,
|
||||
last_modified=metadata_last_modified or last_modification_date,
|
||||
emphasized_texts=emphasized_texts if emphasized_texts else None,
|
||||
)
|
||||
elements.append(element)
|
||||
table_index += 1
|
||||
@ -185,12 +188,14 @@ def partition_docx(
|
||||
if "<w:numPr>" in element_item.xml:
|
||||
is_list = True
|
||||
paragraph = docx.text.paragraph.Paragraph(element_item, document)
|
||||
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
|
||||
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
|
||||
if para_element is not None:
|
||||
para_element.metadata = ElementMetadata(
|
||||
filename=metadata_filename,
|
||||
page_number=page_number,
|
||||
last_modified=metadata_last_modified or last_modification_date,
|
||||
emphasized_texts=emphasized_texts if emphasized_texts else None,
|
||||
)
|
||||
elements.append(para_element)
|
||||
is_list = False
|
||||
@ -369,3 +374,27 @@ def convert_and_partition_docx(
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def _get_emphasized_texts_from_paragraph(paragraph: Paragraph) -> List[dict]:
    """Get emphasized texts with bold/italic formatting from a paragraph in MS Word

    Each run of the paragraph is inspected in order. Runs whose text is empty
    or whitespace-only are skipped. For the remaining runs, one
    ``{"text": ..., "tag": ...}`` entry is produced per active format: tag
    ``"b"`` when ``run.bold`` is truthy and tag ``"i"`` when ``run.italic`` is
    truthy, so a bold-italic run yields two entries (``"b"`` first).

    Returns an empty list when no run qualifies.
    """
    found = []
    for run in paragraph.runs:
        # Surrounding whitespace is not part of the reported text.
        stripped = (run.text or "").strip()
        if not stripped:
            continue
        # Only truthy flags count; a falsy value (False or None) is not reported.
        for is_active, tag in ((run.bold, "b"), (run.italic, "i")):
            if is_active:
                found.append({"text": stripped, "tag": tag})
    return found
|
||||
|
||||
|
||||
def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]:
    """Get emphasized texts with bold/italic formatting from a table in MS Word

    Visits the paragraphs of every cell, row by row, and concatenates the
    results of ``_get_emphasized_texts_from_paragraph`` in that order.
    Returns an empty list when nothing in the table is emphasized.
    """
    # NOTE(review): cells are taken from ``row.cells`` as-is; if the library
    # repeats a merged cell across the columns it spans, its entries would be
    # collected once per repetition -- confirm against python-docx behavior.
    cell_paragraphs = (
        paragraph
        for row in table.rows
        for cell in row.cells
        for paragraph in cell.paragraphs
    )
    collected = []
    for paragraph in cell_paragraphs:
        collected.extend(_get_emphasized_texts_from_paragraph(paragraph))
    return collected
|
||||
|
Loading…
x
Reference in New Issue
Block a user