fix: detect list items in MS Word documents (#909)

* fix merge conflict

* update changelog and version
This commit is contained in:
John 2023-07-10 10:29:08 -05:00 committed by GitHub
parent 79f734d3f9
commit 6173362620
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 32 additions and 8 deletions

View File

@ -1,4 +1,4 @@
## 0.8.1-dev0
## 0.8.1-dev1
### Enhancements
@ -7,6 +7,7 @@
### Fixes
* Fixed the `auto` strategy detecting a scanned document as having extractable text and using the `fast` strategy, resulting in no output.
* Fix list detection in MS Word documents.
## 0.8.0

Binary file not shown.

View File

@ -171,6 +171,21 @@ def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_detects_lists(filename="example-docs/example-list-items-multiple.docx"):
    """Check that partitioning the list fixture document yields ``ListItem`` elements.

    The test asserts two things about the output of ``partition_docx``:
    the last element equals a specific ``ListItem``, and exactly ten
    elements in total are ``ListItem`` instances.
    """
    elements = partition_docx(filename=filename)

    # Only the list items are counted; other element types are not
    # inspected by this test, so they are not collected.
    list_elements = [element for element in elements if isinstance(element, ListItem)]

    assert elements[-1] == ListItem(
        "This is simply dummy text of the printing and typesetting industry.",
    )
    assert len(list_elements) == 10
def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
elements = partition_docx(filename=filename, include_metadata=False)
assert elements[0].metadata.filetype is None

View File

@ -1 +1 @@
__version__ = "0.8.1-dev0" # pragma: no cover
__version__ = "0.8.1-dev1" # pragma: no cover

View File

@ -148,8 +148,8 @@ def partition_docx(
document_contains_pagebreaks = _element_contains_pagebreak(document._element)
page_number = 1 if document_contains_pagebreaks else None
section = 0
is_list = False
for element_item in document.element.body:
if element_item.tag.endswith("tbl"):
table = document.tables[table_index]
@ -165,14 +165,17 @@ def partition_docx(
elements.append(element)
table_index += 1
elif element_item.tag.endswith("p"):
if "<w:numPr>" in element_item.xml:
is_list = True
paragraph = docx.text.paragraph.Paragraph(element_item, document)
para_element: Optional[Text] = _paragraph_to_element(paragraph)
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
if para_element is not None:
para_element.metadata = ElementMetadata(
filename=metadata_filename,
page_number=page_number,
)
elements.append(para_element)
is_list = False
elif element_item.tag.endswith("sectPr"):
if len(headers_and_footers) > section:
footers = headers_and_footers[section][1]
@ -191,7 +194,10 @@ def partition_docx(
return elements
def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]:
def _paragraph_to_element(
paragraph: docx.text.paragraph.Paragraph,
is_list=False,
) -> Optional[Text]:
"""Converts a docx Paragraph object into the appropriate unstructured document element.
If the paragraph style is "Normal" or unknown, we try to predict the element type from the
raw text."""
@ -205,7 +211,9 @@ def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[
# NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
# Unknown style names will also return None
if element_class is None:
if is_list:
return _text_to_element(text, is_list)
elif element_class is None:
return _text_to_element(text)
else:
return element_class(text)
@ -227,9 +235,9 @@ def _element_contains_pagebreak(element) -> bool:
return False
def _text_to_element(text: str) -> Optional[Text]:
def _text_to_element(text: str, is_list=False) -> Optional[Text]:
"""Converts raw text into an unstructured Text element."""
if is_bulleted_text(text):
if is_bulleted_text(text) or is_list:
clean_text = clean_bullets(text).strip()
return ListItem(text=clean_bullets(text)) if clean_text else None