feat: add ability to parse LayoutElement lists (#165)

* added ability to split list items

* changelog and version bump

* retrigger ci
This commit is contained in:
Matt Robinson 2023-01-20 08:55:11 -05:00 committed by GitHub
parent c1822911a5
commit 8d3e616846
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 73 additions and 6 deletions

View File

@ -1,9 +1,10 @@
## 0.4.4-dev0
## 0.4.4-dev1
* Updated `partition_pdf` and `partition_image` to return `unstructured` `Element` objects
* Fixed the healthcheck url path when partitioning images and PDFs via API
* Adds an optional `coordinates` attribute to document objects
* Adds `FigureCaption` and `CheckBox` document elements
* Added ability to split lists detected in `LayoutElement` objects
## 0.4.3

View File

@ -1,6 +1,13 @@
from unstructured_inference.inference.layout import LayoutElement
from unstructured.documents.elements import CheckBox, FigureCaption, NarrativeText, Text, Title
from unstructured.documents.elements import (
CheckBox,
FigureCaption,
ListItem,
NarrativeText,
Text,
Title,
)
import unstructured.partition.common as common
@ -66,3 +73,31 @@ def test_normalize_layout_element_unchecked_box():
)
element = common.normalize_layout_element(layout_element)
assert element == CheckBox(checked=False, coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
def test_normalize_layout_element_enumerated_list():
    """A "List" layout element with numbered items yields one ListItem per item."""
    box = [[1, 2], [3, 4], [5, 6], [7, 8]]
    enumerated = LayoutElement(
        type="List",
        coordinates=box,
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
    )

    # Every resulting item is expected to carry the coordinates of the source element.
    expected = [
        ListItem(text=item_text, coordinates=box)
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]
    assert common.normalize_layout_element(enumerated) == expected
def test_normalize_layout_element_bulleted_list():
    """A "List" layout element with `*` bullets yields one ListItem per bullet."""
    box = [[1, 2], [3, 4], [5, 6], [7, 8]]
    expected = [
        ListItem(text="I'm so cool!", coordinates=box),
        ListItem(text="You're cool too.", coordinates=box),
        ListItem(text="We're all cool!", coordinates=box),
    ]

    bulleted = LayoutElement(
        type="List",
        coordinates=box,
        text="* I'm so cool! * You're cool too. * We're all cool!",
    )
    assert common.normalize_layout_element(bulleted) == expected

View File

@ -1 +1 @@
__version__ = "0.4.4-dev0" # pragma: no cover
__version__ = "0.4.4-dev1" # pragma: no cover

View File

@ -40,7 +40,9 @@ UNICODE_BULLETS: Final[List[str]] = [
"\x95",
"·",
]
UNICODE_BULLETS_RE = re.compile(f"({'|'.join(UNICODE_BULLETS)})")
UNICODE_BULLETS_RE = re.compile(f"(?:{'|'.join(UNICODE_BULLETS)})")
# Matches enumerated bullets made of one to three segments, e.g. "1." or "1.2.3".
# NOTE(review): `[a-z][A-Z]` matches a lowercase letter immediately followed by an
# uppercase letter (e.g. "aB"), not a single letter of either case - confirm intent.
# NOTE(review): the trailing `\.?` makes the period optional, so a bare 1-3 digit
# number anywhere in the text also matches - confirm this is desired.
ENUMERATED_BULLETS_RE = re.compile(r"(?:(?:\d{1,3}|[a-z][A-Z])\.?){1,3}")
EMAIL_HEAD_PATTERN = (
r"(MIME-Version: 1.0(.*)?\n)?Date:.*\nMessage-ID:.*\nSubject:.*\nFrom:.*\nTo:.*"

View File

@ -1,14 +1,18 @@
from typing import List, Union
from unstructured.documents.elements import (
Element,
CheckBox,
FigureCaption,
ListItem,
NarrativeText,
Text,
Title,
)
from unstructured.nlp.patterns import UNICODE_BULLETS_RE, ENUMERATED_BULLETS_RE
def normalize_layout_element(layout_element) -> Element:
def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
"""Converts a list of unstructured_inference DocumentLayout objects to a list of
unstructured Elements."""
@ -27,9 +31,26 @@ def normalize_layout_element(layout_element) -> Element:
return NarrativeText(text=text, coordinates=coordinates)
elif element_type == "Figure":
return FigureCaption(text=text, coordinates=coordinates)
elif element_type == "List":
return layout_list_to_list_items(text, coordinates)
elif element_type == "Checked":
return CheckBox(checked=True, coordinates=coordinates)
elif element_type == "Unchecked":
return CheckBox(checked=False, coordinates=coordinates)
else:
return Text(text=text, coordinates=coordinates)
def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Element]:
    """Splits the text of a list LayoutElement into individual ListItem elements.

    The text is split on enumerated bullets first. If that pattern never matches,
    the text is split on unicode bullets instead. Segments that are empty after
    stripping whitespace are dropped, and every ListItem receives the same
    coordinates that were passed in.
    """
    segments = ENUMERATED_BULLETS_RE.split(text)
    # NOTE(robinson) - a single segment means the enumerated bullets never matched
    if len(segments) < 2:
        segments = UNICODE_BULLETS_RE.split(text)

    stripped_segments = (segment.strip() for segment in segments)
    return [
        ListItem(text=item_text, coordinates=coordinates)
        for item_text in stripped_segments
        if item_text
    ]

View File

@ -69,7 +69,15 @@ def partition_pdf_or_image(
filename=filename, file=file, url=url, token=token, data=data
)
return [normalize_layout_element(el) for el in layout_elements]
elements: List[Element] = list()
for layout_element in layout_elements:
element = normalize_layout_element(layout_element)
if isinstance(element, list):
elements.extend(element)
else:
elements.append(element)
return elements
def _partition_pdf_or_image_local(