mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 23:24:27 +00:00
feat: add ability to parse LayoutElement lists (#165)
* added ability to split list items * changelog and version bump * retrigger ci
This commit is contained in:
parent
c1822911a5
commit
8d3e616846
@ -1,9 +1,10 @@
|
||||
## 0.4.4-dev0
|
||||
## 0.4.4-dev1
|
||||
|
||||
* Updated `partition_pdf` and `partition_image` to return `unstructured` `Element` objects
|
||||
* Fixed the healthcheck url path when partitioning images and PDFs via API
|
||||
* Adds an optional `coordinates` attribute to document objects
|
||||
* Adds `FigureCaption` and `CheckBox` document elements
|
||||
* Added ability to split lists detected in `LayoutElement` objects
|
||||
|
||||
## 0.4.3
|
||||
|
||||
|
||||
@ -1,6 +1,13 @@
|
||||
from unstructured_inference.inference.layout import LayoutElement
|
||||
|
||||
from unstructured.documents.elements import CheckBox, FigureCaption, NarrativeText, Text, Title
|
||||
from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
FigureCaption,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
import unstructured.partition.common as common
|
||||
|
||||
|
||||
@ -66,3 +73,31 @@ def test_normalize_layout_element_unchecked_box():
|
||||
)
|
||||
element = common.normalize_layout_element(layout_element)
|
||||
assert element == CheckBox(checked=False, coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
|
||||
|
||||
|
||||
def test_normalize_layout_element_enumerated_list():
    """A numbered "List" layout element is normalized into one ListItem per entry,
    each carrying the coordinates of the original layout element."""
    numbered_list = LayoutElement(
        type="List",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
    )

    expected_items = [
        ListItem(text=item_text, coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]

    assert common.normalize_layout_element(numbered_list) == expected_items
|
||||
|
||||
|
||||
def test_normalize_layout_element_bulleted_list():
    """A bulleted "List" layout element is normalized into one ListItem per entry,
    each carrying the coordinates of the original layout element."""
    bulleted_list = LayoutElement(
        type="List",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        text="* I'm so cool! * You're cool too. * We're all cool!",
    )

    expected_items = [
        ListItem(text=item_text, coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]])
        for item_text in ("I'm so cool!", "You're cool too.", "We're all cool!")
    ]

    assert common.normalize_layout_element(bulleted_list) == expected_items
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.4-dev0" # pragma: no cover
|
||||
__version__ = "0.4.4-dev1" # pragma: no cover
|
||||
|
||||
@ -40,7 +40,9 @@ UNICODE_BULLETS: Final[List[str]] = [
|
||||
"\x95",
|
||||
"·",
|
||||
]
|
||||
UNICODE_BULLETS_RE = re.compile(f"({'|'.join(UNICODE_BULLETS)})")
|
||||
# Alternation over every entry in UNICODE_BULLETS. The group is non-capturing
# so that `UNICODE_BULLETS_RE.split(...)` returns only the text between bullets
# and does not include the bullet characters themselves in the result.
UNICODE_BULLETS_RE = re.compile(f"(?:{'|'.join(UNICODE_BULLETS)})")
|
||||
|
||||
# Matches one to three consecutive "enumerator" tokens, where each token is
# either one to three digits or a lowercase letter immediately followed by an
# uppercase letter, optionally followed by a period (e.g. "1.", "12", "1.2.3.").
# All groups are non-capturing so `.split(...)` drops the enumerators.
# NOTE(review): `[a-z][A-Z]` is a two-character sequence, not a single letter of
# either case -- possibly `[a-zA-Z]` was intended; confirm before changing.
# NOTE(review): because the period is optional, bare digits inside ordinary text
# also match and will act as split points.
ENUMERATED_BULLETS_RE = re.compile(r"(?:(?:\d{1,3}|[a-z][A-Z])\.?){1,3}")
|
||||
|
||||
EMAIL_HEAD_PATTERN = (
|
||||
r"(MIME-Version: 1.0(.*)?\n)?Date:.*\nMessage-ID:.*\nSubject:.*\nFrom:.*\nTo:.*"
|
||||
|
||||
@ -1,14 +1,18 @@
|
||||
from typing import List, Union
|
||||
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
CheckBox,
|
||||
FigureCaption,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.nlp.patterns import UNICODE_BULLETS_RE, ENUMERATED_BULLETS_RE
|
||||
|
||||
|
||||
def normalize_layout_element(layout_element) -> Element:
|
||||
def normalize_layout_element(layout_element) -> Union[Element, List[Element]]:
|
||||
"""Converts a list of unstructured_inference DocumentLayout objects to a list of
|
||||
unstructured Elements."""
|
||||
|
||||
@ -27,9 +31,26 @@ def normalize_layout_element(layout_element) -> Element:
|
||||
return NarrativeText(text=text, coordinates=coordinates)
|
||||
elif element_type == "Figure":
|
||||
return FigureCaption(text=text, coordinates=coordinates)
|
||||
elif element_type == "List":
|
||||
return layout_list_to_list_items(text, coordinates)
|
||||
elif element_type == "Checked":
|
||||
return CheckBox(checked=True, coordinates=coordinates)
|
||||
elif element_type == "Unchecked":
|
||||
return CheckBox(checked=False, coordinates=coordinates)
|
||||
else:
|
||||
return Text(text=text, coordinates=coordinates)
|
||||
|
||||
|
||||
def layout_list_to_list_items(text: str, coordinates: List[float]) -> List[Element]:
    """Breaks the text of a list LayoutElement into individual ListItem elements.

    The text is first split on enumerated bullets. If that yields a single
    segment, it is split on unicode bullets instead. Each segment is stripped of
    surrounding whitespace, and segments that are empty after stripping are
    dropped. Every resulting ListItem receives the same ``coordinates`` value.
    """
    segments = ENUMERATED_BULLETS_RE.split(text)
    # NOTE(robinson) - a single segment means the enumerated bullet pattern never
    # matched, so fall back to the unicode bullet pattern
    if len(segments) == 1:
        segments = UNICODE_BULLETS_RE.split(text)

    stripped_segments = (segment.strip() for segment in segments)
    list_items: List[Element] = [
        ListItem(text=stripped, coordinates=coordinates)
        for stripped in stripped_segments
        if len(stripped) > 0
    ]
    return list_items
|
||||
|
||||
@ -69,7 +69,15 @@ def partition_pdf_or_image(
|
||||
filename=filename, file=file, url=url, token=token, data=data
|
||||
)
|
||||
|
||||
return [normalize_layout_element(el) for el in layout_elements]
|
||||
elements: List[Element] = list()
|
||||
for layout_element in layout_elements:
|
||||
element = normalize_layout_element(layout_element)
|
||||
if isinstance(element, list):
|
||||
elements.extend(element)
|
||||
else:
|
||||
elements.append(element)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
def _partition_pdf_or_image_local(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user