mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 14:45:31 +00:00
Refactor: element type (#2035)
### Summary

- add constants for element types
- replace the string keys of the `TYPE_TO_TEXT_ELEMENT_MAP` dictionary with the `ElementType` constants
- replace hard-coded element type strings with the constants

### Testing

CI should pass.
This commit is contained in:
parent
c688216b38
commit
bb58c1bb0b
@ -18,7 +18,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
|
||||
* **Adds include_header argument for partition_csv and partition_tsv** Now supports retaining header rows in CSV and TSV documents element partitioning.
|
||||
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
|
||||
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
|
||||
* **Add support for generic partition configs in ingest cli** Along with the explicit partition options supported by the cli, an `additional_partition_args` arg was added to allow users to pass in any other arguments that should be added when calling partition(). This helps keep any changes to the input parameters of the partition() exposed in the CLI.
|
||||
|
||||
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
|
||||
Address,
|
||||
CompositeElement,
|
||||
Element,
|
||||
ElementType,
|
||||
Footer,
|
||||
Header,
|
||||
ListItem,
|
||||
@ -120,7 +121,11 @@ def test_parition_docx_from_team_chat():
|
||||
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
|
||||
"saved-by Dennis Forsythe",
|
||||
]
|
||||
assert [e.category for e in elements] == ["UncategorizedText", "UncategorizedText", "Table"]
|
||||
assert [e.category for e in elements] == [
|
||||
ElementType.UNCATEGORIZED_TEXT,
|
||||
ElementType.UNCATEGORIZED_TEXT,
|
||||
ElementType.TABLE,
|
||||
]
|
||||
|
||||
|
||||
def test_partition_docx_from_filename(
|
||||
|
||||
@ -7,7 +7,7 @@ import requests
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.documents.elements import ElementType, Title
|
||||
from unstructured.partition.md import partition_md
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
@ -295,4 +295,4 @@ def test_partition_md_parse_table():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "simple-table.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].category == "Table"
|
||||
assert elements[0].category == ElementType.TABLE
|
||||
|
||||
@ -9,6 +9,7 @@ from unstructured_inference.inference import layout
|
||||
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import ElementType
|
||||
from unstructured.partition import image, ocr, pdf
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
from unstructured.utils import only
|
||||
@ -126,7 +127,9 @@ def test_partition_image_with_auto_strategy(
|
||||
filename="example-docs/layout-parser-paper-fast.jpg",
|
||||
):
|
||||
elements = image.partition_image(filename=filename, strategy="auto")
|
||||
titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
|
||||
titles = [
|
||||
el for el in elements if el.category == ElementType.TITLE and len(el.text.split(" ")) > 10
|
||||
]
|
||||
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
idx = 3
|
||||
assert titles[0].text == title
|
||||
|
||||
@ -10,6 +10,7 @@ from unstructured_inference.inference.layoutelement import (
|
||||
LayoutElement,
|
||||
)
|
||||
|
||||
from unstructured.documents.elements import ElementType
|
||||
from unstructured.partition import ocr
|
||||
from unstructured.partition.ocr import pad_element_bboxes
|
||||
from unstructured.partition.utils.constants import (
|
||||
@ -248,7 +249,7 @@ def test_get_elements_from_ocr_regions(mock_embedded_text_regions):
|
||||
x2=1256.334784222222,
|
||||
y2=406.9837855555556,
|
||||
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
|
||||
type="UncategorizedText",
|
||||
type=ElementType.UNCATEGORIZED_TEXT,
|
||||
),
|
||||
]
|
||||
|
||||
@ -271,7 +272,7 @@ def test_zoom_image(zoom):
|
||||
@pytest.fixture()
|
||||
def mock_layout(mock_embedded_text_regions):
|
||||
return [
|
||||
LayoutElement(text=r.text, type="UncategorizedText", bbox=r.bbox)
|
||||
LayoutElement(text=r.text, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
|
||||
for r in mock_embedded_text_regions
|
||||
]
|
||||
|
||||
@ -354,7 +355,7 @@ def mock_embedded_text_regions():
|
||||
|
||||
def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
|
||||
ocr_elements = [
|
||||
LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox)
|
||||
LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
|
||||
for r in mock_ocr_regions
|
||||
]
|
||||
|
||||
@ -379,7 +380,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
|
||||
|
||||
def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions):
|
||||
ocr_elements = [
|
||||
LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox)
|
||||
LayoutElement(text=r.text, source=None, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox)
|
||||
for r in mock_ocr_regions
|
||||
]
|
||||
|
||||
@ -410,7 +411,7 @@ def test_pad_element_bboxes(padding, expected_bbox):
|
||||
y2=40,
|
||||
text="",
|
||||
source=None,
|
||||
type="UncategorizedText",
|
||||
type=ElementType.UNCATEGORIZED_TEXT,
|
||||
)
|
||||
expected_original_element_bbox = (10, 20, 30, 40)
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ from unstructured.documents.elements import (
|
||||
CheckBox,
|
||||
CoordinatesMetadata,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
FigureCaption,
|
||||
Header,
|
||||
ListItem,
|
||||
@ -515,7 +516,7 @@ def test_ocr_data_to_elements(
|
||||
bbox=r.bbox,
|
||||
text=r.text,
|
||||
source=r.source,
|
||||
type="UncategorizedText",
|
||||
type=ElementType.UNCATEGORIZED_TEXT,
|
||||
)
|
||||
for r in text_regions
|
||||
]
|
||||
@ -527,7 +528,7 @@ def test_ocr_data_to_elements(
|
||||
)
|
||||
|
||||
assert len(ocr_data) == len(elements)
|
||||
assert {el.category for el in elements} == {"UncategorizedText"}
|
||||
assert {el.category for el in elements} == {ElementType.UNCATEGORIZED_TEXT}
|
||||
|
||||
# check coordinates metadata
|
||||
image_width, image_height = image.size
|
||||
|
||||
@ -15,6 +15,7 @@ from unstructured.documents.elements import (
|
||||
CoordinateSystem,
|
||||
DataSourceMetadata,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
FigureCaption,
|
||||
Image,
|
||||
Link,
|
||||
@ -40,7 +41,7 @@ def test_convert_to_isd():
|
||||
isd = base.convert_to_isd(elements)
|
||||
|
||||
assert isd[0]["text"] == "Title 1"
|
||||
assert isd[0]["type"] == "Title"
|
||||
assert isd[0]["type"] == ElementType.TITLE
|
||||
|
||||
assert isd[1]["text"] == "Narrative 1"
|
||||
assert isd[1]["type"] == "NarrativeText"
|
||||
|
||||
@ -676,38 +676,76 @@ class Footer(Text):
|
||||
category = "Footer"
|
||||
|
||||
|
||||
class ElementType:
    """Named string constants for element type / category names.

    Each attribute is a plain ``str``, so it compares equal to the raw
    string it replaces (e.g. ``ElementType.TABLE == "Table"``).  Elsewhere in
    this file the constants are compared against element ``category`` and
    layout-element ``type`` values and are used as the keys of
    ``TYPE_TO_TEXT_ELEMENT_MAP``.

    The values are part of the serialized output, so they must not be
    changed; only the attribute names are an internal convenience.
    """

    # Text-like types.
    TITLE = "Title"
    TEXT = "Text"
    UNCATEGORIZED_TEXT = "UncategorizedText"
    NARRATIVE_TEXT = "NarrativeText"
    BULLETED_TEXT = "BulletedText"
    ABSTRACT = "Abstract"
    THREADING = "Threading"
    FORM = "Form"
    FIELD_NAME = "Field-Name"
    VALUE = "Value"
    LINK = "Link"
    COMPOSITE_ELEMENT = "CompositeElement"

    # Images, figures and their captions.
    IMAGE = "Image"
    PICTURE = "Picture"
    FIGURE_CAPTION = "FigureCaption"
    FIGURE = "Figure"
    CAPTION = "Caption"

    # Lists.  LIST_ITEM_OTHER is an alternate, hyphenated spelling of the
    # list-item label; both spellings are kept as distinct values.
    LIST = "List"
    LIST_ITEM = "ListItem"
    LIST_ITEM_OTHER = "List-item"

    # Checkbox states.
    CHECKED = "Checked"
    UNCHECKED = "Unchecked"

    # Addresses.
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"

    # Structural / special content.
    PAGE_BREAK = "PageBreak"
    FORMULA = "Formula"
    TABLE = "Table"

    # Headers and headlines.
    HEADER = "Header"
    HEADLINE = "Headline"
    SUB_HEADLINE = "Subheadline"
    PAGE_HEADER = "Page-header"  # Title?
    SECTION_HEADER = "Section-header"

    # Footers and footnotes.
    FOOTER = "Footer"
    FOOTNOTE = "Footnote"
    PAGE_FOOTER = "Page-footer"
|
||||
|
||||
|
||||
# Maps an element-type string (see ``ElementType``) to the element class used
# to represent it.  Several type strings intentionally share one class.
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
    ElementType.UNCATEGORIZED_TEXT: Text,
    ElementType.FIGURE_CAPTION: FigureCaption,
    ElementType.FIGURE: FigureCaption,
    ElementType.TEXT: NarrativeText,
    ElementType.NARRATIVE_TEXT: NarrativeText,
    ElementType.LIST_ITEM: ListItem,
    ElementType.BULLETED_TEXT: ListItem,
    ElementType.TITLE: Title,
    ElementType.ADDRESS: Address,
    ElementType.EMAIL_ADDRESS: EmailAddress,
    ElementType.IMAGE: Image,
    ElementType.PAGE_BREAK: PageBreak,
    ElementType.TABLE: Table,
    ElementType.HEADER: Header,
    ElementType.FOOTER: Footer,
    ElementType.CAPTION: FigureCaption,
    ElementType.FOOTNOTE: Footer,
    ElementType.FORMULA: Formula,
    ElementType.LIST_ITEM_OTHER: ListItem,
    ElementType.PAGE_FOOTER: Footer,
    ElementType.PAGE_HEADER: Header,  # Title?
    ElementType.PICTURE: Image,
    # this mapping ensures yolox produces backward compatible categories
    ElementType.SECTION_HEADER: Title,
    ElementType.HEADLINE: Title,
    ElementType.SUB_HEADLINE: Title,
    ElementType.ABSTRACT: NarrativeText,
    ElementType.THREADING: NarrativeText,
    ElementType.FORM: NarrativeText,
    ElementType.FIELD_NAME: Title,
    ElementType.VALUE: NarrativeText,
    ElementType.LINK: NarrativeText,
    ElementType.COMPOSITE_ELEMENT: Text,
}
|
||||
|
||||
@ -28,6 +28,7 @@ from unstructured.documents.elements import (
|
||||
CoordinatesMetadata,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
ListItem,
|
||||
PageBreak,
|
||||
Text,
|
||||
@ -136,7 +137,7 @@ def normalize_layout_element(
|
||||
class_prob_metadata = ElementMetadata(detection_class_prob=float(prob)) # type: ignore
|
||||
else:
|
||||
class_prob_metadata = ElementMetadata()
|
||||
if element_type == "List":
|
||||
if element_type == ElementType.LIST:
|
||||
if infer_list_items:
|
||||
return layout_list_to_list_items(
|
||||
text,
|
||||
@ -163,12 +164,12 @@ def normalize_layout_element(
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
)
|
||||
if element_type == "Headline":
|
||||
if element_type == ElementType.HEADLINE:
|
||||
_element_class.metadata.category_depth = 1
|
||||
elif element_type == "Subheadline":
|
||||
elif element_type == ElementType.SUB_HEADLINE:
|
||||
_element_class.metadata.category_depth = 2
|
||||
return _element_class
|
||||
elif element_type == "Checked":
|
||||
elif element_type == ElementType.CHECKED:
|
||||
return CheckBox(
|
||||
checked=True,
|
||||
coordinates=coordinates,
|
||||
@ -176,7 +177,7 @@ def normalize_layout_element(
|
||||
metadata=class_prob_metadata,
|
||||
detection_origin=origin,
|
||||
)
|
||||
elif element_type == "Unchecked":
|
||||
elif element_type == ElementType.UNCHECKED:
|
||||
return CheckBox(
|
||||
checked=False,
|
||||
coordinates=coordinates,
|
||||
|
||||
@ -22,6 +22,7 @@ from unstructured_inference.inference.layoutelement import (
|
||||
from unstructured_inference.models.tables import UnstructuredTableTransformerModel
|
||||
from unstructured_pytesseract import Output
|
||||
|
||||
from unstructured.documents.elements import ElementType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.utils.config import env_config
|
||||
from unstructured.partition.utils.constants import (
|
||||
@ -256,7 +257,7 @@ def supplement_element_with_table_extraction(
|
||||
the table's text content is rendered into an html string.
|
||||
"""
|
||||
for element in elements:
|
||||
if element.type == "Table":
|
||||
if element.type == ElementType.TABLE:
|
||||
padding = env_config.TABLE_IMAGE_CROP_PAD
|
||||
padded_element = pad_element_bboxes(element, padding=padding)
|
||||
cropped_image = image.crop(
|
||||
@ -368,7 +369,9 @@ def get_layout_elements_from_ocr(
|
||||
# and merging steps are not necessary.
|
||||
|
||||
layout_elements = [
|
||||
LayoutElement(bbox=r.bbox, text=r.text, source=r.source, type="UncategorizedText")
|
||||
LayoutElement(
|
||||
bbox=r.bbox, text=r.text, source=r.source, type=ElementType.UNCATEGORIZED_TEXT
|
||||
)
|
||||
for r in ocr_regions
|
||||
]
|
||||
else:
|
||||
@ -762,7 +765,9 @@ def get_elements_from_ocr_regions(
|
||||
|
||||
merged_regions = [merge_text_regions(group) for group in grouped_regions]
|
||||
return [
|
||||
LayoutElement(text=r.text, source=r.source, type="UncategorizedText", bbox=r.bbox)
|
||||
LayoutElement(
|
||||
text=r.text, source=r.source, type=ElementType.UNCATEGORIZED_TEXT, bbox=r.bbox
|
||||
)
|
||||
for r in merged_regions
|
||||
]
|
||||
|
||||
|
||||
@ -45,6 +45,7 @@ from unstructured.documents.elements import (
|
||||
CoordinatesMetadata,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
Image,
|
||||
Link,
|
||||
ListItem,
|
||||
@ -323,7 +324,7 @@ def partition_pdf_or_image(
|
||||
)
|
||||
layout_elements = []
|
||||
for el in _layout_elements:
|
||||
if hasattr(el, "category") and el.category == "UncategorizedText":
|
||||
if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
|
||||
new_el = element_from_text(cast(Text, el).text)
|
||||
new_el.metadata = el.metadata
|
||||
else:
|
||||
@ -348,7 +349,7 @@ def partition_pdf_or_image(
|
||||
|
||||
layout_elements = []
|
||||
for el in _layout_elements:
|
||||
if hasattr(el, "category") and el.category == "UncategorizedText":
|
||||
if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
|
||||
new_el = element_from_text(cast(Text, el).text)
|
||||
new_el.metadata = el.metadata
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user