diff --git a/CHANGELOG.md b/CHANGELOG.md index e4e695425..97832602b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.11.0-dev5 +## 0.11.0-dev6 ### Enhancements @@ -13,6 +13,7 @@ ### Fixes +* **Fix `TYPE_TO_TEXT_ELEMENT_MAP`** Updated `Figure` mapping from `FigureCaption` to `Image`. * **Handle errors when extracting PDF text** Certain pdfs throw unexpected errors when being opened by `pdfminer`, causing `partition_pdf()` to fail. We expect to be able to partition smoothly using an alternative strategy if text extraction doesn't work. Added exception handling to handle unexpected errors when extracting pdf text and to help determine pdf strategy. * **Fix `fast` strategy fall back to `ocr_only`** The `fast` strategy should not fall back to a more expensive strategy. * **Remove default user ./ssh folder** The default notebook user during image build would create the known_hosts file with incorrect ownership, this is legacy and no longer needed so it was removed. diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index d3a792aee..2be9a93fd 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -20,6 +20,9 @@ from unstructured.documents.elements import ( Text, Title, ) +from unstructured.documents.elements import ( + Image as ImageElement, +) from unstructured.partition import common from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT @@ -85,7 +88,7 @@ def test_normalize_layout_element_dict(): def test_normalize_layout_element_dict_caption(): layout_element = { "type": "Figure", - "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]], + "coordinates": ((1, 2), (3, 4), (5, 6), (7, 8)), "text": "Some lovely text", } coordinate_system = PixelSpace(width=10, height=20) @@ -93,9 +96,9 @@ def test_normalize_layout_element_dict_caption(): layout_element, coordinate_system=coordinate_system, ) - assert element == 
FigureCaption( + assert element == ImageElement( text="Some lovely text", - coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]], + coordinates=((1, 2), (3, 4), (5, 6), (7, 8)), coordinate_system=coordinate_system, ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 95c6a4977..ec06ce086 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.0-dev5" # pragma: no cover +__version__ = "0.11.0-dev6" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index d6230b080..fe1c2ce18 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -836,37 +836,37 @@ class ElementType: TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = { - ElementType.UNCATEGORIZED_TEXT: Text, - ElementType.FIGURE_CAPTION: FigureCaption, - ElementType.FIGURE: FigureCaption, - ElementType.TEXT: NarrativeText, - ElementType.NARRATIVE_TEXT: NarrativeText, - ElementType.LIST_ITEM: ListItem, - ElementType.BULLETED_TEXT: ListItem, ElementType.TITLE: Title, - ElementType.ADDRESS: Address, - ElementType.EMAIL_ADDRESS: EmailAddress, - ElementType.IMAGE: Image, - ElementType.PAGE_BREAK: PageBreak, - ElementType.TABLE: Table, - ElementType.HEADER: Header, - ElementType.FOOTER: Footer, - ElementType.CAPTION: FigureCaption, - ElementType.FOOTNOTE: Footer, - ElementType.FORMULA: Formula, - ElementType.LIST_ITEM_OTHER: ListItem, - ElementType.PAGE_FOOTER: Footer, - ElementType.PAGE_HEADER: Header, # Title? 
- ElementType.PICTURE: Image, - # this mapping favors ensures yolox produces backward compatible categories ElementType.SECTION_HEADER: Title, ElementType.HEADLINE: Title, ElementType.SUB_HEADLINE: Title, + ElementType.FIELD_NAME: Title, + ElementType.UNCATEGORIZED_TEXT: Text, + ElementType.COMPOSITE_ELEMENT: Text, + ElementType.TEXT: NarrativeText, + ElementType.NARRATIVE_TEXT: NarrativeText, + # this mapping ensures yolox produces backward compatible categories ElementType.ABSTRACT: NarrativeText, ElementType.THREADING: NarrativeText, ElementType.FORM: NarrativeText, - ElementType.FIELD_NAME: Title, ElementType.VALUE: NarrativeText, ElementType.LINK: NarrativeText, - ElementType.COMPOSITE_ELEMENT: Text, + ElementType.LIST_ITEM: ListItem, + ElementType.BULLETED_TEXT: ListItem, + ElementType.LIST_ITEM_OTHER: ListItem, + ElementType.HEADER: Header, + ElementType.PAGE_HEADER: Header, # Title? + ElementType.FOOTER: Footer, + ElementType.PAGE_FOOTER: Footer, + ElementType.FOOTNOTE: Footer, + ElementType.FIGURE_CAPTION: FigureCaption, + ElementType.CAPTION: FigureCaption, + ElementType.IMAGE: Image, + ElementType.FIGURE: Image, + ElementType.PICTURE: Image, + ElementType.TABLE: Table, + ElementType.ADDRESS: Address, + ElementType.EMAIL_ADDRESS: EmailAddress, + ElementType.FORMULA: Formula, + ElementType.PAGE_BREAK: PageBreak, }