Fix: incorrect figure mapping (#2111)

Closes #2098.
This commit is contained in:
Christine Straub 2023-11-17 16:11:11 -08:00 committed by GitHub
parent 5ba3b9c2c6
commit d623d75d3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 29 deletions

View File

@ -1,4 +1,4 @@
## 0.11.0-dev5
## 0.11.0-dev6
### Enhancements
@ -13,6 +13,7 @@
### Fixes
* **Fix `TYPE_TO_TEXT_ELEMENT_MAP`** Updated `Figure` mapping from `FigureCaption` to `Image`.
* **Handle errors when extracting PDF text** Certain PDFs throw unexpected errors when being opened by `pdfminer`, causing `partition_pdf()` to fail. We expect to be able to partition smoothly using an alternative strategy if text extraction doesn't work. Added exception handling to handle unexpected errors when extracting PDF text and to help determine the PDF strategy.
* **Fix `fast` strategy falling back to `ocr_only`** The `fast` strategy should not fall back to a more expensive strategy.
* **Remove default user `.ssh` folder** The default notebook user during image build would create the `known_hosts` file with incorrect ownership. This is legacy behavior and no longer needed, so it was removed.

View File

@ -20,6 +20,9 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.documents.elements import (
Image as ImageElement,
)
from unstructured.partition import common
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
@ -85,7 +88,7 @@ def test_normalize_layout_element_dict():
def test_normalize_layout_element_dict_caption():
layout_element = {
"type": "Figure",
"coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
"coordinates": ((1, 2), (3, 4), (5, 6), (7, 8)),
"text": "Some lovely text",
}
coordinate_system = PixelSpace(width=10, height=20)
@ -93,9 +96,9 @@ def test_normalize_layout_element_dict_caption():
layout_element,
coordinate_system=coordinate_system,
)
assert element == FigureCaption(
assert element == ImageElement(
text="Some lovely text",
coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
coordinates=((1, 2), (3, 4), (5, 6), (7, 8)),
coordinate_system=coordinate_system,
)

View File

@ -1 +1 @@
__version__ = "0.11.0-dev5" # pragma: no cover
__version__ = "0.11.0-dev6" # pragma: no cover

View File

@ -836,37 +836,37 @@ class ElementType:
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
ElementType.UNCATEGORIZED_TEXT: Text,
ElementType.FIGURE_CAPTION: FigureCaption,
ElementType.FIGURE: FigureCaption,
ElementType.TEXT: NarrativeText,
ElementType.NARRATIVE_TEXT: NarrativeText,
ElementType.LIST_ITEM: ListItem,
ElementType.BULLETED_TEXT: ListItem,
ElementType.TITLE: Title,
ElementType.ADDRESS: Address,
ElementType.EMAIL_ADDRESS: EmailAddress,
ElementType.IMAGE: Image,
ElementType.PAGE_BREAK: PageBreak,
ElementType.TABLE: Table,
ElementType.HEADER: Header,
ElementType.FOOTER: Footer,
ElementType.CAPTION: FigureCaption,
ElementType.FOOTNOTE: Footer,
ElementType.FORMULA: Formula,
ElementType.LIST_ITEM_OTHER: ListItem,
ElementType.PAGE_FOOTER: Footer,
ElementType.PAGE_HEADER: Header, # Title?
ElementType.PICTURE: Image,
# this mapping ensures yolox produces backward-compatible categories
ElementType.SECTION_HEADER: Title,
ElementType.HEADLINE: Title,
ElementType.SUB_HEADLINE: Title,
ElementType.FIELD_NAME: Title,
ElementType.UNCATEGORIZED_TEXT: Text,
ElementType.COMPOSITE_ELEMENT: Text,
ElementType.TEXT: NarrativeText,
ElementType.NARRATIVE_TEXT: NarrativeText,
# this mapping ensures yolox produces backward-compatible categories
ElementType.ABSTRACT: NarrativeText,
ElementType.THREADING: NarrativeText,
ElementType.FORM: NarrativeText,
ElementType.FIELD_NAME: Title,
ElementType.VALUE: NarrativeText,
ElementType.LINK: NarrativeText,
ElementType.COMPOSITE_ELEMENT: Text,
ElementType.LIST_ITEM: ListItem,
ElementType.BULLETED_TEXT: ListItem,
ElementType.LIST_ITEM_OTHER: ListItem,
ElementType.HEADER: Header,
ElementType.PAGE_HEADER: Header, # Title?
ElementType.FOOTER: Footer,
ElementType.PAGE_FOOTER: Footer,
ElementType.FOOTNOTE: Footer,
ElementType.FIGURE_CAPTION: FigureCaption,
ElementType.CAPTION: FigureCaption,
ElementType.IMAGE: Image,
ElementType.FIGURE: Image,
ElementType.PICTURE: Image,
ElementType.TABLE: Table,
ElementType.ADDRESS: Address,
ElementType.EMAIL_ADDRESS: EmailAddress,
ElementType.FORMULA: Formula,
ElementType.PAGE_BREAK: PageBreak,
}