mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 11:34:07 +00:00
parent
5ba3b9c2c6
commit
d623d75d3c
@ -1,4 +1,4 @@
|
||||
## 0.11.0-dev5
|
||||
## 0.11.0-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix `TYPE_TO_TEXT_ELEMENT_MAP`** Updated `Figure` mapping from `FigureCaption` to `Image`.
|
||||
* **Handle errors when extracting PDF text** Certain PDFs throw unexpected errors when opened by `pdfminer`, causing `partition_pdf()` to fail. We expect to be able to partition smoothly using an alternative strategy if text extraction doesn't work. Added exception handling for unexpected errors when extracting PDF text and to help determine the PDF strategy.
|
||||
* **Fix `fast` strategy falling back to `ocr_only`** The `fast` strategy should not fall back to a more expensive strategy.
|
||||
* **Remove default user `.ssh` folder** The default notebook user would create the `known_hosts` file with incorrect ownership during image build. This is legacy behavior and is no longer needed, so it was removed.
|
||||
|
||||
@ -20,6 +20,9 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.documents.elements import (
|
||||
Image as ImageElement,
|
||||
)
|
||||
from unstructured.partition import common
|
||||
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
|
||||
|
||||
@ -85,7 +88,7 @@ def test_normalize_layout_element_dict():
|
||||
def test_normalize_layout_element_dict_caption():
|
||||
layout_element = {
|
||||
"type": "Figure",
|
||||
"coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
|
||||
"coordinates": ((1, 2), (3, 4), (5, 6), (7, 8)),
|
||||
"text": "Some lovely text",
|
||||
}
|
||||
coordinate_system = PixelSpace(width=10, height=20)
|
||||
@ -93,9 +96,9 @@ def test_normalize_layout_element_dict_caption():
|
||||
layout_element,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
assert element == FigureCaption(
|
||||
assert element == ImageElement(
|
||||
text="Some lovely text",
|
||||
coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
|
||||
coordinates=((1, 2), (3, 4), (5, 6), (7, 8)),
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.0-dev5" # pragma: no cover
|
||||
__version__ = "0.11.0-dev6" # pragma: no cover
|
||||
|
||||
@ -836,37 +836,37 @@ class ElementType:
|
||||
|
||||
|
||||
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
|
||||
ElementType.UNCATEGORIZED_TEXT: Text,
|
||||
ElementType.FIGURE_CAPTION: FigureCaption,
|
||||
ElementType.FIGURE: FigureCaption,
|
||||
ElementType.TEXT: NarrativeText,
|
||||
ElementType.NARRATIVE_TEXT: NarrativeText,
|
||||
ElementType.LIST_ITEM: ListItem,
|
||||
ElementType.BULLETED_TEXT: ListItem,
|
||||
ElementType.TITLE: Title,
|
||||
ElementType.ADDRESS: Address,
|
||||
ElementType.EMAIL_ADDRESS: EmailAddress,
|
||||
ElementType.IMAGE: Image,
|
||||
ElementType.PAGE_BREAK: PageBreak,
|
||||
ElementType.TABLE: Table,
|
||||
ElementType.HEADER: Header,
|
||||
ElementType.FOOTER: Footer,
|
||||
ElementType.CAPTION: FigureCaption,
|
||||
ElementType.FOOTNOTE: Footer,
|
||||
ElementType.FORMULA: Formula,
|
||||
ElementType.LIST_ITEM_OTHER: ListItem,
|
||||
ElementType.PAGE_FOOTER: Footer,
|
||||
ElementType.PAGE_HEADER: Header, # Title?
|
||||
ElementType.PICTURE: Image,
|
||||
# this mapping ensures yolox produces backward-compatible categories
|
||||
ElementType.SECTION_HEADER: Title,
|
||||
ElementType.HEADLINE: Title,
|
||||
ElementType.SUB_HEADLINE: Title,
|
||||
ElementType.FIELD_NAME: Title,
|
||||
ElementType.UNCATEGORIZED_TEXT: Text,
|
||||
ElementType.COMPOSITE_ELEMENT: Text,
|
||||
ElementType.TEXT: NarrativeText,
|
||||
ElementType.NARRATIVE_TEXT: NarrativeText,
|
||||
# this mapping ensures yolox produces backward-compatible categories
|
||||
ElementType.ABSTRACT: NarrativeText,
|
||||
ElementType.THREADING: NarrativeText,
|
||||
ElementType.FORM: NarrativeText,
|
||||
ElementType.FIELD_NAME: Title,
|
||||
ElementType.VALUE: NarrativeText,
|
||||
ElementType.LINK: NarrativeText,
|
||||
ElementType.COMPOSITE_ELEMENT: Text,
|
||||
ElementType.LIST_ITEM: ListItem,
|
||||
ElementType.BULLETED_TEXT: ListItem,
|
||||
ElementType.LIST_ITEM_OTHER: ListItem,
|
||||
ElementType.HEADER: Header,
|
||||
ElementType.PAGE_HEADER: Header, # Title?
|
||||
ElementType.FOOTER: Footer,
|
||||
ElementType.PAGE_FOOTER: Footer,
|
||||
ElementType.FOOTNOTE: Footer,
|
||||
ElementType.FIGURE_CAPTION: FigureCaption,
|
||||
ElementType.CAPTION: FigureCaption,
|
||||
ElementType.IMAGE: Image,
|
||||
ElementType.FIGURE: Image,
|
||||
ElementType.PICTURE: Image,
|
||||
ElementType.TABLE: Table,
|
||||
ElementType.ADDRESS: Address,
|
||||
ElementType.EMAIL_ADDRESS: EmailAddress,
|
||||
ElementType.FORMULA: Formula,
|
||||
ElementType.PAGE_BREAK: PageBreak,
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user