mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-02 21:33:08 +00:00
rfctr: prepare for fix to raises on file-like-object with name not a path to a file (#2617)
**Summary** Improve typing and other mechanical refactoring in preparation for fix to issue 2308.
This commit is contained in:
parent
79552ff70b
commit
b59e4b69ce
@ -1,4 +1,4 @@
|
||||
## 0.12.6-dev5
|
||||
## 0.12.6-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
@ -5,7 +5,8 @@ import pytest
|
||||
from PIL import Image
|
||||
from unstructured_inference.inference import layout
|
||||
from unstructured_inference.inference.elements import TextRegion
|
||||
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
@ -28,7 +29,7 @@ from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DO
|
||||
|
||||
|
||||
class MockPageLayout(layout.PageLayout):
|
||||
def __init__(self, number: int, image: Image):
|
||||
def __init__(self, number: int, image: Image.Image):
|
||||
self.number = number
|
||||
self.image = image
|
||||
|
||||
|
@ -1 +1,11 @@
|
||||
class Table: ...
|
||||
# Minimal typing stub for a table object exposing `rows -> cells -> text`.
# NOTE(review): presumably stands in for `pptx.table.Table` -- confirm against the stub's path.
class Table:
    @property
    def rows(self) -> tuple[_Row, ...]: ...  # `tuple[X, ...]` = any length; `tuple[X]` = exactly 1


class _Row:
    @property
    def cells(self) -> tuple[_Cell, ...]: ...  # variable-length, same reasoning as `Table.rows`


class _Cell:
    @property
    def text(self) -> str: ...
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.12.6-dev5" # pragma: no cover
|
||||
__version__ = "0.12.6-dev6" # pragma: no cover
|
||||
|
@ -899,7 +899,7 @@ class Footer(Text):
|
||||
category = "Footer"
|
||||
|
||||
|
||||
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
|
||||
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, type[Text]] = {
|
||||
ElementType.TITLE: Title,
|
||||
ElementType.SECTION_HEADER: Title,
|
||||
ElementType.HEADLINE: Title,
|
||||
|
@ -15,9 +15,9 @@ from unstructured.documents.elements import Element
|
||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common import (
|
||||
_add_element_metadata,
|
||||
_remove_element_metadata,
|
||||
add_element_metadata,
|
||||
exactly_one,
|
||||
remove_element_metadata,
|
||||
set_element_hierarchy,
|
||||
)
|
||||
|
||||
@ -602,16 +602,11 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element
|
||||
# NOTE(robinson) - Attached files have already run through this logic
|
||||
# in their own partitioning function
|
||||
if element.metadata.attached_to_filename is None:
|
||||
_add_element_metadata(
|
||||
element,
|
||||
**metadata_kwargs, # type: ignore
|
||||
)
|
||||
add_element_metadata(element, **metadata_kwargs)
|
||||
|
||||
return elements
|
||||
else:
|
||||
return _remove_element_metadata(
|
||||
elements,
|
||||
)
|
||||
return remove_element_metadata(elements)
|
||||
|
||||
return wrapper
|
||||
|
||||
@ -639,16 +634,11 @@ def add_filetype(
|
||||
# NOTE(robinson) - Attached files have already run through this logic
|
||||
# in their own partitioning function
|
||||
if element.metadata.attached_to_filename is None:
|
||||
_add_element_metadata(
|
||||
element,
|
||||
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
||||
)
|
||||
add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype])
|
||||
|
||||
return elements
|
||||
else:
|
||||
return _remove_element_metadata(
|
||||
elements,
|
||||
)
|
||||
return remove_element_metadata(elements)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
@ -6,17 +6,7 @@ import subprocess
|
||||
from datetime import datetime
|
||||
from io import BufferedReader, BytesIO, TextIOWrapper
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import (
|
||||
IO,
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
BinaryIO,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
from typing import IO, TYPE_CHECKING, Any, BinaryIO, List, Optional
|
||||
|
||||
import emoji
|
||||
from tabulate import tabulate
|
||||
@ -39,11 +29,8 @@ from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
|
||||
from unstructured.utils import dependency_exists, first
|
||||
|
||||
if dependency_exists("docx") and dependency_exists("docx.table"):
|
||||
from docx.table import Table as docxtable
|
||||
|
||||
if dependency_exists("pptx") and dependency_exists("pptx.table"):
|
||||
from pptx.table import Table as pptxtable
|
||||
from pptx.table import Table as PptxTable
|
||||
|
||||
if dependency_exists("numpy") and dependency_exists("cv2"):
|
||||
from unstructured.partition.utils.sorting import sort_page_elements
|
||||
@ -80,14 +67,20 @@ HIERARCHY_RULE_SET = {
|
||||
}
|
||||
|
||||
|
||||
def get_last_modified_date(filename: str) -> Union[str, None]:
|
||||
def get_last_modified_date(filename: str) -> Optional[str]:
    """Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem.

    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
    "2024-03-05T17:02:53".
    """
    # -- Honor the documented contract: without this guard `os.path.getmtime()` raises
    # -- `FileNotFoundError` for a path that does not exist instead of yielding `None`.
    if not os.path.isfile(filename):
        return None

    modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
    # -- `modify_date` is a naive datetime (no tzinfo), so `%z` renders as an empty string --
    return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
|
||||
|
||||
|
||||
def get_last_modified_date_from_file(
|
||||
file: Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes],
|
||||
) -> Union[str, None]:
|
||||
def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
|
||||
"""Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
|
||||
filename = None
|
||||
if hasattr(file, "name"):
|
||||
filename = file.name
|
||||
@ -100,15 +93,11 @@ def get_last_modified_date_from_file(
|
||||
|
||||
|
||||
def normalize_layout_element(
|
||||
layout_element: Union[
|
||||
"LayoutElement",
|
||||
Element,
|
||||
Dict[str, Any],
|
||||
],
|
||||
layout_element: LayoutElement | Element | dict[str, Any],
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
infer_list_items: bool = True,
|
||||
source_format: Optional[str] = "html",
|
||||
) -> Union[Element, List[Element]]:
|
||||
) -> Element | list[Element]:
|
||||
"""Converts an unstructured_inference LayoutElement object to an unstructured Element."""
|
||||
|
||||
if isinstance(layout_element, Element) and source_format == "html":
|
||||
@ -123,7 +112,7 @@ def normalize_layout_element(
|
||||
else:
|
||||
layout_dict = layout_element
|
||||
|
||||
text = layout_dict.get("text")
|
||||
text = layout_dict.get("text", "")
|
||||
# Both `coordinates` and `coordinate_system` must be present
|
||||
# in order to add coordinates metadata to the element.
|
||||
coordinates = layout_dict.get("coordinates")
|
||||
@ -148,7 +137,7 @@ def normalize_layout_element(
|
||||
)
|
||||
else:
|
||||
return ListItem(
|
||||
text=text if text else "",
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
@ -156,6 +145,7 @@ def normalize_layout_element(
|
||||
)
|
||||
|
||||
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
|
||||
assert isinstance(element_type, str)
|
||||
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
|
||||
_element_class = _element_class(
|
||||
text=text,
|
||||
@ -187,7 +177,7 @@ def normalize_layout_element(
|
||||
)
|
||||
else:
|
||||
return Text(
|
||||
text=text if text else "",
|
||||
text=text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
metadata=class_prob_metadata,
|
||||
@ -197,10 +187,10 @@ def normalize_layout_element(
|
||||
|
||||
def layout_list_to_list_items(
|
||||
text: Optional[str],
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]],
|
||||
coordinates: Optional[tuple[tuple[float, float], ...]],
|
||||
coordinate_system: Optional[CoordinateSystem],
|
||||
metadata=Optional[ElementMetadata],
|
||||
detection_origin=Optional[str],
|
||||
metadata: Optional[ElementMetadata],
|
||||
detection_origin: Optional[str],
|
||||
) -> List[Element]:
|
||||
"""Converts a list LayoutElement to a list of ListItem elements."""
|
||||
split_items = ENUMERATED_BULLETS_RE.split(text) if text else []
|
||||
@ -226,9 +216,8 @@ def layout_list_to_list_items(
|
||||
|
||||
|
||||
def set_element_hierarchy(
|
||||
elements: List[Element],
|
||||
ruleset: Dict[str, List[str]] = HIERARCHY_RULE_SET,
|
||||
) -> List[Element]:
|
||||
elements: List[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
|
||||
) -> list[Element]:
|
||||
"""Sets the parent_id for each element in the list of elements
|
||||
based on the element's category, depth and a ruleset
|
||||
|
||||
@ -274,23 +263,25 @@ def set_element_hierarchy(
|
||||
return elements
|
||||
|
||||
|
||||
def _add_element_metadata(
|
||||
def add_element_metadata(
|
||||
element: Element,
|
||||
filename: Optional[str] = None,
|
||||
filetype: Optional[str] = None,
|
||||
page_number: Optional[int] = None,
|
||||
url: Optional[str] = None,
|
||||
text_as_html: Optional[str] = None,
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinates: Optional[tuple[tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
section: Optional[str] = None,
|
||||
image_path: Optional[str] = None,
|
||||
detection_origin: Optional[str] = None,
|
||||
languages: Optional[List[str]] = None,
|
||||
**kwargs,
|
||||
**kwargs: Any,
|
||||
) -> Element:
|
||||
"""Adds document metadata to the document element. Document metadata includes information
|
||||
like the filename, source url, and page number."""
|
||||
"""Adds document metadata to the document element.
|
||||
|
||||
Document metadata includes information like the filename, source url, and page number.
|
||||
"""
|
||||
|
||||
coordinates_metadata = (
|
||||
CoordinatesMetadata(
|
||||
@ -342,12 +333,11 @@ def _add_element_metadata(
|
||||
return element
|
||||
|
||||
|
||||
def _remove_element_metadata(
|
||||
layout_elements,
|
||||
) -> List[Element]:
|
||||
"""Removes document metadata from the document element. Document metadata includes information
|
||||
like the filename, source url, and page number."""
|
||||
# Init an empty list of elements to write to
|
||||
def remove_element_metadata(layout_elements) -> list[Element]:
|
||||
"""Removes document metadata from the document element.
|
||||
|
||||
Document metadata includes information like the filename, source url, and page number.
|
||||
"""
|
||||
elements: List[Element] = []
|
||||
metadata = ElementMetadata()
|
||||
for layout_element in layout_elements:
|
||||
@ -442,8 +432,8 @@ def exactly_one(**kwargs: Any) -> None:
|
||||
|
||||
|
||||
def spooled_to_bytes_io_if_needed(
|
||||
file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]],
|
||||
) -> Optional[Union[bytes, BinaryIO]]:
|
||||
file_obj: bytes | BinaryIO | SpooledTemporaryFile[bytes] | None,
|
||||
) -> bytes | BinaryIO | None:
|
||||
if isinstance(file_obj, SpooledTemporaryFile):
|
||||
file_obj.seek(0)
|
||||
contents = file_obj.read()
|
||||
@ -453,35 +443,35 @@ def spooled_to_bytes_io_if_needed(
|
||||
return file_obj
|
||||
|
||||
|
||||
def convert_to_bytes(
|
||||
file: Optional[Union[bytes, SpooledTemporaryFile, IO[bytes]]] = None,
|
||||
) -> bytes:
|
||||
def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
|
||||
"""Extract the bytes from `file` without preventing it from being read again later.
|
||||
|
||||
As a convenience to simplify client code, also returns `file` unchanged if it is already bytes.
|
||||
"""
|
||||
if isinstance(file, bytes):
|
||||
f_bytes = file
|
||||
elif isinstance(file, SpooledTemporaryFile):
|
||||
return file
|
||||
|
||||
if isinstance(file, SpooledTemporaryFile):
|
||||
file.seek(0)
|
||||
f_bytes = file.read()
|
||||
file.seek(0)
|
||||
elif isinstance(file, BytesIO):
|
||||
f_bytes = file.getvalue()
|
||||
elif isinstance(file, (TextIOWrapper, BufferedReader)):
|
||||
return f_bytes
|
||||
|
||||
if isinstance(file, BytesIO):
|
||||
return file.getvalue()
|
||||
|
||||
if isinstance(file, (TextIOWrapper, BufferedReader)):
|
||||
with open(file.name, "rb") as f:
|
||||
f_bytes = f.read()
|
||||
else:
|
||||
raise ValueError("Invalid file-like object type")
|
||||
return f.read()
|
||||
|
||||
return f_bytes
|
||||
raise ValueError("Invalid file-like object type")
|
||||
|
||||
|
||||
def convert_ms_office_table_to_text(
|
||||
table: Union["docxtable", "pptxtable"],
|
||||
as_html: bool = True,
|
||||
) -> str:
|
||||
"""
|
||||
Convert a table object from a Word document to an HTML table string using the tabulate library.
|
||||
def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str:
|
||||
"""Convert a PPTX table object to an HTML table string using the tabulate library.
|
||||
|
||||
Args:
|
||||
table (Table): A docx.table.Table object.
|
||||
table (Table): A pptx.table.Table object.
|
||||
as_html (bool): Whether to return the table as an HTML string (True) or a
|
||||
plain text string (False)
|
||||
|
||||
@ -513,9 +503,7 @@ def contains_emoji(s: str) -> bool:
|
||||
return bool(emoji.emoji_count(s))
|
||||
|
||||
|
||||
def _get_page_image_metadata(
|
||||
page: PageLayout,
|
||||
) -> dict:
|
||||
def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
|
||||
"""Retrieve image metadata and coordinate system from a page."""
|
||||
|
||||
image = getattr(page, "image", None)
|
||||
@ -551,7 +539,7 @@ def document_to_element_list(
|
||||
detection_origin: Optional[str] = None,
|
||||
sort_mode: str = SORT_MODE_XY_CUT,
|
||||
languages: Optional[List[str]] = None,
|
||||
**kwargs,
|
||||
**kwargs: Any,
|
||||
) -> List[Element]:
|
||||
"""Converts a DocumentLayout object to a list of unstructured elements."""
|
||||
elements: List[Element] = []
|
||||
@ -565,7 +553,7 @@ def document_to_element_list(
|
||||
image_width = page_image_metadata.get("width")
|
||||
image_height = page_image_metadata.get("height")
|
||||
|
||||
translation_mapping: List[Tuple["LayoutElement", Element]] = []
|
||||
translation_mapping: list[tuple["LayoutElement", Element]] = []
|
||||
for layout_element in page.elements:
|
||||
if image_width and image_height and hasattr(layout_element.bbox, "coordinates"):
|
||||
coordinate_system = PixelSpace(width=image_width, height=image_height)
|
||||
@ -610,7 +598,7 @@ def document_to_element_list(
|
||||
layout_element.image_path if hasattr(layout_element, "image_path") else None
|
||||
)
|
||||
|
||||
_add_element_metadata(
|
||||
add_element_metadata(
|
||||
element,
|
||||
page_number=i + 1,
|
||||
filetype=image_format,
|
||||
@ -642,16 +630,16 @@ def document_to_element_list(
|
||||
|
||||
def ocr_data_to_elements(
|
||||
ocr_data: List["LayoutElement"],
|
||||
image_size: Tuple[Union[int, float], Union[int, float]],
|
||||
image_size: tuple[int | float, int | float],
|
||||
common_metadata: Optional[ElementMetadata] = None,
|
||||
infer_list_items: bool = True,
|
||||
source_format: Optional[str] = None,
|
||||
) -> List[Element]:
|
||||
) -> list[Element]:
|
||||
"""Convert OCR layout data into `unstructured` elements with associated metadata."""
|
||||
|
||||
image_width, image_height = image_size
|
||||
coordinate_system = PixelSpace(width=image_width, height=image_height)
|
||||
elements = []
|
||||
elements: list[Element] = []
|
||||
for layout_element in ocr_data:
|
||||
element = normalize_layout_element(
|
||||
layout_element,
|
||||
|
@ -135,7 +135,7 @@ def default_hi_res_model() -> str:
|
||||
@add_chunking_strategy
|
||||
def partition_pdf(
|
||||
filename: str = "",
|
||||
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None,
|
||||
file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None,
|
||||
include_page_breaks: bool = False,
|
||||
strategy: str = PartitionStrategy.AUTO,
|
||||
infer_table_structure: bool = False,
|
||||
@ -151,7 +151,7 @@ def partition_pdf(
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
**kwargs,
|
||||
**kwargs: Any,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
Parameters
|
||||
|
@ -263,7 +263,7 @@ def validate_date_args(date: Optional[str] = None) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any]]:
|
||||
def _first_and_remaining_iterator(it: Iterable[_T]) -> Tuple[_T, Iterator[_T]]:
|
||||
iterator = iter(it)
|
||||
try:
|
||||
out = next(iterator)
|
||||
@ -275,7 +275,7 @@ def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any]
|
||||
return out, iterator
|
||||
|
||||
|
||||
def first(it: Iterable[Any]) -> Any:
|
||||
def first(it: Iterable[_T]) -> _T:
    """Return the leading item of `it`.

    Delegates to `_first_and_remaining_iterator()`; the error it raises for an empty iterable
    propagates unchanged to the caller.
    """
    head, _rest = _first_and_remaining_iterator(it)
    return head
|
||||
|
Loading…
x
Reference in New Issue
Block a user