rfctr: prepare for fix to error raised on a file-like object whose name is not a path to a file (#2617)

**Summary**
Improve typing and perform other mechanical refactoring in preparation
for the fix to issue 2308.
This commit is contained in:
Steve Canny 2024-03-06 15:46:54 -08:00 committed by GitHub
parent 79552ff70b
commit b59e4b69ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 90 additions and 101 deletions

View File

@ -1,4 +1,4 @@
## 0.12.6-dev5 ## 0.12.6-dev6
### Enhancements ### Enhancements

View File

@ -5,7 +5,8 @@ import pytest
from PIL import Image from PIL import Image
from unstructured_inference.inference import layout from unstructured_inference.inference import layout
from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.coordinates import PixelSpace from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ( from unstructured.documents.elements import (
@ -28,7 +29,7 @@ from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DO
class MockPageLayout(layout.PageLayout): class MockPageLayout(layout.PageLayout):
def __init__(self, number: int, image: Image): def __init__(self, number: int, image: Image.Image):
self.number = number self.number = number
self.image = image self.image = image

View File

@ -1 +1,11 @@
class Table: ... class Table:
@property
def rows(self) -> tuple[_Row]: ...
class _Row:
@property
def cells(self) -> tuple[_Cell]: ...
class _Cell:
@property
def text(self) -> str: ...

View File

@ -1 +1 @@
__version__ = "0.12.6-dev5" # pragma: no cover __version__ = "0.12.6-dev6" # pragma: no cover

View File

@ -899,7 +899,7 @@ class Footer(Text):
category = "Footer" category = "Footer"
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = { TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, type[Text]] = {
ElementType.TITLE: Title, ElementType.TITLE: Title,
ElementType.SECTION_HEADER: Title, ElementType.SECTION_HEADER: Title,
ElementType.HEADLINE: Title, ElementType.HEADLINE: Title,

View File

@ -15,9 +15,9 @@ from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import ( from unstructured.partition.common import (
_add_element_metadata, add_element_metadata,
_remove_element_metadata,
exactly_one, exactly_one,
remove_element_metadata,
set_element_hierarchy, set_element_hierarchy,
) )
@ -602,16 +602,11 @@ def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element
# NOTE(robinson) - Attached files have already run through this logic # NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function # in their own partitioning function
if element.metadata.attached_to_filename is None: if element.metadata.attached_to_filename is None:
_add_element_metadata( add_element_metadata(element, **metadata_kwargs)
element,
**metadata_kwargs, # type: ignore
)
return elements return elements
else: else:
return _remove_element_metadata( return remove_element_metadata(elements)
elements,
)
return wrapper return wrapper
@ -639,16 +634,11 @@ def add_filetype(
# NOTE(robinson) - Attached files have already run through this logic # NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function # in their own partitioning function
if element.metadata.attached_to_filename is None: if element.metadata.attached_to_filename is None:
_add_element_metadata( add_element_metadata(element, filetype=FILETYPE_TO_MIMETYPE[filetype])
element,
filetype=FILETYPE_TO_MIMETYPE[filetype],
)
return elements return elements
else: else:
return _remove_element_metadata( return remove_element_metadata(elements)
elements,
)
return wrapper return wrapper

View File

@ -6,17 +6,7 @@ import subprocess
from datetime import datetime from datetime import datetime
from io import BufferedReader, BytesIO, TextIOWrapper from io import BufferedReader, BytesIO, TextIOWrapper
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from typing import ( from typing import IO, TYPE_CHECKING, Any, BinaryIO, List, Optional
IO,
TYPE_CHECKING,
Any,
BinaryIO,
Dict,
List,
Optional,
Tuple,
Union,
)
import emoji import emoji
from tabulate import tabulate from tabulate import tabulate
@ -39,11 +29,8 @@ from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT
from unstructured.utils import dependency_exists, first from unstructured.utils import dependency_exists, first
if dependency_exists("docx") and dependency_exists("docx.table"):
from docx.table import Table as docxtable
if dependency_exists("pptx") and dependency_exists("pptx.table"): if dependency_exists("pptx") and dependency_exists("pptx.table"):
from pptx.table import Table as pptxtable from pptx.table import Table as PptxTable
if dependency_exists("numpy") and dependency_exists("cv2"): if dependency_exists("numpy") and dependency_exists("cv2"):
from unstructured.partition.utils.sorting import sort_page_elements from unstructured.partition.utils.sorting import sort_page_elements
@ -80,14 +67,20 @@ HIERARCHY_RULE_SET = {
} }
def get_last_modified_date(filename: str) -> Union[str, None]: def get_last_modified_date(filename: str) -> Optional[str]:
"""Modification time of file at path `filename`, if it exists.
Returns `None` when `filename` is not a path to a file on the local filesystem.
Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
"2024-03-05T17:02:53".
"""
modify_date = datetime.fromtimestamp(os.path.getmtime(filename)) modify_date = datetime.fromtimestamp(os.path.getmtime(filename))
return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z") return modify_date.strftime("%Y-%m-%dT%H:%M:%S%z")
def get_last_modified_date_from_file( def get_last_modified_date_from_file(file: IO[bytes] | bytes) -> Optional[str]:
file: Union[IO[bytes], SpooledTemporaryFile[bytes], BinaryIO, bytes], """Modified timestamp of `file` if it corresponds to a file on the local filesystem."""
) -> Union[str, None]:
filename = None filename = None
if hasattr(file, "name"): if hasattr(file, "name"):
filename = file.name filename = file.name
@ -100,15 +93,11 @@ def get_last_modified_date_from_file(
def normalize_layout_element( def normalize_layout_element(
layout_element: Union[ layout_element: LayoutElement | Element | dict[str, Any],
"LayoutElement",
Element,
Dict[str, Any],
],
coordinate_system: Optional[CoordinateSystem] = None, coordinate_system: Optional[CoordinateSystem] = None,
infer_list_items: bool = True, infer_list_items: bool = True,
source_format: Optional[str] = "html", source_format: Optional[str] = "html",
) -> Union[Element, List[Element]]: ) -> Element | list[Element]:
"""Converts an unstructured_inference LayoutElement object to an unstructured Element.""" """Converts an unstructured_inference LayoutElement object to an unstructured Element."""
if isinstance(layout_element, Element) and source_format == "html": if isinstance(layout_element, Element) and source_format == "html":
@ -123,7 +112,7 @@ def normalize_layout_element(
else: else:
layout_dict = layout_element layout_dict = layout_element
text = layout_dict.get("text") text = layout_dict.get("text", "")
# Both `coordinates` and `coordinate_system` must be present # Both `coordinates` and `coordinate_system` must be present
# in order to add coordinates metadata to the element. # in order to add coordinates metadata to the element.
coordinates = layout_dict.get("coordinates") coordinates = layout_dict.get("coordinates")
@ -148,7 +137,7 @@ def normalize_layout_element(
) )
else: else:
return ListItem( return ListItem(
text=text if text else "", text=text,
coordinates=coordinates, coordinates=coordinates,
coordinate_system=coordinate_system, coordinate_system=coordinate_system,
metadata=class_prob_metadata, metadata=class_prob_metadata,
@ -156,6 +145,7 @@ def normalize_layout_element(
) )
elif element_type in TYPE_TO_TEXT_ELEMENT_MAP: elif element_type in TYPE_TO_TEXT_ELEMENT_MAP:
assert isinstance(element_type, str)
_element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type] _element_class = TYPE_TO_TEXT_ELEMENT_MAP[element_type]
_element_class = _element_class( _element_class = _element_class(
text=text, text=text,
@ -187,7 +177,7 @@ def normalize_layout_element(
) )
else: else:
return Text( return Text(
text=text if text else "", text=text,
coordinates=coordinates, coordinates=coordinates,
coordinate_system=coordinate_system, coordinate_system=coordinate_system,
metadata=class_prob_metadata, metadata=class_prob_metadata,
@ -197,10 +187,10 @@ def normalize_layout_element(
def layout_list_to_list_items( def layout_list_to_list_items(
text: Optional[str], text: Optional[str],
coordinates: Optional[Tuple[Tuple[float, float], ...]], coordinates: Optional[tuple[tuple[float, float], ...]],
coordinate_system: Optional[CoordinateSystem], coordinate_system: Optional[CoordinateSystem],
metadata=Optional[ElementMetadata], metadata: Optional[ElementMetadata],
detection_origin=Optional[str], detection_origin: Optional[str],
) -> List[Element]: ) -> List[Element]:
"""Converts a list LayoutElement to a list of ListItem elements.""" """Converts a list LayoutElement to a list of ListItem elements."""
split_items = ENUMERATED_BULLETS_RE.split(text) if text else [] split_items = ENUMERATED_BULLETS_RE.split(text) if text else []
@ -226,9 +216,8 @@ def layout_list_to_list_items(
def set_element_hierarchy( def set_element_hierarchy(
elements: List[Element], elements: List[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
ruleset: Dict[str, List[str]] = HIERARCHY_RULE_SET, ) -> list[Element]:
) -> List[Element]:
"""Sets the parent_id for each element in the list of elements """Sets the parent_id for each element in the list of elements
based on the element's category, depth and a ruleset based on the element's category, depth and a ruleset
@ -274,23 +263,25 @@ def set_element_hierarchy(
return elements return elements
def _add_element_metadata( def add_element_metadata(
element: Element, element: Element,
filename: Optional[str] = None, filename: Optional[str] = None,
filetype: Optional[str] = None, filetype: Optional[str] = None,
page_number: Optional[int] = None, page_number: Optional[int] = None,
url: Optional[str] = None, url: Optional[str] = None,
text_as_html: Optional[str] = None, text_as_html: Optional[str] = None,
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None, coordinates: Optional[tuple[tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None, coordinate_system: Optional[CoordinateSystem] = None,
section: Optional[str] = None, section: Optional[str] = None,
image_path: Optional[str] = None, image_path: Optional[str] = None,
detection_origin: Optional[str] = None, detection_origin: Optional[str] = None,
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
**kwargs, **kwargs: Any,
) -> Element: ) -> Element:
"""Adds document metadata to the document element. Document metadata includes information """Adds document metadata to the document element.
like the filename, source url, and page number."""
Document metadata includes information like the filename, source url, and page number.
"""
coordinates_metadata = ( coordinates_metadata = (
CoordinatesMetadata( CoordinatesMetadata(
@ -342,12 +333,11 @@ def _add_element_metadata(
return element return element
def _remove_element_metadata( def remove_element_metadata(layout_elements) -> list[Element]:
layout_elements, """Removes document metadata from the document element.
) -> List[Element]:
"""Removes document metadata from the document element. Document metadata includes information Document metadata includes information like the filename, source url, and page number.
like the filename, source url, and page number.""" """
# Init an empty list of elements to write to
elements: List[Element] = [] elements: List[Element] = []
metadata = ElementMetadata() metadata = ElementMetadata()
for layout_element in layout_elements: for layout_element in layout_elements:
@ -442,8 +432,8 @@ def exactly_one(**kwargs: Any) -> None:
def spooled_to_bytes_io_if_needed( def spooled_to_bytes_io_if_needed(
file_obj: Optional[Union[bytes, BinaryIO, SpooledTemporaryFile[bytes]]], file_obj: bytes | BinaryIO | SpooledTemporaryFile[bytes] | None,
) -> Optional[Union[bytes, BinaryIO]]: ) -> bytes | BinaryIO | None:
if isinstance(file_obj, SpooledTemporaryFile): if isinstance(file_obj, SpooledTemporaryFile):
file_obj.seek(0) file_obj.seek(0)
contents = file_obj.read() contents = file_obj.read()
@ -453,35 +443,35 @@ def spooled_to_bytes_io_if_needed(
return file_obj return file_obj
def convert_to_bytes( def convert_to_bytes(file: bytes | IO[bytes]) -> bytes:
file: Optional[Union[bytes, SpooledTemporaryFile, IO[bytes]]] = None, """Extract the bytes from `file` without preventing it from being read again later.
) -> bytes:
As a convenience to simplify client code, also returns `file` unchanged if it is already bytes.
"""
if isinstance(file, bytes): if isinstance(file, bytes):
f_bytes = file return file
elif isinstance(file, SpooledTemporaryFile):
if isinstance(file, SpooledTemporaryFile):
file.seek(0) file.seek(0)
f_bytes = file.read() f_bytes = file.read()
file.seek(0) file.seek(0)
elif isinstance(file, BytesIO): return f_bytes
f_bytes = file.getvalue()
elif isinstance(file, (TextIOWrapper, BufferedReader)): if isinstance(file, BytesIO):
return file.getvalue()
if isinstance(file, (TextIOWrapper, BufferedReader)):
with open(file.name, "rb") as f: with open(file.name, "rb") as f:
f_bytes = f.read() return f.read()
else:
raise ValueError("Invalid file-like object type")
return f_bytes raise ValueError("Invalid file-like object type")
def convert_ms_office_table_to_text( def convert_ms_office_table_to_text(table: PptxTable, as_html: bool = True) -> str:
table: Union["docxtable", "pptxtable"], """Convert a PPTX table object to an HTML table string using the tabulate library.
as_html: bool = True,
) -> str:
"""
Convert a table object from a Word document to an HTML table string using the tabulate library.
Args: Args:
table (Table): A docx.table.Table object. table (Table): A pptx.table.Table object.
as_html (bool): Whether to return the table as an HTML string (True) or a as_html (bool): Whether to return the table as an HTML string (True) or a
plain text string (False) plain text string (False)
@ -513,9 +503,7 @@ def contains_emoji(s: str) -> bool:
return bool(emoji.emoji_count(s)) return bool(emoji.emoji_count(s))
def _get_page_image_metadata( def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
page: PageLayout,
) -> dict:
"""Retrieve image metadata and coordinate system from a page.""" """Retrieve image metadata and coordinate system from a page."""
image = getattr(page, "image", None) image = getattr(page, "image", None)
@ -551,7 +539,7 @@ def document_to_element_list(
detection_origin: Optional[str] = None, detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT, sort_mode: str = SORT_MODE_XY_CUT,
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
**kwargs, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements.""" """Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = [] elements: List[Element] = []
@ -565,7 +553,7 @@ def document_to_element_list(
image_width = page_image_metadata.get("width") image_width = page_image_metadata.get("width")
image_height = page_image_metadata.get("height") image_height = page_image_metadata.get("height")
translation_mapping: List[Tuple["LayoutElement", Element]] = [] translation_mapping: list[tuple["LayoutElement", Element]] = []
for layout_element in page.elements: for layout_element in page.elements:
if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): if image_width and image_height and hasattr(layout_element.bbox, "coordinates"):
coordinate_system = PixelSpace(width=image_width, height=image_height) coordinate_system = PixelSpace(width=image_width, height=image_height)
@ -610,7 +598,7 @@ def document_to_element_list(
layout_element.image_path if hasattr(layout_element, "image_path") else None layout_element.image_path if hasattr(layout_element, "image_path") else None
) )
_add_element_metadata( add_element_metadata(
element, element,
page_number=i + 1, page_number=i + 1,
filetype=image_format, filetype=image_format,
@ -642,16 +630,16 @@ def document_to_element_list(
def ocr_data_to_elements( def ocr_data_to_elements(
ocr_data: List["LayoutElement"], ocr_data: List["LayoutElement"],
image_size: Tuple[Union[int, float], Union[int, float]], image_size: tuple[int | float, int | float],
common_metadata: Optional[ElementMetadata] = None, common_metadata: Optional[ElementMetadata] = None,
infer_list_items: bool = True, infer_list_items: bool = True,
source_format: Optional[str] = None, source_format: Optional[str] = None,
) -> List[Element]: ) -> list[Element]:
"""Convert OCR layout data into `unstructured` elements with associated metadata.""" """Convert OCR layout data into `unstructured` elements with associated metadata."""
image_width, image_height = image_size image_width, image_height = image_size
coordinate_system = PixelSpace(width=image_width, height=image_height) coordinate_system = PixelSpace(width=image_width, height=image_height)
elements = [] elements: list[Element] = []
for layout_element in ocr_data: for layout_element in ocr_data:
element = normalize_layout_element( element = normalize_layout_element(
layout_element, layout_element,

View File

@ -135,7 +135,7 @@ def default_hi_res_model() -> str:
@add_chunking_strategy @add_chunking_strategy
def partition_pdf( def partition_pdf(
filename: str = "", filename: str = "",
file: Optional[Union[BinaryIO, SpooledTemporaryFile]] = None, file: Optional[Union[BinaryIO, SpooledTemporaryFile[bytes]]] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = PartitionStrategy.AUTO, strategy: str = PartitionStrategy.AUTO,
infer_table_structure: bool = False, infer_table_structure: bool = False,
@ -151,7 +151,7 @@ def partition_pdf(
extract_image_block_types: Optional[List[str]] = None, extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None, extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False, extract_image_block_to_payload: bool = False,
**kwargs, **kwargs: Any,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
Parameters Parameters

View File

@ -263,7 +263,7 @@ def validate_date_args(date: Optional[str] = None) -> bool:
) )
def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any]]: def _first_and_remaining_iterator(it: Iterable[_T]) -> Tuple[_T, Iterator[_T]]:
iterator = iter(it) iterator = iter(it)
try: try:
out = next(iterator) out = next(iterator)
@ -275,7 +275,7 @@ def _first_and_remaining_iterator(it: Iterable[Any]) -> tuple[Any, Iterator[Any]
return out, iterator return out, iterator
def first(it: Iterable[Any]) -> Any: def first(it: Iterable[_T]) -> _T:
"""Returns the first item from an iterable. Raises an error if the iterable is empty.""" """Returns the first item from an iterable. Raises an error if the iterable is empty."""
out, _ = _first_and_remaining_iterator(it) out, _ = _first_and_remaining_iterator(it)
return out return out