mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 20:57:50 +00:00
Fix/521 pdf2image memory error hi res (#948)
This PR reflects the changes made in unstructured-inference PR #152: update the functionality that retrieves image metadata from a page for `document_to_element_list`.
This commit is contained in:
parent
6e852cbe70
commit
f7def03d55
2
.gitignore
vendored
2
.gitignore
vendored
@ -186,3 +186,5 @@ tags
|
|||||||
|
|
||||||
# Ruff cache
|
# Ruff cache
|
||||||
.ruff_cache/
|
.ruff_cache/
|
||||||
|
|
||||||
|
unstructured-inference/
|
@ -1,7 +1,8 @@
|
|||||||
## 0.8.2-dev4
|
## 0.8.2-dev5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* Update functionality to retrieve image metadata from a page for `document_to_element_list`
|
||||||
* Links are now tracked in `partition_html` output.
|
* Links are now tracked in `partition_html` output.
|
||||||
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
|
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
|
||||||
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
|
* Add `min_partition` kwarg that combines elements below a specified threshold and modifies splitting of strings longer than max partition so words are not split.
|
||||||
@ -17,6 +18,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Use the `image_metadata` property of the `PageLayout` instance to get the page image info in the `document_to_element_list`
|
||||||
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
|
* Add functionality to write images to computer storage temporarily instead of keeping them in memory for `ocr_only` strategy
|
||||||
* Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy
|
* Add functionality to convert a PDF in small chunks of pages at a time for `ocr_only` strategy
|
||||||
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
|
* Adds `.txt`, `.text`, and `.tab` to list of extensions to check if file
|
||||||
|
@ -11,6 +11,7 @@ from unstructured_inference.inference.layoutelement import LocationlessLayoutEle
|
|||||||
from unstructured.file_utils import filetype
|
from unstructured.file_utils import filetype
|
||||||
from unstructured.file_utils.filetype import (
|
from unstructured.file_utils.filetype import (
|
||||||
FileType,
|
FileType,
|
||||||
|
_get_page_image_metadata,
|
||||||
_is_code_mime_type,
|
_is_code_mime_type,
|
||||||
_is_text_file_a_csv,
|
_is_text_file_a_csv,
|
||||||
_is_text_file_a_json,
|
_is_text_file_a_json,
|
||||||
@ -469,3 +470,9 @@ def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
|
|||||||
layout_elem_absent_coordinates = MockDocumentLayout()
|
layout_elem_absent_coordinates = MockDocumentLayout()
|
||||||
elements = document_to_element_list(layout_elem_absent_coordinates)
|
elements = document_to_element_list(layout_elem_absent_coordinates)
|
||||||
assert elements[0].metadata.coordinates is None
|
assert elements[0].metadata.coordinates is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_page_image_metadata_and_coordinate_system():
    """Check that `_get_page_image_metadata` hands back a dict for a mocked page."""
    doc = MockDocumentLayout()
    metadata = _get_page_image_metadata(doc.pages[0])
    # isinstance() is the idiomatic type check; comparing type() objects with
    # `==` is flagged by linters (E721) and rejects dict subclasses.
    assert isinstance(metadata, dict)
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.8.2-dev4" # pragma: no cover
|
__version__ = "0.8.2-dev5" # pragma: no cover
|
||||||
|
@ -20,7 +20,7 @@ from unstructured.partition.common import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from unstructured_inference.inference.layout import DocumentLayout
|
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import magic
|
import magic
|
||||||
@ -466,14 +466,20 @@ def document_to_element_list(
|
|||||||
num_pages = len(document.pages)
|
num_pages = len(document.pages)
|
||||||
for i, page in enumerate(document.pages):
|
for i, page in enumerate(document.pages):
|
||||||
page_elements: List[Element] = []
|
page_elements: List[Element] = []
|
||||||
|
|
||||||
|
page_image_metadata = _get_page_image_metadata(page)
|
||||||
|
image_format = page_image_metadata.get("format")
|
||||||
|
image_width = page_image_metadata.get("width")
|
||||||
|
image_height = page_image_metadata.get("height")
|
||||||
|
|
||||||
for layout_element in page.elements:
|
for layout_element in page.elements:
|
||||||
if hasattr(page, "image") and hasattr(layout_element, "coordinates"):
|
if image_width and image_height and hasattr(layout_element, "coordinates"):
|
||||||
image_format = page.image.format
|
coordinate_system = PixelSpace(width=image_width, height=image_height)
|
||||||
coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
|
|
||||||
else:
|
else:
|
||||||
image_format = None
|
|
||||||
coordinate_system = None
|
coordinate_system = None
|
||||||
|
|
||||||
element = normalize_layout_element(layout_element, coordinate_system=coordinate_system)
|
element = normalize_layout_element(layout_element, coordinate_system=coordinate_system)
|
||||||
|
|
||||||
if isinstance(element, List):
|
if isinstance(element, List):
|
||||||
for el in element:
|
for el in element:
|
||||||
el.metadata.page_number = i + 1
|
el.metadata.page_number = i + 1
|
||||||
@ -514,6 +520,34 @@ def document_to_element_list(
|
|||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
|
def _get_page_image_metadata(
    page: PageLayout,
) -> dict:
    """Collect the format, width and height of the image behind a page.

    `page.image` is consulted first. When that attribute is missing or falsy,
    the `page.image_metadata` mapping is used instead. When neither is
    available, every field is reported as None.

    Returns a dict with exactly the keys "format", "width" and "height".
    """
    # Both attributes are looked up before branching; either may be absent.
    page_image = getattr(page, "image", None)
    stored_metadata = getattr(page, "image_metadata", None)

    if page_image:
        return {
            "format": page_image.format,
            "width": page_image.width,
            "height": page_image.height,
        }

    fields = ("format", "width", "height")
    if stored_metadata:
        # Keys missing from the stored mapping come back as None.
        return {field: stored_metadata.get(field) for field in fields}

    return dict.fromkeys(fields)
|
||||||
|
|
||||||
|
|
||||||
PROGRAMMING_LANGUAGES = [
|
PROGRAMMING_LANGUAGES = [
|
||||||
"javascript",
|
"javascript",
|
||||||
"python",
|
"python",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user