mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-04 03:03:55 +00:00
fix: filename missing from image metadata (#1863)
Closes [#1859](https://github.com/Unstructured-IO/unstructured/issues/1859).

* **Fixes elements partitioned from an image file missing certain metadata** Metadata for image files, like file type, was being handled differently from other file types. This caused a bug where other metadata, like the file name, was being missed. This change brings metadata handling for image files more in line with the handling for other file types, so that the file name and other metadata fields are captured.

Additionally:

* Added a test to verify that the filename is captured in metadata
* Cleaned up `CHANGELOG.md` formatting

#### Testing:

The following produces output `None` on `main`, but outputs the filename `layout-parser-paper-fast.jpg` on this branch:

```python
from unstructured.partition.auto import partition

elements = partition("example-docs/layout-parser-paper-fast.jpg")
print(elements[0].metadata.filename)
```
This commit is contained in:
parent
2d5ffa4581
commit
d8241cbcfc
@ -1,4 +1,4 @@
|
||||
## 0.10.26-dev6
|
||||
## 0.10.26-dev7
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,10 +10,10 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fixes elements partitioned from an image file missing certain metadata** Metadata for image files, like file type, was being handled differently from other file types. This caused a bug where other metadata, like the file name, was being missed. This change brought metadata handling for image files to be more in line with the handling for other file types so that file name and other metadata fields are being captured.
|
||||
* **Adds `typing-extensions` as an explicit dependency** This package is an implicit dependency, but the module is being imported directly in `unstructured.documents.elements` so the dependency should be explicit in case changes in other dependencies lead to `typing-extensions` being dropped as a dependency.
|
||||
* ** Stop passing `extract_tables` to unstructured-inference ** since it is now supported in unstructured instead. Also noted the table
|
||||
output regression for PDF files.
|
||||
* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in partition was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable in partition to pass the filetype you want to exclude `text_as_html` metadata field for, or the `infer_table_structure` boolean variable on the file specific partitioning function.
|
||||
* **Stop passing `extract_tables` to `unstructured-inference` since it is now supported in `unstructured` instead** Table extraction previously occurred in `unstructured-inference`, but that logic, except for the table model itself, is now a part of the `unstructured` library. Thus the parameter triggering table extraction is no longer passed to the `unstructured-inference` package. Also noted the table output regression for PDF files.
|
||||
* **Fix a bug in Table partitioning** Previously the `skip_infer_table_types` variable used in `partition` was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable when calling `partition` to specify the filetypes for which you want to skip table extraction, or the `infer_table_structure` boolean variable on the file specific partitioning function.
|
||||
* **Fix partition docx without sections** Some docx files, like those from Teams output, do not contain sections, and partitioning them would produce no results because the code assumes all components are in sections. Now, if no sections are detected in a document, we iterate through the paragraphs and return the contents found in them.
|
||||
|
||||
## 0.10.25
|
||||
|
||||
@ -11,6 +11,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, exampl
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.partition import image, ocr, pdf
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
from unstructured.utils import only
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
@ -547,3 +548,38 @@ def test_partition_image_raises_TypeError_for_invalid_languages():
|
||||
filename = "example-docs/layout-parser-paper-fast.jpg"
|
||||
with pytest.raises(TypeError):
|
||||
image.partition_image(filename=filename, strategy="hi_res", languages="eng")
|
||||
|
||||
|
||||
@pytest.fixture()
def inference_results():
    """Provide a one-page ``DocumentLayout`` containing a single "hello" element.

    The page image is a ``MagicMock`` whose ``format`` attribute is ``"JPEG"``. Tests use the
    returned document as the canned return value of the patched inference call.
    """
    region = (0, 0, 600, 800)
    single_page = layout.PageLayout(
        number=1,
        image=mock.MagicMock(format="JPEG"),
        layout=layout.TextRegion.from_coords(*region, text="hello"),
    )
    single_page.elements = [layout.LayoutElement.from_coords(*region, text="hello")]
    return layout.DocumentLayout(pages=[single_page])
|
||||
|
||||
|
||||
def test_partition_image_has_filename(inference_results):
    """Elements partitioned from an image file keep both filetype and filename metadata."""
    filename = "layout-parser-paper-fast.jpg"
    image_path = os.path.join("example-docs", filename)
    inference_target = "unstructured_inference.inference.layout.process_file_with_model"
    # Patch the inference call so it returns the known fixture layout.
    with mock.patch(inference_target, return_value=inference_results) as patched_inference:
        partitioned = image.partition_image(filename=image_path, strategy="hi_res")
    # Confirm the partitioner really went through the patched inference function.
    patched_inference.assert_called_once()
    # Unpack the element, while also checking that there is exactly one.
    sole_element = only(partitioned)
    # The filetype metadata should still be present (translated from the fixture's image
    # format).
    assert sole_element.metadata.filetype == "JPEG"
    # The filename we originally passed in should be preserved in the metadata.
    assert sole_element.metadata.filename == filename
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.26-dev6" # pragma: no cover
|
||||
__version__ = "0.10.26-dev7" # pragma: no cover
|
||||
|
||||
@ -540,7 +540,47 @@ def _is_code_mime_type(mime_type: str) -> bool:
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
def add_metadata_with_filetype(
|
||||
def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
    """Decorate a partitioning function so its elements receive call-derived metadata.

    The wrapper calls ``func`` and then resolves the call's arguments by name: positional
    arguments are matched to the signature's parameter names, keyword arguments are added, and
    declared defaults fill in anything the caller did not pass.

    * When ``include_metadata`` is truthy (it defaults to ``True``), ``set_element_hierarchy``
      is applied unless ``model_name`` starts with ``"chipper"``. Then ``filename``, ``url``
      and ``text_as_html`` are passed to ``_add_element_metadata`` for every element whose
      ``metadata.attached_to_filename`` is ``None``. A truthy ``metadata_filename`` takes the
      place of ``filename``.
    * Otherwise the result of ``_remove_element_metadata(elements)`` is returned.

    Args:
        func: Partitioning function that returns a list of elements.

    Returns:
        The wrapping function; ``functools.wraps`` copies ``func``'s metadata onto it.
    """

    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
        elements = func(*args, **kwargs)

        sig = inspect.signature(func)
        # Match positional arguments to parameter names, then overlay keyword arguments.
        params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
        # Fill in declared defaults for parameters the caller did not supply.
        for param in sig.parameters.values():
            if param.name not in params and param.default is not param.empty:
                params[param.name] = param.default

        if not params.get("include_metadata", True):
            return _remove_element_metadata(
                elements,
            )

        # An explicit metadata filename overrides the filename used for partitioning.
        if params.get("metadata_filename"):
            params["filename"] = params.get("metadata_filename")

        metadata_kwargs = {
            kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
        }
        # NOTE (yao): do not use cast here as cast(None) still is None
        # Read `model_name` from the resolved `params` (not just `kwargs`) so a value passed
        # positionally or supplied by a signature default is honoured as well.
        if not str(params.get("model_name", "")).startswith("chipper"):
            # NOTE(alan): Skip hierarchy if using chipper, as it should take care of that
            elements = set_element_hierarchy(elements)

        for element in elements:
            # NOTE(robinson) - Attached files have already run through this logic
            # in their own partitioning function
            if element.metadata.attached_to_filename is None:
                _add_element_metadata(
                    element,
                    **metadata_kwargs,  # type: ignore
                )

        return elements

    return wrapper
|
||||
|
||||
|
||||
def add_filetype(
|
||||
filetype: FileType,
|
||||
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
|
||||
"""..."""
|
||||
@ -559,14 +599,6 @@ def add_metadata_with_filetype(
|
||||
if params.get("metadata_filename"):
|
||||
params["filename"] = params.get("metadata_filename")
|
||||
|
||||
metadata_kwargs = {
|
||||
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
|
||||
}
|
||||
# NOTE (yao): do not use cast here as cast(None) still is None
|
||||
if not str(kwargs.get("model_name", "")).startswith("chipper"):
|
||||
# NOTE(alan): Skip hierarchy if using chipper, as it should take care of that
|
||||
elements = set_element_hierarchy(elements)
|
||||
|
||||
for element in elements:
|
||||
# NOTE(robinson) - Attached files have already run through this logic
|
||||
# in their own partitioning function
|
||||
@ -574,7 +606,6 @@ def add_metadata_with_filetype(
|
||||
_add_element_metadata(
|
||||
element,
|
||||
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
||||
**metadata_kwargs, # type: ignore
|
||||
)
|
||||
|
||||
return elements
|
||||
@ -586,3 +617,14 @@ def add_metadata_with_filetype(
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
def add_metadata_with_filetype(
    filetype: FileType,
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
    """Return a decorator that stacks ``add_filetype(filetype=...)`` on top of ``add_metadata``.

    The partitioning function is first wrapped by ``add_metadata``; the result is then wrapped
    by the decorator that ``add_filetype`` produces for ``filetype``.
    """

    def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
        # Build the filetype decorator first, then wrap the function with `add_metadata`,
        # matching the evaluation order of a single chained call.
        with_filetype = add_filetype(filetype=filetype)
        with_metadata = add_metadata(func)
        return with_filetype(with_metadata)

    return decorator
|
||||
|
||||
@ -2,6 +2,7 @@ from typing import List, Optional
|
||||
|
||||
from unstructured.chunking.title import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, process_metadata
|
||||
from unstructured.file_utils.filetype import add_metadata
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.lang import (
|
||||
@ -11,6 +12,7 @@ from unstructured.partition.pdf import partition_pdf_or_image
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata
|
||||
@add_chunking_strategy()
|
||||
def partition_image(
|
||||
filename: str = "",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user