fix: filename missing from image metadata (#1863)

Closes
[#1859](https://github.com/Unstructured-IO/unstructured/issues/1859).

* **Fixes elements partitioned from an image file missing certain
metadata** Metadata for image files, like file type, was being handled
differently from other file types. This caused a bug where other
metadata, like the file name, was being missed. This change brought
metadata handling for image files to be more in line with the handling
for other file types so that file name and other metadata fields are
being captured.

Additionally:
* Added test to verify filename is being captured in metadata
* Cleaned up `CHANGELOG.md` formatting

#### Testing:
The following produces output `None` on `main`, but outputs the filename
`layout-parser-paper-fast.jpg` on this branch:
```python
from unstructured.partition.auto import partition
elements = partition("example-docs/layout-parser-paper-fast.jpg")
print(elements[0].metadata.filename)

```
This commit is contained in:
qued 2023-10-25 00:19:51 -05:00 committed by GitHub
parent 2d5ffa4581
commit d8241cbcfc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 95 additions and 15 deletions

View File

@ -1,4 +1,4 @@
## 0.10.26-dev6
## 0.10.26-dev7
### Enhancements
@ -10,10 +10,10 @@
### Fixes
* **Fixes elements partitioned from an image file missing certain metadata** Metadata for image files, like file type, was being handled differently from other file types. This caused a bug where other metadata, like the file name, was being missed. This change brought metadata handling for image files to be more in line with the handling for other file types so that file name and other metadata fields are being captured.
* **Adds `typing-extensions` as an explicit dependency** This package is an implicit dependency, but the module is being imported directly in `unstructured.documents.elements` so the dependency should be explicit in case changes in other dependencies lead to `typing-extensions` being dropped as a dependency.
* ** Stop passing `extract_tables` to unstructured-inference ** since it is now supported in unstructured instead. Also noted the table
output regressioin for PDF files.
* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in partition was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable in partition to pass the filetype you want to exclude `text_as_html` metadata field for, or the `infer_table_structure` boolean variable on the file specific partitioning function.
* **Stop passing `extract_tables` to `unstructured-inference` since it is now supported in `unstructured` instead** Table extraction previously occurred in `unstructured-inference`, but that logic, except for the table model itself, is now a part of the `unstructured` library. Thus the parameter triggering table extraction is no longer passed to the `unstructured-inference` package. Also noted the table output regression for PDF files.
* **Fix a bug in Table partitioning** Previously the `skip_infer_table_types` variable used in `partition` was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable when calling `partition` to specify the filetypes for which you want to skip table extraction, or the `infer_table_structure` boolean variable on the file specific partitioning function.
* **Fix partition docx without sections** Some docx files, like those from Teams output, do not contain sections, and partitioning them produced no results because the code assumed all components are in sections. Now, if no sections are detected in a document, we iterate through the paragraphs and return the contents found in them.
## 0.10.25

View File

@ -11,6 +11,7 @@ from test_unstructured.unit_utils import assert_round_trips_through_JSON, exampl
from unstructured.chunking.title import chunk_by_title
from unstructured.partition import image, ocr, pdf
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.utils import only
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -547,3 +548,38 @@ def test_partition_image_raises_TypeError_for_invalid_languages():
filename = "example-docs/layout-parser-paper-fast.jpg"
with pytest.raises(TypeError):
image.partition_image(filename=filename, strategy="hi_res", languages="eng")
@pytest.fixture()
def inference_results():
    """Return a canned one-page document layout holding a single "hello" element.

    The page image is a mock whose ``format`` is "JPEG"; tests use this fixture as
    the return value of the mocked inference call.
    """
    bbox = (0, 0, 600, 800)
    first_page = layout.PageLayout(
        number=1,
        image=mock.MagicMock(format="JPEG"),
        layout=layout.TextRegion.from_coords(*bbox, text="hello"),
    )
    first_page.elements = [layout.LayoutElement.from_coords(*bbox, text="hello")]
    return layout.DocumentLayout(pages=[first_page])
def test_partition_image_has_filename(inference_results):
    """Check that partition_image keeps both filetype and filename in element metadata."""
    expected_filename = "layout-parser-paper-fast.jpg"
    image_path = os.path.join("example-docs", expected_filename)
    inference_target = "unstructured_inference.inference.layout.process_file_with_model"

    # Replace the inference call with the canned layout from the fixture.
    with mock.patch(inference_target, return_value=inference_results) as patched_inference:
        elements = image.partition_image(filename=image_path, strategy="hi_res")

    # Confirm the mocked inference path was the one actually exercised.
    patched_inference.assert_called_once()

    # `only` unpacks the element and also checks there is exactly one.
    element = only(elements)

    # The filetype should still come through from the fixture's mocked image format.
    assert element.metadata.filetype == "JPEG"
    # The filename should be the one originally passed in, without the directory.
    assert element.metadata.filename == expected_filename

View File

@ -1 +1 @@
__version__ = "0.10.26-dev6" # pragma: no cover
__version__ = "0.10.26-dev7" # pragma: no cover

View File

@ -540,7 +540,47 @@ def _is_code_mime_type(mime_type: str) -> bool:
_P = ParamSpec("_P")
def add_metadata_with_filetype(
def add_metadata(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
    """Decorate a partitioning function so its elements receive call-derived metadata.

    The wrapper calls ``func`` and then reconstructs the effective call parameters:
    positional arguments matched to parameter names, keyword arguments, and declared
    defaults for anything not supplied.

    If ``include_metadata`` is truthy (the default when absent), ``filename``, ``url``
    and ``text_as_html`` are passed to ``_add_element_metadata`` for every element whose
    ``metadata.attached_to_filename`` is ``None``. A truthy ``metadata_filename``
    replaces ``filename``. Otherwise the result of ``_remove_element_metadata`` is
    returned.
    """

    @functools.wraps(func)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
        elements = func(*args, **kwargs)

        # Resolve the parameters the call effectively used: positionals by name,
        # then keywords, then defaults for anything the caller left out.
        sig = inspect.signature(func)
        params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
        for param in sig.parameters.values():
            if param.name not in params and param.default is not param.empty:
                params[param.name] = param.default

        if not params.get("include_metadata", True):
            return _remove_element_metadata(
                elements,
            )

        if params.get("metadata_filename"):
            params["filename"] = params.get("metadata_filename")

        metadata_kwargs = {
            kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
        }

        # NOTE (yao): do not use cast here as cast(None) still is None
        # Read model_name from the resolved params rather than raw kwargs, so a
        # positional or defaulted model_name is honored too.
        if not str(params.get("model_name", "")).startswith("chipper"):
            # NOTE(alan): Skip hierarchy if using chipper, as it should take care of that
            elements = set_element_hierarchy(elements)

        for element in elements:
            # NOTE(robinson) - Attached files have already run through this logic
            # in their own partitioning function
            if element.metadata.attached_to_filename is None:
                _add_element_metadata(
                    element,
                    **metadata_kwargs,  # type: ignore
                )

        return elements

    return wrapper
def add_filetype(
filetype: FileType,
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
"""..."""
@ -559,14 +599,6 @@ def add_metadata_with_filetype(
if params.get("metadata_filename"):
params["filename"] = params.get("metadata_filename")
metadata_kwargs = {
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
}
# NOTE (yao): do not use cast here as cast(None) still is None
if not str(kwargs.get("model_name", "")).startswith("chipper"):
# NOTE(alan): Skip hierarchy if using chipper, as it should take care of that
elements = set_element_hierarchy(elements)
for element in elements:
# NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function
@ -574,7 +606,6 @@ def add_metadata_with_filetype(
_add_element_metadata(
element,
filetype=FILETYPE_TO_MIMETYPE[filetype],
**metadata_kwargs, # type: ignore
)
return elements
@ -586,3 +617,14 @@ def add_metadata_with_filetype(
return wrapper
return decorator
def add_metadata_with_filetype(
    filetype: FileType,
) -> Callable[[Callable[_P, List[Element]]], Callable[_P, List[Element]]]:
    """Build a decorator that composes `add_metadata` with `add_filetype(filetype)`.

    The decorated function is wrapped by `add_metadata` first (innermost), and that
    wrapper is then passed through the decorator returned by `add_filetype` for the
    given `filetype`.
    """

    def decorator(func: Callable[_P, List[Element]]) -> Callable[_P, List[Element]]:
        filetype_decorator = add_filetype(filetype=filetype)
        metadata_wrapped = add_metadata(func)
        return filetype_decorator(metadata_wrapped)

    return decorator

View File

@ -2,6 +2,7 @@ from typing import List, Optional
from unstructured.chunking.title import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.logger import logger
from unstructured.partition.common import exactly_one
from unstructured.partition.lang import (
@ -11,6 +12,7 @@ from unstructured.partition.pdf import partition_pdf_or_image
@process_metadata()
@add_metadata
@add_chunking_strategy()
def partition_image(
filename: str = "",