diff --git a/CHANGELOG.md b/CHANGELOG.md index 474d97323..46e39f640 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ ### Fixes +* **Selecting a different model wasn't being respected when calling `partition_image`.** Problem: `partition_pdf` allows for passing a `model_name` parameter. Given the similarity between the image and PDF pipelines, the expected behavior is that `partition_image` should support the same parameter, but `partition_image` was unintentionally not passing along its `kwargs`. Fix: `partition_image` now forwards its `kwargs` to the downstream call. * **Fixes a chunking issue via dropping the field "coordinates".** Problem: chunk_by_title function was chunking each element to its own individual chunk while it needed to group elements into fewer chunks. We've discovered that this happens due to the metadata matching logic in chunk_by_title function, and discovered that elements with different metadata can't be put into the same chunk. At the same time, any element with "coordinates" essentially had different metadata than other elements, due to each element being located in a different place and having different coordinates. Fix: That is why we have included the key "coordinates" inside a list of excluded metadata keys, while doing this "metadata_matches" comparison. Importance: This change is crucial to be able to chunk by title for documents which include "coordinates" metadata in their elements. 
## 0.10.14 diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index 0d773a715..31dc482b4 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -426,3 +426,15 @@ def test_add_chunking_strategy_on_partition_image( chunks = chunk_by_title(elements) assert chunk_elements != elements assert chunk_elements == chunks + + +def test_partition_image_uses_model_name(): + """Check that partition_image forwards model_name to the patched local partitioner.""" + with mock.patch.object( + pdf, + "_partition_pdf_or_image_local", + ) as mockpartition: + image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test") + mockpartition.assert_called_once() + assert "model_name" in mockpartition.call_args.kwargs + assert mockpartition.call_args.kwargs["model_name"] == "test" diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index 8489743bf..e23c2894c 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -851,3 +851,20 @@ def test_combine_numbered_list(filename): break assert len(elements) < 28 assert first_list_element.text.endswith("(Section 3)") + + +def test_partition_pdf_uses_model_name(): + """Check that partition_pdf forwards model_name to the patched local partitioner.""" + with mock.patch.object( + pdf, + "_partition_pdf_or_image_local", + ) as mockpartition: + pdf.partition_pdf( + "example-docs/layout-parser-paper-fast.pdf", + model_name="test", + strategy="hi_res", + ) + + mockpartition.assert_called_once() + assert "model_name" in mockpartition.call_args.kwargs + assert mockpartition.call_args.kwargs["model_name"] == "test" diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index ca40a0e08..80eb07727 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -81,4 +81,5 @@ def partition_image( languages=languages, strategy=strategy, metadata_last_modified=metadata_last_modified, + **kwargs, )