fix: page_number should not always be 1 in the metadata (#657)

* fix page number issue
* add tests
* changelog and version
* update changelog
2025-12-05 03:23:03 +00:00 · 2023-05-30 15:10:14 -04:00 · 2023-05-30 15:10:14 -04:00 · f7cde5539a
commit f7cde5539a
parent b8dcf437ee
4 changed files with 18 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,9 @@

 ### Fixes

+* Fixes an issue in `_add_element_metadata` that caused all elements to have `page_number=1`
+  in the element metadata.
+* Adds `.log` as a file extension for TXT files.
 * Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
 * Allow passed encoding to be used in the `replace_mime_encodings`

@@ -13,7 +16,7 @@

 ### Enhancements

-* Supprts epub tests since pandoc is updated in base image
+* Supports epub tests since pandoc is updated in base image

 ### Features

--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@@ -188,6 +188,8 @@ def test_partition_pdf_with_spooled_file(
        result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
        # validate that the result is a non-empty list of dicts
        assert len(result) > 10
+        # check that the pdf has multiple different page numbers
+        assert len({element.metadata.page_number for element in result}) > 1


@pytest.mark.parametrize(
@@ -228,6 +230,8 @@ def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-
 def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    assert len(elements) > 10
+    # check that the pdf has multiple different page numbers
+    assert len({element.metadata.page_number for element in elements}) > 1


 def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"):
@@ -377,6 +381,8 @@ def test_partition_pdf_with_copy_protection():
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
    elements[0] == Title("LayoutParser: A Uniﬁed Toolkit for Deep Based Document Image Analysis")
+    # check that the pdf has multiple different page numbers
+    assert len({element.metadata.page_number for element in elements}) > 1


 def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -84,6 +84,12 @@ def _add_element_metadata(
            text_as_html: Optional[str] = layout_element.text_as_html
        else:
            text_as_html = None
+
+        # NOTE(robinson) - defer to the page number that's already in the metadata
+        # if it's available
+        if hasattr(element, "metadata"):
+            page_number = element.metadata.page_number or page_number
+
        metadata = ElementMetadata(
            filename=filename,
            filetype=filetype,
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -279,12 +279,12 @@ def _process_pdfminer_pages(

        text_segments = []
        for obj in page:
-            # NOTE(robinson) - "Figure" is an example of an object type that does
-            # not have a get_text method
            x1, y2, x2, y1 = obj.bbox
            y1 = height - y1
            y2 = height - y2

+            # NOTE(robinson) - "Figure" is an example of an object type that does
+            # not have a get_text method
            if not hasattr(obj, "get_text"):
                continue
            _text = obj.get_text()