diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f206b32b..7abb1afba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ ### Fixes +* Fixes an issue in `_add_element_metadata` that caused all elements to have `page_number=1` + in the element metadata. +* Adds `.log` as a file extension for TXT files. * Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding. * Allow passed encoding to be used in the `replace_mime_encodings` @@ -13,7 +16,7 @@ ### Enhancements -* Supprts epub tests since pandoc is updated in base image +* Supports epub tests since pandoc is updated in base image ### Features diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index e91e41759..88fbdd89e 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -188,6 +188,8 @@ def test_partition_pdf_with_spooled_file( result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy) # validate that the result is a non-empty list of dicts assert len(result) > 10 + # check that the pdf has multiple different page numbers + assert len({element.metadata.page_number for element in result}) > 1 @pytest.mark.parametrize( @@ -228,6 +230,8 @@ def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser- def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"): elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast") assert len(elements) > 10 + # check that the pdf has multiple different page numbers + assert len({element.metadata.page_number for element in elements}) > 1 def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"): @@ -377,6 +381,8 @@ def test_partition_pdf_with_copy_protection(): filename = os.path.join("example-docs", "copy-protected.pdf") elements = pdf.partition_pdf(filename=filename, strategy="hi_res") elements[0] == Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis") + # check that the pdf has multiple different page numbers + assert len({element.metadata.page_number for element in elements}) > 1 def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog): diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 7191885f5..488176d4a 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -84,6 +84,12 @@ def _add_element_metadata( text_as_html: Optional[str] = layout_element.text_as_html else: text_as_html = None + + # NOTE(robinson) - defer to the page number that's already in the metadata + # if it's available + if hasattr(element, "metadata"): + page_number = element.metadata.page_number or page_number + metadata = ElementMetadata( filename=filename, filetype=filetype, diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 5b667f925..4693966e6 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -279,12 +279,12 @@ def _process_pdfminer_pages( text_segments = [] for obj in page: - # NOTE(robinson) - "Figure" is an example of an object type that does - # not have a get_text method x1, y2, x2, y1 = obj.bbox y1 = height - y1 y2 = height - y2 + # NOTE(robinson) - "Figure" is an example of an object type that does + # not have a get_text method if not hasattr(obj, "get_text"): continue _text = obj.get_text()