fix: page_number should not always be 1 in the metadata (#657)

* fix page number issue

* add tests

* changelog and version

* update changelog
This commit is contained in:
Matt Robinson 2023-05-30 15:10:14 -04:00 committed by GitHub
parent b8dcf437ee
commit f7cde5539a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 18 additions and 3 deletions

View File

@ -6,6 +6,9 @@
### Fixes
* Fixes an issue in `_add_element_metadata` that caused all elements to have `page_number=1`
in the element metadata.
* Adds `.log` as a file extension for TXT files.
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
* Allows a passed encoding to be used in `replace_mime_encodings`.
@ -13,7 +16,7 @@
### Enhancements
* Supprts epub tests since pandoc is updated in base image
* Supports epub tests since pandoc is updated in base image
### Features

View File

@ -188,6 +188,8 @@ def test_partition_pdf_with_spooled_file(
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
# validate that the result is a non-empty list of dicts
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert len({element.metadata.page_number for element in result}) > 1
@pytest.mark.parametrize(
@ -228,6 +230,8 @@ def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-
def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Partition the sample PDF with the "fast" strategy and sanity-check the result."""
    elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
    element_count = len(elements)
    assert element_count > 10
    # the element metadata should carry more than one distinct page number
    distinct_pages = set()
    for element in elements:
        distinct_pages.add(element.metadata.page_number)
    assert len(distinct_pages) > 1
def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"):
@ -377,6 +381,8 @@ def test_partition_pdf_with_copy_protection():
filename = os.path.join("example-docs", "copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
elements[0] == Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis")
# check that the pdf has multiple different page numbers
assert len({element.metadata.page_number for element in elements}) > 1
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):

View File

@ -84,6 +84,12 @@ def _add_element_metadata(
text_as_html: Optional[str] = layout_element.text_as_html
else:
text_as_html = None
# NOTE(robinson) - defer to the page number that's already in the metadata
# if it's available
if hasattr(element, "metadata"):
page_number = element.metadata.page_number or page_number
metadata = ElementMetadata(
filename=filename,
filetype=filetype,

View File

@ -279,12 +279,12 @@ def _process_pdfminer_pages(
text_segments = []
for obj in page:
# NOTE(robinson) - "Figure" is an example of an object type that does
# not have a get_text method
x1, y2, x2, y1 = obj.bbox
y1 = height - y1
y2 = height - y2
# NOTE(robinson) - "Figure" is an example of an object type that does
# not have a get_text method
if not hasattr(obj, "get_text"):
continue
_text = obj.get_text()