mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-22 15:50:22 +00:00
fix: page_number should not always be 1 in the metadata (#657)
* fix page number issue
* add tests
* changelog and version
* update changelog
This commit is contained in:
parent
b8dcf437ee
commit
f7cde5539a
@ -6,6 +6,9 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fixes an issue in `_add_element_metadata` that caused all elements to have `page_number=1`
|
||||
in the element metadata.
|
||||
* Adds `.log` as a file extension for TXT files.
|
||||
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
|
||||
* Allow passed encoding to be used in the `replace_mime_encodings`
|
||||
|
||||
@ -13,7 +16,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Supprts epub tests since pandoc is updated in base image
|
||||
* Supports epub tests since pandoc is updated in base image
|
||||
|
||||
### Features
|
||||
|
||||
|
@ -188,6 +188,8 @@ def test_partition_pdf_with_spooled_file(
|
||||
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
|
||||
# validate that the result is a non-empty list of dicts
|
||||
assert len(result) > 10
|
||||
# check that the pdf has multiple different page numbers
|
||||
assert len({element.metadata.page_number for element in result}) > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -228,6 +230,8 @@ def test_partition_pdf_with_no_page_breaks(filename="example-docs/layout-parser-
|
||||
def test_partition_pdf_with_fast_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
|
||||
elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
|
||||
assert len(elements) > 10
|
||||
# check that the pdf has multiple different page numbers
|
||||
assert len({element.metadata.page_number for element in elements}) > 1
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_groups_text(filename="example-docs/layout-parser-paper-fast.pdf"):
|
||||
@ -377,6 +381,8 @@ def test_partition_pdf_with_copy_protection():
|
||||
filename = os.path.join("example-docs", "copy-protected.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
|
||||
elements[0] == Title("LayoutParser: A Unified Toolkit for Deep Based Document Image Analysis")
|
||||
# check that the pdf has multiple different page numbers
|
||||
assert len({element.metadata.page_number for element in elements}) > 1
|
||||
|
||||
|
||||
def test_partition_pdf_with_copy_protection_fallback_to_hi_res(caplog):
|
||||
|
@ -84,6 +84,12 @@ def _add_element_metadata(
|
||||
text_as_html: Optional[str] = layout_element.text_as_html
|
||||
else:
|
||||
text_as_html = None
|
||||
|
||||
# NOTE(robinson) - defer to the page number that's already in the metadata
|
||||
# if it's available
|
||||
if hasattr(element, "metadata"):
|
||||
page_number = element.metadata.page_number or page_number
|
||||
|
||||
metadata = ElementMetadata(
|
||||
filename=filename,
|
||||
filetype=filetype,
|
||||
|
@ -279,12 +279,12 @@ def _process_pdfminer_pages(
|
||||
|
||||
text_segments = []
|
||||
for obj in page:
|
||||
# NOTE(robinson) - "Figure" is an example of an object type that does
|
||||
# not have a get_text method
|
||||
x1, y2, x2, y1 = obj.bbox
|
||||
y1 = height - y1
|
||||
y2 = height - y2
|
||||
|
||||
# NOTE(robinson) - "Figure" is an example of an object type that does
|
||||
# not have a get_text method
|
||||
if not hasattr(obj, "get_text"):
|
||||
continue
|
||||
_text = obj.get_text()
|
||||
|
Loading…
x
Reference in New Issue
Block a user