fix: Ensure uninitialized pages are removed before assembling document (#1812)

Ensure uninitialized pages are removed before assembling document

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-19 07:33:25 +02:00 committed by GitHub
parent 861abcdcb0
commit dd7f64ff28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 1 deletion

View File

@@ -124,7 +124,7 @@ class ReadingOrderModel:
page_no = page.page_no + 1
size = page.size
assert size is not None
assert size is not None, "Page size is not initialized."
out_doc.add_page(page_no=page_no, size=size)

View File

@@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
)
raise e
# Filter out uninitialized pages (those with size=None) that may remain
# after timeout or processing failures to prevent assertion errors downstream
initial_page_count = len(conv_res.pages)
conv_res.pages = [page for page in conv_res.pages if page.size is not None]
if len(conv_res.pages) < initial_page_count:
_log.info(
f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
f"due to timeout or processing failures"
)
return conv_res
def _unload(self, conv_res: ConversionResult) -> ConversionResult: