From dd7f64ff28226cd9964fc4d8ba807b2c8a6358ef Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Thu, 19 Jun 2025 07:33:25 +0200 Subject: [PATCH] fix: Ensure uninitialized pages are removed before assembling document (#1812) Ensure uninitialized pages are removed before assembling document Signed-off-by: Christoph Auer --- docling/models/readingorder_model.py | 2 +- docling/pipeline/base_pipeline.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py index ba1bb45b..9c38756c 100644 --- a/docling/models/readingorder_model.py +++ b/docling/models/readingorder_model.py @@ -124,7 +124,7 @@ class ReadingOrderModel: page_no = page.page_no + 1 size = page.size - assert size is not None + assert size is not None, "Page size is not initialized." out_doc.add_page(page_no=page_no, size=size) diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 29475d68..2b168101 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. ) raise e + # Filter out uninitialized pages (those with size=None) that may remain + # after timeout or processing failures to prevent assertion errors downstream + initial_page_count = len(conv_res.pages) + conv_res.pages = [page for page in conv_res.pages if page.size is not None] + + if len(conv_res.pages) < initial_page_count: + _log.info( + f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages " + f"due to timeout or processing failures" + ) + return conv_res def _unload(self, conv_res: ConversionResult) -> ConversionResult: