feat: Increase logging transparency for empty Documents during conversion (#8509)

* Add log lines for PDF conversion and make skipping more explicit in DocumentSplitter * Add logging statement for PDFMinerToDocument as well * Add tests * Remove unused line * Remove unused line * add reno * Add in PDF file * Update checks in PDF converters and add tests for document splitter * Revert * Remove line * Fix comment * Make mypy happy * Make mypy happy
2025-12-27 06:58:35 +00:00 · 2024-11-04 09:26:57 +01:00 · 2024-11-04 09:26:57 +01:00 · 911f3523ab
commit 911f3523ab
parent 2595e68050
8 changed files with 72 additions and 9 deletions
--- a/haystack/components/converters/pdfminer.py
+++ b/haystack/components/converters/pdfminer.py
@ -37,7 +37,7 @@ class PDFMinerToDocument:
    ```
    """

-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        line_overlap: float = 0.5,
        char_margin: float = 2.0,
@ -92,7 +92,7 @@ class PDFMinerToDocument:
            all_texts=all_texts,
        )

-    def __converter(self, extractor) -> Document:
+    def _converter(self, extractor) -> Document:
        """
        Extracts text from PDF pages then convert the text into Documents

@ -151,13 +151,19 @@ class PDFMinerToDocument:
                continue
            try:
                pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
-                document = self.__converter(pdf_reader)
+                document = self._converter(pdf_reader)
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
                )
                continue

+            if document.content is None or document.content.strip() == "":
+                logger.warning(
+                    "PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
+                    source=source,
+                )
+
            merged_metadata = {**bytestream.meta, **metadata}
            document.meta = merged_metadata
            documents.append(document)
--- a/haystack/components/converters/pypdf.py
+++ b/haystack/components/converters/pypdf.py
@ -90,10 +90,10 @@ class PyPDFToDocument:
        :returns:
            Deserialized component.
        """
-        custom_converter_data = data["init_parameters"]["converter"]
+        init_parameters = data.get("init_parameters", {})
+        custom_converter_data = init_parameters.get("converter")
        if custom_converter_data is not None:
            data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
-
        return default_from_dict(cls, data)

    def _default_convert(self, reader: "PdfReader") -> Document:
@ -142,6 +142,12 @@ class PyPDFToDocument:
                )
                continue

+            if document.content is None or document.content.strip() == "":
+                logger.warning(
+                    "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
+                    source=source,
+                )
+
            merged_metadata = {**bytestream.meta, **metadata}
            document.meta = merged_metadata
            documents.append(document)
--- a/haystack/components/preprocessors/document_splitter.py
+++ b/haystack/components/preprocessors/document_splitter.py
@ -7,10 +7,12 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

 from more_itertools import windowed

-from haystack import Document, component
+from haystack import Document, component, logging
 from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.utils import deserialize_callable, serialize_callable

+logger = logging.getLogger(__name__)
+

@component
 class DocumentSplitter:
@ -46,7 +48,7 @@ class DocumentSplitter:
    ```
    """

-    def __init__(
+    def __init__(  # pylint: disable=too-many-positional-arguments
        self,
        split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
        split_length: int = 200,
@ -112,6 +114,9 @@ class DocumentSplitter:
                raise ValueError(
                    f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
                )
+            if doc.content == "":
+                logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
+                continue
            units = self._split_into_units(doc.content, self.split_by)
            text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
                units, self.split_length, self.split_overlap, self.split_threshold
@ -173,6 +178,7 @@ class DocumentSplitter:
                # concatenate the last split with the current one
                text_splits[-1] += txt

+            # NOTE: This line skips documents that have content=""
            elif len(txt) > 0:
                text_splits.append(txt)
                splits_pages.append(cur_page)
--- a/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml
+++ b/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml
@ -0,0 +1,7 @@
+---
+features:
+  - |
+    Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
+    This can happen if the PDF file is a scanned image.
+    Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
+    This behavior was already occurring, but now its clearer through logs that this is happening.
--- a/test/components/converters/test_pdfminer_to_document.py
+++ b/test/components/converters/test_pdfminer_to_document.py
@ -126,3 +126,11 @@ class TestPDFMinerToDocument:
            results = converter.run(sources=sources)
            assert "Could not read non_existing_file.pdf" in caplog.text
            assert results["documents"] == []
+
+    def test_run_empty_document(self, caplog, test_files_path):
+        sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]
+        converter = PDFMinerToDocument()
+        with caplog.at_level(logging.WARNING):
+            results = converter.run(sources=sources)
+            assert "PDFMinerToDocument could not extract text from the file" in caplog.text
+            assert results["documents"][0].content == ""
--- a/test/components/converters/test_pypdf_to_document.py
+++ b/test/components/converters/test_pypdf_to_document.py
@ -63,9 +63,13 @@ class TestPyPDFToDocument:
        assert isinstance(instance, PyPDFToDocument)
        assert instance.converter is None

+    def test_from_dict_defaults(self):
+        data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
+        instance = PyPDFToDocument.from_dict(data)
+        assert isinstance(instance, PyPDFToDocument)
+        assert instance.converter is None
+
    def test_from_dict_custom_converter(self):
-        pypdf_converter = PyPDFToDocument(converter=CustomConverter())
-        data = pypdf_converter.to_dict()
        data = {
            "type": "haystack.components.converters.pypdf.PyPDFToDocument",
            "init_parameters": {
@ -161,3 +165,10 @@ class TestPyPDFToDocument:
        assert len(docs) == 1
        assert "ReAct" not in docs[0].content
        assert "I don't care about converting given pdfs, I always return this" in docs[0].content
+
+    def test_run_empty_document(self, caplog, test_files_path):
+        paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
+        with caplog.at_level(logging.WARNING):
+            output = PyPDFToDocument().run(sources=paths)
+            assert "PyPDFToDocument could not extract text from the file" in caplog.text
+            assert output["documents"][0].content == ""
--- a/test/components/preprocessors/test_document_splitter.py
+++ b/test/components/preprocessors/test_document_splitter.py
@ -43,6 +43,7 @@ class TestDocumentSplitter:
        ):
            splitter = DocumentSplitter()
            splitter.run(documents=[Document()])
+            assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text

    def test_single_doc(self):
        with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
@ -445,3 +446,21 @@ class TestDocumentSplitter:
        assert original_splitter.split_by == deserialized_splitter.split_by
        assert callable(deserialized_splitter.splitting_function)
        assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
+
+    def test_run_empty_document(self):
+        """
+        Test if the component runs correctly with an empty document.
+        """
+        splitter = DocumentSplitter()
+        doc = Document(content="")
+        results = splitter.run([doc])
+        assert results["documents"] == []
+
+    def test_run_document_only_whitespaces(self):
+        """
+        Test if the component runs correctly with a document containing only whitespaces.
+        """
+        splitter = DocumentSplitter()
+        doc = Document(content="  ")
+        results = splitter.run([doc])
+        assert results["documents"][0].content == "  "
--- a/test/test_files/pdf/non_text_searchable.pdf
+++ b/test/test_files/pdf/non_text_searchable.pdf