diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index b105e2fca..acf9db28f 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -37,7 +37,7 @@ class PDFMinerToDocument: ``` """ - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, line_overlap: float = 0.5, char_margin: float = 2.0, @@ -92,7 +92,7 @@ class PDFMinerToDocument: all_texts=all_texts, ) - def __converter(self, extractor) -> Document: + def _converter(self, extractor) -> Document: """ Extracts text from PDF pages then convert the text into Documents @@ -151,13 +151,19 @@ class PDFMinerToDocument: continue try: pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params) - document = self.__converter(pdf_reader) + document = self._converter(pdf_reader) except Exception as e: logger.warning( "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e ) continue + if document.content is None or document.content.strip() == "": + logger.warning( + "PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.", + source=source, + ) + merged_metadata = {**bytestream.meta, **metadata} document.meta = merged_metadata documents.append(document) diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 86d3e1c60..72fbcdc16 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -90,10 +90,10 @@ class PyPDFToDocument: :returns: Deserialized component. """ - custom_converter_data = data["init_parameters"]["converter"] + init_parameters = data.get("init_parameters", {}) + custom_converter_data = init_parameters.get("converter") if custom_converter_data is not None: data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data) - return default_from_dict(cls, data) def _default_convert(self, reader: "PdfReader") -> Document: @@ -142,6 +142,12 @@ class PyPDFToDocument: ) continue + if document.content is None or document.content.strip() == "": + logger.warning( + "PyPDFToDocument could not extract text from the file {source}. Returning an empty document.", + source=source, + ) + merged_metadata = {**bytestream.meta, **metadata} document.meta = merged_metadata documents.append(document) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 556878a96..2abdb41dd 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -7,10 +7,12 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple from more_itertools import windowed -from haystack import Document, component +from haystack import Document, component, logging from haystack.core.serialization import default_from_dict, default_to_dict from haystack.utils import deserialize_callable, serialize_callable +logger = logging.getLogger(__name__) + @component class DocumentSplitter: @@ -46,7 +48,7 @@ class DocumentSplitter: ``` """ - def __init__( + def __init__( # pylint: disable=too-many-positional-arguments self, split_by: Literal["function", "page", "passage", "sentence", "word"] = "word", split_length: int = 200, @@ -112,6 +114,9 @@ class DocumentSplitter: raise ValueError( f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None." ) + if doc.content == "": + logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id) + continue units = self._split_into_units(doc.content, self.split_by) text_splits, splits_pages, splits_start_idxs = self._concatenate_units( units, self.split_length, self.split_overlap, self.split_threshold @@ -173,6 +178,7 @@ class DocumentSplitter: # concatenate the last split with the current one text_splits[-1] += txt + # NOTE: This line skips documents that have content="" elif len(txt) > 0: text_splits.append(txt) splits_pages.append(cur_page) diff --git a/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml b/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml new file mode 100644 index 000000000..dd5637dba --- /dev/null +++ b/releasenotes/notes/add-logs-empty-files-pdf-f28a14e52984c1ea.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content. + This can happen if the PDF file is a scanned image. + Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped. + This behavior was already occurring, but now its clearer through logs that this is happening. diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py index 0cd47df8f..53dececbe 100644 --- a/test/components/converters/test_pdfminer_to_document.py +++ b/test/components/converters/test_pdfminer_to_document.py @@ -126,3 +126,11 @@ class TestPDFMinerToDocument: results = converter.run(sources=sources) assert "Could not read non_existing_file.pdf" in caplog.text assert results["documents"] == [] + + def test_run_empty_document(self, caplog, test_files_path): + sources = [test_files_path / "pdf" / "non_text_searchable.pdf"] + converter = PDFMinerToDocument() + with caplog.at_level(logging.WARNING): + results = converter.run(sources=sources) + assert "PDFMinerToDocument could not extract text from the file" in caplog.text + assert results["documents"][0].content == "" diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index 79a757f37..234b545ea 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -63,9 +63,13 @@ class TestPyPDFToDocument: assert isinstance(instance, PyPDFToDocument) assert instance.converter is None + def test_from_dict_defaults(self): + data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}} + instance = PyPDFToDocument.from_dict(data) + assert isinstance(instance, PyPDFToDocument) + assert instance.converter is None + def test_from_dict_custom_converter(self): - pypdf_converter = PyPDFToDocument(converter=CustomConverter()) - data = pypdf_converter.to_dict() data = { "type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": { @@ -161,3 +165,10 @@ class TestPyPDFToDocument: assert len(docs) == 1 assert "ReAct" not in docs[0].content assert "I don't care about converting given pdfs, I always return this" in docs[0].content + + def test_run_empty_document(self, caplog, test_files_path): + paths = [test_files_path / "pdf" / "non_text_searchable.pdf"] + with caplog.at_level(logging.WARNING): + output = PyPDFToDocument().run(sources=paths) + assert "PyPDFToDocument could not extract text from the file" in caplog.text + assert output["documents"][0].content == "" diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 7c942ab4c..aecb85317 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -43,6 +43,7 @@ class TestDocumentSplitter: ): splitter = DocumentSplitter() splitter.run(documents=[Document()]) + assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text def test_single_doc(self): with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."): @@ -445,3 +446,21 @@ class TestDocumentSplitter: assert original_splitter.split_by == deserialized_splitter.split_by assert callable(deserialized_splitter.splitting_function) assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"] + + def test_run_empty_document(self): + """ + Test if the component runs correctly with an empty document. + """ + splitter = DocumentSplitter() + doc = Document(content="") + results = splitter.run([doc]) + assert results["documents"] == [] + + def test_run_document_only_whitespaces(self): + """ + Test if the component runs correctly with a document containing only whitespaces. + """ + splitter = DocumentSplitter() + doc = Document(content=" ") + results = splitter.run([doc]) + assert results["documents"][0].content == " " diff --git a/test/test_files/pdf/non_text_searchable.pdf b/test/test_files/pdf/non_text_searchable.pdf new file mode 100644 index 000000000..3579aed51 Binary files /dev/null and b/test/test_files/pdf/non_text_searchable.pdf differ