feat: Increase logging transparency for empty Documents during conversion (#8509)

* Add log lines for PDF conversion and make skipping more explicit in DocumentSplitter

* Add logging statement for PDFMinerToDocument as well

* Add tests

* Remove unused line

* Remove unused line

* add reno

* Add in PDF file

* Update checks in PDF converters and add tests for document splitter

* Revert

* Remove line

* Fix comment

* Make mypy happy

* Make mypy happy
This commit is contained in:
Sebastian Husch Lee 2024-11-04 09:26:57 +01:00 committed by GitHub
parent 2595e68050
commit 911f3523ab
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 72 additions and 9 deletions

View File

@ -37,7 +37,7 @@ class PDFMinerToDocument:
```
"""
def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
line_overlap: float = 0.5,
char_margin: float = 2.0,
@ -92,7 +92,7 @@ class PDFMinerToDocument:
all_texts=all_texts,
)
def __converter(self, extractor) -> Document:
def _converter(self, extractor) -> Document:
"""
Extracts text from PDF pages then convert the text into Documents
@ -151,13 +151,19 @@ class PDFMinerToDocument:
continue
try:
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
document = self.__converter(pdf_reader)
document = self._converter(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue
if document.content is None or document.content.strip() == "":
logger.warning(
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)
merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)

View File

@ -90,10 +90,10 @@ class PyPDFToDocument:
:returns:
Deserialized component.
"""
custom_converter_data = data["init_parameters"]["converter"]
init_parameters = data.get("init_parameters", {})
custom_converter_data = init_parameters.get("converter")
if custom_converter_data is not None:
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
return default_from_dict(cls, data)
def _default_convert(self, reader: "PdfReader") -> Document:
@ -142,6 +142,12 @@ class PyPDFToDocument:
)
continue
if document.content is None or document.content.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
)
merged_metadata = {**bytestream.meta, **metadata}
document.meta = merged_metadata
documents.append(document)

View File

@ -7,10 +7,12 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
from more_itertools import windowed
from haystack import Document, component
from haystack import Document, component, logging
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable
logger = logging.getLogger(__name__)
@component
class DocumentSplitter:
@ -46,7 +48,7 @@ class DocumentSplitter:
```
"""
def __init__(
def __init__( # pylint: disable=too-many-positional-arguments
self,
split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
split_length: int = 200,
@ -112,6 +114,9 @@ class DocumentSplitter:
raise ValueError(
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
)
if doc.content == "":
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
continue
units = self._split_into_units(doc.content, self.split_by)
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
units, self.split_length, self.split_overlap, self.split_threshold
@ -173,6 +178,7 @@ class DocumentSplitter:
# concatenate the last split with the current one
text_splits[-1] += txt
# NOTE: This line skips documents that have content=""
elif len(txt) > 0:
text_splits.append(txt)
splits_pages.append(cur_page)

View File

@ -0,0 +1,7 @@
---
features:
- |
Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
This can happen if the PDF file is a scanned image.
Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
This behavior was already occurring, but now its clearer through logs that this is happening.

View File

@ -126,3 +126,11 @@ class TestPDFMinerToDocument:
results = converter.run(sources=sources)
assert "Could not read non_existing_file.pdf" in caplog.text
assert results["documents"] == []
def test_run_empty_document(self, caplog, test_files_path):
sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]
converter = PDFMinerToDocument()
with caplog.at_level(logging.WARNING):
results = converter.run(sources=sources)
assert "PDFMinerToDocument could not extract text from the file" in caplog.text
assert results["documents"][0].content == ""

View File

@ -63,9 +63,13 @@ class TestPyPDFToDocument:
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None
def test_from_dict_defaults(self):
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None
def test_from_dict_custom_converter(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
data = pypdf_converter.to_dict()
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
@ -161,3 +165,10 @@ class TestPyPDFToDocument:
assert len(docs) == 1
assert "ReAct" not in docs[0].content
assert "I don't care about converting given pdfs, I always return this" in docs[0].content
def test_run_empty_document(self, caplog, test_files_path):
paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
with caplog.at_level(logging.WARNING):
output = PyPDFToDocument().run(sources=paths)
assert "PyPDFToDocument could not extract text from the file" in caplog.text
assert output["documents"][0].content == ""

View File

@ -43,6 +43,7 @@ class TestDocumentSplitter:
):
splitter = DocumentSplitter()
splitter.run(documents=[Document()])
assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text
def test_single_doc(self):
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
@ -445,3 +446,21 @@ class TestDocumentSplitter:
assert original_splitter.split_by == deserialized_splitter.split_by
assert callable(deserialized_splitter.splitting_function)
assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
def test_run_empty_document(self):
"""
Test if the component runs correctly with an empty document.
"""
splitter = DocumentSplitter()
doc = Document(content="")
results = splitter.run([doc])
assert results["documents"] == []
def test_run_document_only_whitespaces(self):
"""
Test if the component runs correctly with a document containing only whitespaces.
"""
splitter = DocumentSplitter()
doc = Document(content=" ")
results = splitter.run([doc])
assert results["documents"][0].content == " "

Binary file not shown.