mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-27 06:58:35 +00:00
feat: Increase logging transparency for empty Documents during conversion (#8509)
* Add log lines for PDF conversion and make skipping more explicit in DocumentSplitter * Add logging statement for PDFMinerToDocument as well * Add tests * Remove unused line * Remove unused line * add reno * Add in PDF file * Update checks in PDF converters and add tests for document splitter * Revert * Remove line * Fix comment * Make mypy happy * Make mypy happy
This commit is contained in:
parent
2595e68050
commit
911f3523ab
@ -37,7 +37,7 @@ class PDFMinerToDocument:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
def __init__( # pylint: disable=too-many-positional-arguments
|
||||
self,
|
||||
line_overlap: float = 0.5,
|
||||
char_margin: float = 2.0,
|
||||
@ -92,7 +92,7 @@ class PDFMinerToDocument:
|
||||
all_texts=all_texts,
|
||||
)
|
||||
|
||||
def __converter(self, extractor) -> Document:
|
||||
def _converter(self, extractor) -> Document:
|
||||
"""
|
||||
Extracts text from PDF pages then convert the text into Documents
|
||||
|
||||
@ -151,13 +151,19 @@ class PDFMinerToDocument:
|
||||
continue
|
||||
try:
|
||||
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
|
||||
document = self.__converter(pdf_reader)
|
||||
document = self._converter(pdf_reader)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
|
||||
)
|
||||
continue
|
||||
|
||||
if document.content is None or document.content.strip() == "":
|
||||
logger.warning(
|
||||
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
|
||||
source=source,
|
||||
)
|
||||
|
||||
merged_metadata = {**bytestream.meta, **metadata}
|
||||
document.meta = merged_metadata
|
||||
documents.append(document)
|
||||
|
||||
@ -90,10 +90,10 @@ class PyPDFToDocument:
|
||||
:returns:
|
||||
Deserialized component.
|
||||
"""
|
||||
custom_converter_data = data["init_parameters"]["converter"]
|
||||
init_parameters = data.get("init_parameters", {})
|
||||
custom_converter_data = init_parameters.get("converter")
|
||||
if custom_converter_data is not None:
|
||||
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
|
||||
|
||||
return default_from_dict(cls, data)
|
||||
|
||||
def _default_convert(self, reader: "PdfReader") -> Document:
|
||||
@ -142,6 +142,12 @@ class PyPDFToDocument:
|
||||
)
|
||||
continue
|
||||
|
||||
if document.content is None or document.content.strip() == "":
|
||||
logger.warning(
|
||||
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
|
||||
source=source,
|
||||
)
|
||||
|
||||
merged_metadata = {**bytestream.meta, **metadata}
|
||||
document.meta = merged_metadata
|
||||
documents.append(document)
|
||||
|
||||
@ -7,10 +7,12 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
|
||||
|
||||
from more_itertools import windowed
|
||||
|
||||
from haystack import Document, component
|
||||
from haystack import Document, component, logging
|
||||
from haystack.core.serialization import default_from_dict, default_to_dict
|
||||
from haystack.utils import deserialize_callable, serialize_callable
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@component
|
||||
class DocumentSplitter:
|
||||
@ -46,7 +48,7 @@ class DocumentSplitter:
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
def __init__( # pylint: disable=too-many-positional-arguments
|
||||
self,
|
||||
split_by: Literal["function", "page", "passage", "sentence", "word"] = "word",
|
||||
split_length: int = 200,
|
||||
@ -112,6 +114,9 @@ class DocumentSplitter:
|
||||
raise ValueError(
|
||||
f"DocumentSplitter only works with text documents but content for document ID {doc.id} is None."
|
||||
)
|
||||
if doc.content == "":
|
||||
logger.warning("Document ID {doc_id} has an empty content. Skipping this document.", doc_id=doc.id)
|
||||
continue
|
||||
units = self._split_into_units(doc.content, self.split_by)
|
||||
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
|
||||
units, self.split_length, self.split_overlap, self.split_threshold
|
||||
@ -173,6 +178,7 @@ class DocumentSplitter:
|
||||
# concatenate the last split with the current one
|
||||
text_splits[-1] += txt
|
||||
|
||||
# NOTE: This line skips documents that have content=""
|
||||
elif len(txt) > 0:
|
||||
text_splits.append(txt)
|
||||
splits_pages.append(cur_page)
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add warning logs to the PDFMinerToDocument and PyPDFToDocument to indicate when a processed PDF file has no content.
|
||||
This can happen if the PDF file is a scanned image.
|
||||
Also added an explicit check and warning message to the DocumentSplitter that warns the user that empty Documents are skipped.
|
||||
This behavior was already occurring, but now its clearer through logs that this is happening.
|
||||
@ -126,3 +126,11 @@ class TestPDFMinerToDocument:
|
||||
results = converter.run(sources=sources)
|
||||
assert "Could not read non_existing_file.pdf" in caplog.text
|
||||
assert results["documents"] == []
|
||||
|
||||
def test_run_empty_document(self, caplog, test_files_path):
|
||||
sources = [test_files_path / "pdf" / "non_text_searchable.pdf"]
|
||||
converter = PDFMinerToDocument()
|
||||
with caplog.at_level(logging.WARNING):
|
||||
results = converter.run(sources=sources)
|
||||
assert "PDFMinerToDocument could not extract text from the file" in caplog.text
|
||||
assert results["documents"][0].content == ""
|
||||
|
||||
@ -63,9 +63,13 @@ class TestPyPDFToDocument:
|
||||
assert isinstance(instance, PyPDFToDocument)
|
||||
assert instance.converter is None
|
||||
|
||||
def test_from_dict_defaults(self):
|
||||
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {}}
|
||||
instance = PyPDFToDocument.from_dict(data)
|
||||
assert isinstance(instance, PyPDFToDocument)
|
||||
assert instance.converter is None
|
||||
|
||||
def test_from_dict_custom_converter(self):
|
||||
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
|
||||
data = pypdf_converter.to_dict()
|
||||
data = {
|
||||
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
|
||||
"init_parameters": {
|
||||
@ -161,3 +165,10 @@ class TestPyPDFToDocument:
|
||||
assert len(docs) == 1
|
||||
assert "ReAct" not in docs[0].content
|
||||
assert "I don't care about converting given pdfs, I always return this" in docs[0].content
|
||||
|
||||
def test_run_empty_document(self, caplog, test_files_path):
|
||||
paths = [test_files_path / "pdf" / "non_text_searchable.pdf"]
|
||||
with caplog.at_level(logging.WARNING):
|
||||
output = PyPDFToDocument().run(sources=paths)
|
||||
assert "PyPDFToDocument could not extract text from the file" in caplog.text
|
||||
assert output["documents"][0].content == ""
|
||||
|
||||
@ -43,6 +43,7 @@ class TestDocumentSplitter:
|
||||
):
|
||||
splitter = DocumentSplitter()
|
||||
splitter.run(documents=[Document()])
|
||||
assert "DocumentSplitter only works with text documents but content for document ID" in caplog.text
|
||||
|
||||
def test_single_doc(self):
|
||||
with pytest.raises(TypeError, match="DocumentSplitter expects a List of Documents as input."):
|
||||
@ -445,3 +446,21 @@ class TestDocumentSplitter:
|
||||
assert original_splitter.split_by == deserialized_splitter.split_by
|
||||
assert callable(deserialized_splitter.splitting_function)
|
||||
assert deserialized_splitter.splitting_function("a.b.c") == ["a", "b", "c"]
|
||||
|
||||
def test_run_empty_document(self):
|
||||
"""
|
||||
Test if the component runs correctly with an empty document.
|
||||
"""
|
||||
splitter = DocumentSplitter()
|
||||
doc = Document(content="")
|
||||
results = splitter.run([doc])
|
||||
assert results["documents"] == []
|
||||
|
||||
def test_run_document_only_whitespaces(self):
|
||||
"""
|
||||
Test if the component runs correctly with a document containing only whitespaces.
|
||||
"""
|
||||
splitter = DocumentSplitter()
|
||||
doc = Document(content=" ")
|
||||
results = splitter.run([doc])
|
||||
assert results["documents"][0].content == " "
|
||||
|
||||
BIN
test/test_files/pdf/non_text_searchable.pdf
Normal file
BIN
test/test_files/pdf/non_text_searchable.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user