mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-06-26 22:00:13 +00:00
fix: `LLMMetadataExtractor` bug in handling `Document` objects with no content
* test(extractors): Add unit test for LLMMetadataExtractor with no content Adds a new unit test `test_run_with_document_content_none` to `TestLLMMetadataExtractor`. This test verifies that `LLMMetadataExtractor` correctly handles documents where `document.content` is None or an empty string. It ensures that: - Such documents are added to the `failed_documents` list. - The correct error message ("Document has no content, skipping LLM call.") is present in their metadata. - No actual LLM call is attempted for these documents. This test provides coverage for the fix that prevents an AttributeError when processing documents with no content. * chore: update comment to reflect new behavior in _run_on_thread method * docs: Add release note for LLMMetadataExtractor no content fix * Update releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml * Update fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml --------- Co-authored-by: David S. Batista <dsbatista@gmail.com>
This commit is contained in:
parent
1fb2477149
commit
f025501792
@ -256,9 +256,9 @@ class LLMMetadataExtractor:
|
||||
return all_prompts
|
||||
|
||||
def _run_on_thread(self, prompt: Optional[ChatMessage]) -> Dict[str, Any]:
|
||||
# If prompt is None, return an empty dictionary
|
||||
# If prompt is None, return an error dictionary
|
||||
if prompt is None:
|
||||
return {"replies": ["{}"]}
|
||||
return {"error": "Document has no content, skipping LLM call."}
|
||||
|
||||
try:
|
||||
result = self._chat_generator.run(messages=[prompt])
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixed a bug in the `LLMMetadataExtractor` that occurred when
|
||||
processing `Document` objects with `None` or empty string content. The
|
||||
component now gracefully handles these cases by marking such documents as
|
||||
failed and providing an appropriate error message in their metadata, without
|
||||
attempting an LLM call.
|
@ -219,6 +219,40 @@ class TestLLMMetadataExtractor:
|
||||
assert result["documents"] == []
|
||||
assert result["failed_documents"] == []
|
||||
|
||||
def test_run_with_document_content_none(self, monkeypatch):
    """Documents whose content is None or "" must be marked failed and must not trigger an LLM call."""
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

    # Spec'd mock so any real generator invocation would be visible below.
    generator_mock = Mock(spec=OpenAIChatGenerator)

    extractor = LLMMetadataExtractor(
        prompt="prompt {{document.content}}", chat_generator=generator_mock, expected_keys=["some_key"]
    )

    # One document with None content and one with empty-string content.
    empty_docs = [Document(content=None), Document(content="")]

    result = extractor.run(documents=empty_docs)

    # Every contentless document ends up in failed_documents; none succeed.
    assert len(result["documents"]) == 0
    assert len(result["failed_documents"]) == 2

    expected_error = "Document has no content, skipping LLM call."
    for source_doc, failed_doc in zip(empty_docs, result["failed_documents"]):
        assert failed_doc.id == source_doc.id
        assert "metadata_extraction_error" in failed_doc.meta
        assert failed_doc.meta["metadata_extraction_error"] == expected_error

    # The chat generator must never have been invoked for these documents.
    generator_mock.run.assert_not_called()
|
||||
|
||||
@pytest.mark.integration
|
||||
@pytest.mark.skipif(
|
||||
not os.environ.get("OPENAI_API_KEY", None),
|
||||
|
Loading…
x
Reference in New Issue
Block a user