fix: LLMMetadataExtractor bug when handling Document objects with no content

* test(extractors): Add unit test for LLMMetadataExtractor with no content

Adds a new unit test `test_run_with_document_content_none` to `TestLLMMetadataExtractor`.

This test verifies that `LLMMetadataExtractor` correctly handles documents where `document.content` is None or an empty string.

It ensures that:

- Such documents are added to the `failed_documents` list.

- The correct error message ("Document has no content, skipping LLM call.") is present in their metadata.

- No actual LLM call is attempted for these documents.

This test provides coverage for the fix, which prevents an `AttributeError` when processing documents with no content.
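
For context, a minimal usage sketch of the fixed behavior. This assumes Haystack's public import paths (`haystack.components.extractors`, `haystack.components.generators.chat`) and reuses the constructor arguments from the test below; the prompt and API key are placeholders:

```python
import os

from haystack import Document
from haystack.components.extractors import LLMMetadataExtractor
from haystack.components.generators.chat import OpenAIChatGenerator

# Placeholder key: no LLM call is made for content-less documents anyway.
os.environ.setdefault("OPENAI_API_KEY", "test-api-key")

extractor = LLMMetadataExtractor(
    prompt="Extract metadata from: {{document.content}}",  # placeholder prompt
    chat_generator=OpenAIChatGenerator(),
    expected_keys=["some_key"],
)

result = extractor.run(documents=[Document(content=None), Document(content="")])

# Both documents are routed to failed_documents instead of raising AttributeError.
assert result["documents"] == []
assert all(
    doc.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
    for doc in result["failed_documents"]
)
```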

* chore: update comment to reflect new behavior in the `_run_on_thread` method

* docs: Add release note for LLMMetadataExtractor no content fix

* Update releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

* Update fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>

@@ -256,9 +256,9 @@ class LLMMetadataExtractor:
         return all_prompts
 
     def _run_on_thread(self, prompt: Optional[ChatMessage]) -> Dict[str, Any]:
-        # If prompt is None, return an empty dictionary
+        # If prompt is None, return an error dictionary
         if prompt is None:
-            return {"replies": ["{}"]}
+            return {"error": "Document has no content, skipping LLM call."}
         try:
             result = self._chat_generator.run(messages=[prompt])
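
The design choice in this hunk: `_run_on_thread` now signals a missing prompt with an explicit `error` key rather than a fake empty reply, so the caller can route the document to `failed_documents`. A hypothetical sketch of such a consumer (the actual `run` logic is not part of this diff; the function and parameter names below are illustrative only):

```python
# Hypothetical consumer of _run_on_thread results; not the actual run()
# implementation, which this diff does not show.
def sort_results(documents, results):
    successful, failed = [], []
    for document, result in zip(documents, results):
        if "error" in result:
            # Surface the failure on the document, using the meta key
            # asserted in the new unit test.
            document.meta["metadata_extraction_error"] = result["error"]
            failed.append(document)
        else:
            successful.append(document)
    return successful, failed
```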

@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixed a bug in the `LLMMetadataExtractor` that occurred when
+    processing `Document` objects with `None` or empty string content. The
+    component now gracefully handles these cases by marking such documents as
+    failed and providing an appropriate error message in their metadata, without
+    attempting an LLM call.

@@ -219,6 +219,40 @@ class TestLLMMetadataExtractor:
         assert result["documents"] == []
         assert result["failed_documents"] == []
 
+    def test_run_with_document_content_none(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+
+        # Mock the chat generator to prevent actual LLM calls
+        mock_chat_generator = Mock(spec=OpenAIChatGenerator)
+        extractor = LLMMetadataExtractor(
+            prompt="prompt {{document.content}}", chat_generator=mock_chat_generator, expected_keys=["some_key"]
+        )
+
+        # Document with None content
+        doc_with_none_content = Document(content=None)
+        # Also test with empty string content
+        doc_with_empty_content = Document(content="")
+        docs = [doc_with_none_content, doc_with_empty_content]
+
+        result = extractor.run(documents=docs)
+
+        # Assert that the documents are in failed_documents
+        assert len(result["documents"]) == 0
+        assert len(result["failed_documents"]) == 2
+
+        failed_doc_none = result["failed_documents"][0]
+        assert failed_doc_none.id == doc_with_none_content.id
+        assert "metadata_extraction_error" in failed_doc_none.meta
+        assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        failed_doc_empty = result["failed_documents"][1]
+        assert failed_doc_empty.id == doc_with_empty_content.id
+        assert "metadata_extraction_error" in failed_doc_empty.meta
+        assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        # Ensure no attempt was made to call the LLM
+        mock_chat_generator.run.assert_not_called()
+
     @pytest.mark.integration
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),