fix: LLMMetadataExtractor bug when handling Document objects with no content

* test(extractors): Add unit test for LLMMetadataExtractor with no content

Adds a new unit test `test_run_with_document_content_none` to `TestLLMMetadataExtractor`.

This test verifies that `LLMMetadataExtractor` correctly handles documents where `document.content` is None or an empty string.

It ensures that:

- Such documents are added to the `failed_documents` list.

- The correct error message ("Document has no content, skipping LLM call.") is present in their metadata.

- No actual LLM call is attempted for these documents.

This test provides coverage for the fix, which prevents an `AttributeError` when processing documents with no content.
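
For context, a minimal usage sketch of the fixed behavior. This assumes Haystack's public import paths (`haystack.components.extractors`, `haystack.components.generators.chat`) and reuses the constructor arguments from the test below; the prompt and API key are placeholders:

```python
import os

from haystack import Document
from haystack.components.extractors import LLMMetadataExtractor
from haystack.components.generators.chat import OpenAIChatGenerator

# Placeholder key: no LLM call is made for content-less documents anyway.
os.environ.setdefault("OPENAI_API_KEY", "test-api-key")

extractor = LLMMetadataExtractor(
    prompt="Extract metadata from: {{document.content}}",  # placeholder prompt
    chat_generator=OpenAIChatGenerator(),
    expected_keys=["some_key"],
)

result = extractor.run(documents=[Document(content=None), Document(content="")])

# Both documents are routed to failed_documents instead of raising AttributeError.
assert result["documents"] == []
assert all(
    doc.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
    for doc in result["failed_documents"]
)
```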

* chore: update comment to reflect new behavior in the `_run_on_thread` method

* docs: Add release note for LLMMetadataExtractor no content fix

* Update releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

* Update fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

---------

Co-authored-by: David S. Batista <dsbatista@gmail.com>

@@ -256,9 +256,9 @@ class LLMMetadataExtractor:
         return all_prompts
 
     def _run_on_thread(self, prompt: Optional[ChatMessage]) -> Dict[str, Any]:
-        # If prompt is None, return an empty dictionary
+        # If prompt is None, return an error dictionary
         if prompt is None:
-            return {"replies": ["{}"]}
+            return {"error": "Document has no content, skipping LLM call."}
         try:
             result = self._chat_generator.run(messages=[prompt])
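
The design choice in this hunk: `_run_on_thread` now signals a missing prompt with an explicit `error` key rather than a fake empty reply, so the caller can route the document to `failed_documents`. A hypothetical sketch of such a consumer (the actual `run` logic is not part of this diff; the function and parameter names below are illustrative only):

```python
# Hypothetical consumer of _run_on_thread results; not the actual run()
# implementation, which this diff does not show.
def sort_results(documents, results):
    successful, failed = [], []
    for document, result in zip(documents, results):
        if "error" in result:
            # Surface the failure on the document, using the meta key
            # asserted in the new unit test.
            document.meta["metadata_extraction_error"] = result["error"]
            failed.append(document)
        else:
            successful.append(document)
    return successful, failed
```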

@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixed a bug in the `LLMMetadataExtractor` that occurred when
+    processing `Document` objects with `None` or empty string content. The
+    component now gracefully handles these cases by marking such documents as
+    failed and providing an appropriate error message in their metadata, without
+    attempting an LLM call.

@@ -219,6 +219,40 @@ class TestLLMMetadataExtractor:
         assert result["documents"] == []
         assert result["failed_documents"] == []
 
+    def test_run_with_document_content_none(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+
+        # Mock the chat generator to prevent actual LLM calls
+        mock_chat_generator = Mock(spec=OpenAIChatGenerator)
+        extractor = LLMMetadataExtractor(
+            prompt="prompt {{document.content}}", chat_generator=mock_chat_generator, expected_keys=["some_key"]
+        )
+
+        # Document with None content
+        doc_with_none_content = Document(content=None)
+        # Also test with empty string content
+        doc_with_empty_content = Document(content="")
+        docs = [doc_with_none_content, doc_with_empty_content]
+
+        result = extractor.run(documents=docs)
+
+        # Assert that the documents are in failed_documents
+        assert len(result["documents"]) == 0
+        assert len(result["failed_documents"]) == 2
+
+        failed_doc_none = result["failed_documents"][0]
+        assert failed_doc_none.id == doc_with_none_content.id
+        assert "metadata_extraction_error" in failed_doc_none.meta
+        assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        failed_doc_empty = result["failed_documents"][1]
+        assert failed_doc_empty.id == doc_with_empty_content.id
+        assert "metadata_extraction_error" in failed_doc_empty.meta
+        assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        # Ensure no attempt was made to call the LLM
+        mock_chat_generator.run.assert_not_called()
+
     @pytest.mark.integration
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),