From f025501792a5870c062d1d0ecfb2bf2c27d1cfc1 Mon Sep 17 00:00:00 2001
From: Seth Peters <58783402+Seth-Peters@users.noreply.github.com>
Date: Fri, 23 May 2025 19:57:39 +0300
Subject: [PATCH] fix: `LLMMetadataExtractor` bug in handling `Document`
 objects with no content

* test(extractors): Add unit test for LLMMetadataExtractor with no content

Adds a new unit test `test_run_with_document_content_none` to
`TestLLMMetadataExtractor`. This test verifies that `LLMMetadataExtractor`
correctly handles documents where `document.content` is None or an empty
string. It ensures that:
- Such documents are added to the `failed_documents` list.
- The correct error message ("Document has no content, skipping LLM call.")
  is present in their metadata.
- No actual LLM call is attempted for these documents.

This test provides coverage for the fix that prevents an AttributeError when
processing documents with no content.

* chore: update comment to reflect new behavior in _run_on_thread method

* docs: Add release note for LLMMetadataExtractor no content fix

* Update releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

* Update fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

---------

Co-authored-by: David S. Batista
---
 .../extractors/llm_metadata_extractor.py      |  4 +--
 ...extractor-no-content-910067ea72094f18.yaml |  8 +++++
 .../extractors/test_llm_metadata_extractor.py | 34 +++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

diff --git a/haystack/components/extractors/llm_metadata_extractor.py b/haystack/components/extractors/llm_metadata_extractor.py
index 476e16353..c75c73956 100644
--- a/haystack/components/extractors/llm_metadata_extractor.py
+++ b/haystack/components/extractors/llm_metadata_extractor.py
@@ -256,9 +256,9 @@ class LLMMetadataExtractor:
         return all_prompts
 
     def _run_on_thread(self, prompt: Optional[ChatMessage]) -> Dict[str, Any]:
-        # If prompt is None, return an empty dictionary
+        # If prompt is None, return an error dictionary
         if prompt is None:
-            return {"replies": ["{}"]}
+            return {"error": "Document has no content, skipping LLM call."}
 
         try:
             result = self._chat_generator.run(messages=[prompt])
diff --git a/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml b/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml
new file mode 100644
index 000000000..afc30b195
--- /dev/null
+++ b/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixed a bug in the `LLMMetadataExtractor` that occurred when
+    processing `Document` objects with `None` or empty string content. The
+    component now gracefully handles these cases by marking such documents as
+    failed and providing an appropriate error message in their metadata, without
+    attempting an LLM call.
diff --git a/test/components/extractors/test_llm_metadata_extractor.py b/test/components/extractors/test_llm_metadata_extractor.py
index 45b66f7a2..cb3ac7525 100644
--- a/test/components/extractors/test_llm_metadata_extractor.py
+++ b/test/components/extractors/test_llm_metadata_extractor.py
@@ -219,6 +219,40 @@ class TestLLMMetadataExtractor:
         assert result["documents"] == []
         assert result["failed_documents"] == []
 
+    def test_run_with_document_content_none(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        # Mock the chat generator to prevent actual LLM calls
+        mock_chat_generator = Mock(spec=OpenAIChatGenerator)
+
+        extractor = LLMMetadataExtractor(
+            prompt="prompt {{document.content}}", chat_generator=mock_chat_generator, expected_keys=["some_key"]
+        )
+
+        # Document with None content
+        doc_with_none_content = Document(content=None)
+        # also test with empty string content
+        doc_with_empty_content = Document(content="")
+        docs = [doc_with_none_content, doc_with_empty_content]
+
+        result = extractor.run(documents=docs)
+
+        # Assert that the documents are in failed_documents
+        assert len(result["documents"]) == 0
+        assert len(result["failed_documents"]) == 2
+
+        failed_doc_none = result["failed_documents"][0]
+        assert failed_doc_none.id == doc_with_none_content.id
+        assert "metadata_extraction_error" in failed_doc_none.meta
+        assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        failed_doc_empty = result["failed_documents"][1]
+        assert failed_doc_empty.id == doc_with_empty_content.id
+        assert "metadata_extraction_error" in failed_doc_empty.meta
+        assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        # Ensure no attempt was made to call the LLM
+        mock_chat_generator.run.assert_not_called()
+
     @pytest.mark.integration
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
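
A minimal usage sketch of the fixed behavior (not part of the patch itself), mirroring the
new unit test above. The import paths and the use of `OpenAIChatGenerator` are assumptions
based on the files touched in this PR, the prompt text and the "some_key" expected key are
illustrative placeholders, and a real OPENAI_API_KEY plus an actual LLM call would be
involved for any document that does have content.

    # Sketch only: assumes Haystack 2.x import paths and OPENAI_API_KEY being set.
    from haystack import Document
    from haystack.components.extractors import LLMMetadataExtractor
    from haystack.components.generators.chat import OpenAIChatGenerator

    extractor = LLMMetadataExtractor(
        prompt="Extract metadata from: {{document.content}}",  # placeholder prompt
        chat_generator=OpenAIChatGenerator(),
        expected_keys=["some_key"],  # placeholder key
    )

    docs = [Document(content=None), Document(content=""), Document(content="Some text.")]
    result = extractor.run(documents=docs)

    # With this fix, content-less documents no longer raise an AttributeError:
    # they are returned in "failed_documents" with the error recorded in their meta.
    for doc in result["failed_documents"]:
        print(doc.id, doc.meta["metadata_extraction_error"])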