From f025501792a5870c062d1d0ecfb2bf2c27d1cfc1 Mon Sep 17 00:00:00 2001
From: Seth Peters <58783402+Seth-Peters@users.noreply.github.com>
Date: Fri, 23 May 2025 19:57:39 +0300
Subject: [PATCH] fix: `LLMMetadataExtractor` bug in handling `Document`
 objects with no content

* test(extractors): Add unit test for LLMMetadataExtractor with no content

Adds a new unit test `test_run_with_document_content_none` to
`TestLLMMetadataExtractor`. This test verifies that `LLMMetadataExtractor`
correctly handles documents where `document.content` is None or an empty
string. It ensures that:
- Such documents are added to the `failed_documents` list.
- The correct error message ("Document has no content, skipping LLM call.")
  is present in their metadata.
- No actual LLM call is attempted for these documents.

This test provides coverage for the fix that prevents an AttributeError when
processing documents with no content.

* chore: update comment to reflect new behavior in _run_on_thread method

* docs: Add release note for LLMMetadataExtractor no content fix

* Update releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

* Update fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

---------

Co-authored-by: David S. Batista
---
 .../extractors/llm_metadata_extractor.py      |  4 +--
 ...extractor-no-content-910067ea72094f18.yaml |  8 +++++
 .../extractors/test_llm_metadata_extractor.py | 34 +++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml

diff --git a/haystack/components/extractors/llm_metadata_extractor.py b/haystack/components/extractors/llm_metadata_extractor.py
index 476e16353..c75c73956 100644
--- a/haystack/components/extractors/llm_metadata_extractor.py
+++ b/haystack/components/extractors/llm_metadata_extractor.py
@@ -256,9 +256,9 @@ class LLMMetadataExtractor:
         return all_prompts
 
     def _run_on_thread(self, prompt: Optional[ChatMessage]) -> Dict[str, Any]:
-        # If prompt is None, return an empty dictionary
+        # If prompt is None, return an error dictionary
         if prompt is None:
-            return {"replies": ["{}"]}
+            return {"error": "Document has no content, skipping LLM call."}
 
         try:
             result = self._chat_generator.run(messages=[prompt])
diff --git a/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml b/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml
new file mode 100644
index 000000000..afc30b195
--- /dev/null
+++ b/releasenotes/notes/fix-llm-metadata-extractor-no-content-910067ea72094f18.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixed a bug in the `LLMMetadataExtractor` that occurred when
+    processing `Document` objects with `None` or empty string content. The
+    component now gracefully handles these cases by marking such documents as
+    failed and providing an appropriate error message in their metadata, without
+    attempting an LLM call.
diff --git a/test/components/extractors/test_llm_metadata_extractor.py b/test/components/extractors/test_llm_metadata_extractor.py
index 45b66f7a2..cb3ac7525 100644
--- a/test/components/extractors/test_llm_metadata_extractor.py
+++ b/test/components/extractors/test_llm_metadata_extractor.py
@@ -219,6 +219,40 @@ class TestLLMMetadataExtractor:
         assert result["documents"] == []
         assert result["failed_documents"] == []
 
+    def test_run_with_document_content_none(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        # Mock the chat generator to prevent actual LLM calls
+        mock_chat_generator = Mock(spec=OpenAIChatGenerator)
+
+        extractor = LLMMetadataExtractor(
+            prompt="prompt {{document.content}}", chat_generator=mock_chat_generator, expected_keys=["some_key"]
+        )
+
+        # Document with None content
+        doc_with_none_content = Document(content=None)
+        # also test with empty string content
+        doc_with_empty_content = Document(content="")
+        docs = [doc_with_none_content, doc_with_empty_content]
+
+        result = extractor.run(documents=docs)
+
+        # Assert that the documents are in failed_documents
+        assert len(result["documents"]) == 0
+        assert len(result["failed_documents"]) == 2
+
+        failed_doc_none = result["failed_documents"][0]
+        assert failed_doc_none.id == doc_with_none_content.id
+        assert "metadata_extraction_error" in failed_doc_none.meta
+        assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        failed_doc_empty = result["failed_documents"][1]
+        assert failed_doc_empty.id == doc_with_empty_content.id
+        assert "metadata_extraction_error" in failed_doc_empty.meta
+        assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
+
+        # Ensure no attempt was made to call the LLM
+        mock_chat_generator.run.assert_not_called()
+
     @pytest.mark.integration
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
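
A minimal usage sketch of the fixed behavior (not part of the patch itself), mirroring the
new unit test above. The import paths and the use of `OpenAIChatGenerator` are assumptions
based on the files touched in this PR, the prompt text and the "some_key" expected key are
illustrative placeholders, and a real OPENAI_API_KEY plus an actual LLM call would be
involved for any document that does have content.

    # Sketch only: assumes Haystack 2.x import paths and OPENAI_API_KEY being set.
    from haystack import Document
    from haystack.components.extractors import LLMMetadataExtractor
    from haystack.components.generators.chat import OpenAIChatGenerator

    extractor = LLMMetadataExtractor(
        prompt="Extract metadata from: {{document.content}}",  # placeholder prompt
        chat_generator=OpenAIChatGenerator(),
        expected_keys=["some_key"],  # placeholder key
    )

    docs = [Document(content=None), Document(content=""), Document(content="Some text.")]
    result = extractor.run(documents=docs)

    # With this fix, content-less documents no longer raise an AttributeError:
    # they are returned in "failed_documents" with the error recorded in their meta.
    for doc in result["failed_documents"]:
        print(doc.id, doc.meta["metadata_extraction_error"])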