From 380f1f396c6678d5fcb46b0375ac992f3fe2b78c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 25 Jun 2025 14:17:00 +0200 Subject: [PATCH] Update LLMMetaDataExtractor to properly retrigger document ID creation after adding new metadata to docs --- .../extractors/llm_metadata_extractor.py | 33 ++++++++++++------- .../extractors/test_llm_metadata_extractor.py | 9 +++-- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/haystack/components/extractors/llm_metadata_extractor.py b/haystack/components/extractors/llm_metadata_extractor.py index bb798da66..0a21191bd 100644 --- a/haystack/components/extractors/llm_metadata_extractor.py +++ b/haystack/components/extractors/llm_metadata_extractor.py @@ -5,6 +5,7 @@ import copy import json from concurrent.futures import ThreadPoolExecutor +from dataclasses import replace from typing import Any, Dict, List, Optional, Union from jinja2 import meta @@ -319,23 +320,31 @@ class LLMMetadataExtractor: failed_documents = [] for document, result in zip(documents, results): if "error" in result: - document.meta["metadata_extraction_error"] = result["error"] - document.meta["metadata_extraction_response"] = None - failed_documents.append(document) + new_meta = { + **document.meta, + "metadata_extraction_error": result["error"], + "metadata_extraction_response": None, + } + # We set id to an empty string to retrigger new id creation + failed_documents.append(replace(document, meta=new_meta, id="")) continue parsed_metadata = self._extract_metadata(result["replies"][0].text) if "error" in parsed_metadata: - document.meta["metadata_extraction_error"] = parsed_metadata["error"] - document.meta["metadata_extraction_response"] = result["replies"][0] - failed_documents.append(document) + new_meta = { + **document.meta, + "metadata_extraction_error": parsed_metadata["error"], + "metadata_extraction_response": result["replies"][0], + } + # We set id to an empty string to retrigger new id creation + failed_documents.append(replace(document, meta=new_meta, id="")) continue - for key in parsed_metadata: - document.meta[key] = parsed_metadata[key] - # Remove metadata_extraction_error and metadata_extraction_response if present from previous runs - document.meta.pop("metadata_extraction_error", None) - document.meta.pop("metadata_extraction_response", None) - successful_documents.append(document) + new_meta = {**document.meta, **parsed_metadata} + # Remove metadata_extraction_error and metadata_extraction_response if present from previous runs + new_meta.pop("metadata_extraction_error", None) + new_meta.pop("metadata_extraction_response", None) + # We set id to an empty string to retrigger new id creation + successful_documents.append(replace(document, meta=new_meta, id="")) return {"documents": successful_documents, "failed_documents": failed_documents} diff --git a/test/components/extractors/test_llm_metadata_extractor.py b/test/components/extractors/test_llm_metadata_extractor.py index 31417efeb..e3fc6a6be 100644 --- a/test/components/extractors/test_llm_metadata_extractor.py +++ b/test/components/extractors/test_llm_metadata_extractor.py @@ -244,12 +244,12 @@ class TestLLMMetadataExtractor: assert len(result["failed_documents"]) == 2 failed_doc_none = result["failed_documents"][0] - assert failed_doc_none.id == doc_with_none_content.id + assert failed_doc_none.id != doc_with_none_content.id assert "metadata_extraction_error" in failed_doc_none.meta assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call." failed_doc_empty = result["failed_documents"][1] - assert failed_doc_empty.id == doc_with_empty_content.id + assert failed_doc_empty.id != doc_with_empty_content.id assert "metadata_extraction_error" in failed_doc_empty.meta assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call." @@ -322,3 +322,8 @@ output: assert len(doc_store_docs) == 2 assert "entities" in doc_store_docs[0].meta assert "entities" in doc_store_docs[1].meta + + # Check that IDs of documents in doc store are different from the original documents + doc_store_doc_ids = {doc.id for doc in doc_store_docs} + original_doc_ids = {doc.id for doc in docs} + assert doc_store_doc_ids != original_doc_ids