Update LLMMetadataExtractor to properly retrigger document ID creation after adding new metadata to docs

This commit is contained in:
Sebastian Husch Lee 2025-06-25 14:17:00 +02:00
parent 1d1c13a8bc
commit 380f1f396c
2 changed files with 28 additions and 14 deletions

View File

@ -5,6 +5,7 @@
import copy import copy
import json import json
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from dataclasses import replace
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from jinja2 import meta from jinja2 import meta
@ -319,23 +320,31 @@ class LLMMetadataExtractor:
failed_documents = [] failed_documents = []
for document, result in zip(documents, results): for document, result in zip(documents, results):
if "error" in result: if "error" in result:
document.meta["metadata_extraction_error"] = result["error"] new_meta = {
document.meta["metadata_extraction_response"] = None **document.meta,
failed_documents.append(document) "metadata_extraction_error": result["error"],
"metadata_extraction_response": None,
}
# We set id to an empty string to retrigger new id creation
failed_documents.append(replace(document, meta=new_meta, id=""))
continue continue
parsed_metadata = self._extract_metadata(result["replies"][0].text) parsed_metadata = self._extract_metadata(result["replies"][0].text)
if "error" in parsed_metadata: if "error" in parsed_metadata:
document.meta["metadata_extraction_error"] = parsed_metadata["error"] new_meta = {
document.meta["metadata_extraction_response"] = result["replies"][0] **document.meta,
failed_documents.append(document) "metadata_extraction_error": parsed_metadata["error"],
"metadata_extraction_response": result["replies"][0],
}
# We set id to an empty string to retrigger new id creation
failed_documents.append(replace(document, meta=new_meta, id=""))
continue continue
for key in parsed_metadata: new_meta = {**document.meta, **parsed_metadata}
document.meta[key] = parsed_metadata[key] # Remove metadata_extraction_error and metadata_extraction_response if present from previous runs
# Remove metadata_extraction_error and metadata_extraction_response if present from previous runs new_meta.pop("metadata_extraction_error", None)
document.meta.pop("metadata_extraction_error", None) new_meta.pop("metadata_extraction_response", None)
document.meta.pop("metadata_extraction_response", None) # We set id to an empty string to retrigger new id creation
successful_documents.append(document) successful_documents.append(replace(document, meta=new_meta, id=""))
return {"documents": successful_documents, "failed_documents": failed_documents} return {"documents": successful_documents, "failed_documents": failed_documents}

View File

@ -244,12 +244,12 @@ class TestLLMMetadataExtractor:
assert len(result["failed_documents"]) == 2 assert len(result["failed_documents"]) == 2
failed_doc_none = result["failed_documents"][0] failed_doc_none = result["failed_documents"][0]
assert failed_doc_none.id == doc_with_none_content.id assert failed_doc_none.id != doc_with_none_content.id
assert "metadata_extraction_error" in failed_doc_none.meta assert "metadata_extraction_error" in failed_doc_none.meta
assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call." assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
failed_doc_empty = result["failed_documents"][1] failed_doc_empty = result["failed_documents"][1]
assert failed_doc_empty.id == doc_with_empty_content.id assert failed_doc_empty.id != doc_with_empty_content.id
assert "metadata_extraction_error" in failed_doc_empty.meta assert "metadata_extraction_error" in failed_doc_empty.meta
assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call." assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
@ -322,3 +322,8 @@ output:
assert len(doc_store_docs) == 2 assert len(doc_store_docs) == 2
assert "entities" in doc_store_docs[0].meta assert "entities" in doc_store_docs[0].meta
assert "entities" in doc_store_docs[1].meta assert "entities" in doc_store_docs[1].meta
# Check that IDs of documents in doc store are different from the original documents
doc_store_doc_ids = {doc.id for doc in doc_store_docs}
original_doc_ids = {doc.id for doc in docs}
assert doc_store_doc_ids != original_doc_ids