mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-14 16:47:06 +00:00
Update LLMMetadataExtractor to properly retrigger document ID creation after adding new metadata to docs
This commit is contained in:
parent
1d1c13a8bc
commit
380f1f396c
@ -5,6 +5,7 @@
|
|||||||
import copy
|
import copy
|
||||||
import json
|
import json
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from dataclasses import replace
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
from jinja2 import meta
|
from jinja2 import meta
|
||||||
@ -319,23 +320,31 @@ class LLMMetadataExtractor:
|
|||||||
failed_documents = []
|
failed_documents = []
|
||||||
for document, result in zip(documents, results):
|
for document, result in zip(documents, results):
|
||||||
if "error" in result:
|
if "error" in result:
|
||||||
document.meta["metadata_extraction_error"] = result["error"]
|
new_meta = {
|
||||||
document.meta["metadata_extraction_response"] = None
|
**document.meta,
|
||||||
failed_documents.append(document)
|
"metadata_extraction_error": result["error"],
|
||||||
|
"metadata_extraction_response": None,
|
||||||
|
}
|
||||||
|
# We set id to an empty string to retrigger new id creation
|
||||||
|
failed_documents.append(replace(document, meta=new_meta, id=""))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
parsed_metadata = self._extract_metadata(result["replies"][0].text)
|
parsed_metadata = self._extract_metadata(result["replies"][0].text)
|
||||||
if "error" in parsed_metadata:
|
if "error" in parsed_metadata:
|
||||||
document.meta["metadata_extraction_error"] = parsed_metadata["error"]
|
new_meta = {
|
||||||
document.meta["metadata_extraction_response"] = result["replies"][0]
|
**document.meta,
|
||||||
failed_documents.append(document)
|
"metadata_extraction_error": parsed_metadata["error"],
|
||||||
|
"metadata_extraction_response": result["replies"][0],
|
||||||
|
}
|
||||||
|
# We set id to an empty string to retrigger new id creation
|
||||||
|
failed_documents.append(replace(document, meta=new_meta, id=""))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for key in parsed_metadata:
|
new_meta = {**document.meta, **parsed_metadata}
|
||||||
document.meta[key] = parsed_metadata[key]
|
# Remove metadata_extraction_error and metadata_extraction_response if present from previous runs
|
||||||
# Remove metadata_extraction_error and metadata_extraction_response if present from previous runs
|
new_meta.pop("metadata_extraction_error", None)
|
||||||
document.meta.pop("metadata_extraction_error", None)
|
new_meta.pop("metadata_extraction_response", None)
|
||||||
document.meta.pop("metadata_extraction_response", None)
|
# We set id to an empty string to retrigger new id creation
|
||||||
successful_documents.append(document)
|
successful_documents.append(replace(document, meta=new_meta, id=""))
|
||||||
|
|
||||||
return {"documents": successful_documents, "failed_documents": failed_documents}
|
return {"documents": successful_documents, "failed_documents": failed_documents}
|
||||||
|
|||||||
@ -244,12 +244,12 @@ class TestLLMMetadataExtractor:
|
|||||||
assert len(result["failed_documents"]) == 2
|
assert len(result["failed_documents"]) == 2
|
||||||
|
|
||||||
failed_doc_none = result["failed_documents"][0]
|
failed_doc_none = result["failed_documents"][0]
|
||||||
assert failed_doc_none.id == doc_with_none_content.id
|
assert failed_doc_none.id != doc_with_none_content.id
|
||||||
assert "metadata_extraction_error" in failed_doc_none.meta
|
assert "metadata_extraction_error" in failed_doc_none.meta
|
||||||
assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
|
assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
|
||||||
|
|
||||||
failed_doc_empty = result["failed_documents"][1]
|
failed_doc_empty = result["failed_documents"][1]
|
||||||
assert failed_doc_empty.id == doc_with_empty_content.id
|
assert failed_doc_empty.id != doc_with_empty_content.id
|
||||||
assert "metadata_extraction_error" in failed_doc_empty.meta
|
assert "metadata_extraction_error" in failed_doc_empty.meta
|
||||||
assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
|
assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
|
||||||
|
|
||||||
@ -322,3 +322,8 @@ output:
|
|||||||
assert len(doc_store_docs) == 2
|
assert len(doc_store_docs) == 2
|
||||||
assert "entities" in doc_store_docs[0].meta
|
assert "entities" in doc_store_docs[0].meta
|
||||||
assert "entities" in doc_store_docs[1].meta
|
assert "entities" in doc_store_docs[1].meta
|
||||||
|
|
||||||
|
# Check that IDs of documents in doc store are different from the original documents
|
||||||
|
doc_store_doc_ids = {doc.id for doc in doc_store_docs}
|
||||||
|
original_doc_ids = {doc.id for doc in docs}
|
||||||
|
assert doc_store_doc_ids != original_doc_ids
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user