mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-30 00:30:09 +00:00
fix: DocumentSplitter and DocumentCleaner copy id_hash_keys to newly created Documents (#6083)
* copy id_hash_keys in splitter and cleaner * reno
This commit is contained in:
parent
e963c8acdd
commit
90ddeba579
@ -58,7 +58,8 @@ class DocumentCleaner:
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, documents: List[Document]):
|
||||
"""
|
||||
Run the DocumentCleaner on the given list of documents
|
||||
Run the DocumentCleaner on the given list of documents.
|
||||
The documents' metadata and id_hash_keys remain unchanged.
|
||||
"""
|
||||
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
|
||||
raise TypeError("DocumentCleaner expects a List of Documents as input.")
|
||||
@ -84,7 +85,9 @@ class DocumentCleaner:
|
||||
if self.remove_repeated_substrings:
|
||||
text = self._remove_repeated_substrings(text)
|
||||
|
||||
cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
|
||||
cleaned_docs.append(
|
||||
Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
|
||||
)
|
||||
|
||||
return {"documents": cleaned_docs}
|
||||
|
||||
|
||||
@ -39,6 +39,7 @@ class TextDocumentSplitter:
|
||||
Splits the documents by split_by after split_length units with an overlap of split_overlap units.
|
||||
Returns a list of documents with the split texts.
|
||||
A metadata field "source_id" is added to each document to keep track of the original document that was split.
|
||||
Other metadata and id_hash_keys are copied from the original document.
|
||||
:param documents: The documents to split.
|
||||
:return: A list of documents with the split texts.
|
||||
"""
|
||||
@ -54,8 +55,9 @@ class TextDocumentSplitter:
|
||||
units = self._split_into_units(doc.text, self.split_by)
|
||||
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
|
||||
metadata = deepcopy(doc.metadata)
|
||||
id_hash_keys = deepcopy(doc.id_hash_keys)
|
||||
metadata["source_id"] = doc.id
|
||||
split_docs += [Document(text=txt, metadata=metadata) for txt in text_splits]
|
||||
split_docs += [Document(text=txt, metadata=metadata, id_hash_keys=id_hash_keys) for txt in text_splits]
|
||||
return {"documents": split_docs}
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Fixed a bug that caused TextDocumentSplitter and DocumentCleaner to ignore id_hash_keys and create Documents with duplicate ids if the documents differed only in their metadata.
|
||||
@ -178,3 +178,18 @@ class TestDocumentCleaner:
|
||||
Sed do eiusmod tempor."""
|
||||
result = cleaner.run(documents=[Document(text=text)])
|
||||
assert result["documents"][0].text == expected_text
|
||||
|
||||
@pytest.mark.unit
def test_copy_id_hash_keys_and_metadata(self):
    """The cleaner must propagate id_hash_keys and metadata to the cleaned
    Documents, so two inputs differing only in metadata keep distinct ids."""
    cleaner = DocumentCleaner()
    inputs = [
        Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
        Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
    ]
    outputs = cleaner.run(documents=inputs)["documents"]
    assert len(outputs) == 2
    # ids differ because id_hash_keys=["name"] was honored, not dropped
    assert outputs[0].id != outputs[1].id
    for idx, cleaned in enumerate(outputs):
        original = inputs[idx]
        assert cleaned.id_hash_keys == original.id_hash_keys
        assert cleaned.metadata == original.metadata
        assert cleaned.text == "Text."
|
||||
|
||||
@ -155,3 +155,18 @@ class TestTextDocumentSplitter:
|
||||
result = splitter.run(documents=[doc1, doc2])
|
||||
assert result["documents"][0].metadata["source_id"] == doc1.id
|
||||
assert result["documents"][1].metadata["source_id"] == doc2.id
|
||||
|
||||
@pytest.mark.unit
def test_copy_id_hash_keys_and_metadata(self):
    """The splitter must carry id_hash_keys and the original metadata over to
    each split Document (the splitter also adds "source_id", so the original
    metadata is a subset rather than an exact match)."""
    splitter = TextDocumentSplitter(split_by="word", split_length=10)
    inputs = [
        Document(text="Text.", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
        Document(text="Text.", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
    ]
    outputs = splitter.run(documents=inputs)["documents"]
    assert len(outputs) == 2
    # distinct ids prove id_hash_keys=["name"] survived the split
    assert outputs[0].id != outputs[1].id
    for idx, split_doc in enumerate(outputs):
        original = inputs[idx]
        assert split_doc.id_hash_keys == original.id_hash_keys
        # original metadata must be contained in the split doc's metadata
        assert original.metadata.items() <= split_doc.metadata.items()
        assert split_doc.text == "Text."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user