fix: TextDocumentSplitter and DocumentCleaner copy id_hash_keys to newly created Documents (#6083)

* copy id_hash_keys in splitter and cleaner

* reno
This commit is contained in:
Julian Risch 2023-10-17 11:03:48 +02:00 committed by GitHub
parent e963c8acdd
commit 90ddeba579
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 42 additions and 3 deletions

View File

@ -58,7 +58,8 @@ class DocumentCleaner:
@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
"""
Run the DocumentCleaner on the given list of documents
Run the DocumentCleaner on the given list of documents.
The documents' metadata and id_hash_keys remain unchanged.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError("DocumentCleaner expects a List of Documents as input.")
@ -84,7 +85,9 @@ class DocumentCleaner:
if self.remove_repeated_substrings:
text = self._remove_repeated_substrings(text)
cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
cleaned_docs.append(
Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
)
return {"documents": cleaned_docs}

View File

@ -39,6 +39,7 @@ class TextDocumentSplitter:
Splits the documents by split_by after split_length units with an overlap of split_overlap units.
Returns a list of documents with the split texts.
A metadata field "source_id" is added to each document to keep track of the original document that was split.
Other metadata and id_hash_keys are copied from the original document.
:param documents: The documents to split.
:return: A list of documents with the split texts.
"""
@ -54,8 +55,9 @@ class TextDocumentSplitter:
units = self._split_into_units(doc.text, self.split_by)
text_splits = self._concatenate_units(units, self.split_length, self.split_overlap)
metadata = deepcopy(doc.metadata)
id_hash_keys = deepcopy(doc.id_hash_keys)
metadata["source_id"] = doc.id
split_docs += [Document(text=txt, metadata=metadata) for txt in text_splits]
split_docs += [Document(text=txt, metadata=metadata, id_hash_keys=id_hash_keys) for txt in text_splits]
return {"documents": split_docs}
def to_dict(self) -> Dict[str, Any]:

View File

@ -0,0 +1,4 @@
---
preview:
- |
Fixed a bug that caused TextDocumentSplitter and DocumentCleaner to ignore id_hash_keys and create Documents with duplicate ids if the documents differed only in their metadata.

View File

@ -178,3 +178,18 @@ class TestDocumentCleaner:
Sed do eiusmod tempor."""
result = cleaner.run(documents=[Document(text=text)])
assert result["documents"][0].text == expected_text
@pytest.mark.unit
def test_copy_id_hash_keys_and_metadata(self):
    """
    Check that DocumentCleaner carries id_hash_keys and metadata over to the cleaned Documents.

    The two inputs share the same text and differ only in their "name" metadata, which is
    listed in id_hash_keys. The cleaned Documents are expected to have distinct ids, the
    same id_hash_keys and metadata as their originals, and the text "Text.".
    """
    originals = [
        Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
        Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
    ]
    cleaned = DocumentCleaner().run(documents=originals)["documents"]

    # One cleaned Document per input, and their ids must not collide.
    assert len(cleaned) == 2
    assert cleaned[0].id != cleaned[1].id

    for original, cleaned_doc in zip(originals, cleaned):
        assert original.id_hash_keys == cleaned_doc.id_hash_keys
        assert original.metadata == cleaned_doc.metadata
        assert cleaned_doc.text == "Text."

View File

@ -155,3 +155,18 @@ class TestTextDocumentSplitter:
result = splitter.run(documents=[doc1, doc2])
assert result["documents"][0].metadata["source_id"] == doc1.id
assert result["documents"][1].metadata["source_id"] == doc2.id
@pytest.mark.unit
def test_copy_id_hash_keys_and_metadata(self):
    """
    Check that TextDocumentSplitter carries id_hash_keys and metadata over to the split Documents.

    The two inputs share the same text and differ only in their "name" metadata, which is
    listed in id_hash_keys. The test expects one split per input, distinct ids, unchanged
    id_hash_keys, and the text "Text.".
    """
    originals = [
        Document(text="Text.", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
        Document(text="Text.", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
    ]
    word_splitter = TextDocumentSplitter(split_by="word", split_length=10)
    pieces = word_splitter.run(documents=originals)["documents"]

    # One split Document per input, and their ids must not collide.
    assert len(pieces) == 2
    assert pieces[0].id != pieces[1].id

    for original, split_doc in zip(originals, pieces):
        assert original.id_hash_keys == split_doc.id_hash_keys
        # Subset comparison: the split Document may hold extra metadata entries
        # on top of the original ones.
        assert original.metadata.items() <= split_doc.metadata.items()
        assert split_doc.text == "Text."