Remove id_hash_keys from DocumentCleaner (#6123)

This commit is contained in:
Silvano Cerza 2023-10-20 15:16:06 +02:00 committed by GitHub
parent 366f0366bf
commit ec376c7dbd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 8 deletions

View File

@@ -59,7 +59,7 @@ class DocumentCleaner:
def run(self, documents: List[Document]):
"""
Run the DocumentCleaner on the given list of documents.
-The documents' metadata and id_hash_keys remain unchanged.
+The documents' metadata remain unchanged.
"""
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError("DocumentCleaner expects a List of Documents as input.")
@@ -85,9 +85,7 @@ class DocumentCleaner:
if self.remove_repeated_substrings:
text = self._remove_repeated_substrings(text)
-cleaned_docs.append(
-Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
-)
+cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
return {"documents": cleaned_docs}

View File

@@ -125,16 +125,15 @@ class TestDocumentCleaner:
assert result["documents"][0].text == expected_text
@pytest.mark.unit
-def test_copy_id_hash_keys_and_metadata(self):
+def test_copy_metadata(self):
cleaner = DocumentCleaner()
documents = [
-Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
-Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
+Document(text="Text. ", metadata={"name": "doc 0"}),
+Document(text="Text. ", metadata={"name": "doc 1"}),
]
result = cleaner.run(documents=documents)
assert len(result["documents"]) == 2
assert result["documents"][0].id != result["documents"][1].id
for doc, cleaned_doc in zip(documents, result["documents"]):
-assert doc.id_hash_keys == cleaned_doc.id_hash_keys
assert doc.metadata == cleaned_doc.metadata
assert cleaned_doc.text == "Text."