mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-09 13:46:54 +00:00
Remove id_hash_keys from DocumentCleaner (#6123)
This commit is contained in:
parent
366f0366bf
commit
ec376c7dbd
@ -59,7 +59,7 @@ class DocumentCleaner:
|
||||
def run(self, documents: List[Document]):
|
||||
"""
|
||||
Run the DocumentCleaner on the given list of documents.
|
||||
The documents' metadata and id_hash_keys remain unchanged.
|
||||
The documents' metadata remain unchanged.
|
||||
"""
|
||||
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
|
||||
raise TypeError("DocumentCleaner expects a List of Documents as input.")
|
||||
@ -85,9 +85,7 @@ class DocumentCleaner:
|
||||
if self.remove_repeated_substrings:
|
||||
text = self._remove_repeated_substrings(text)
|
||||
|
||||
cleaned_docs.append(
|
||||
Document(text=text, metadata=deepcopy(doc.metadata), id_hash_keys=deepcopy(doc.id_hash_keys))
|
||||
)
|
||||
cleaned_docs.append(Document(text=text, metadata=deepcopy(doc.metadata)))
|
||||
|
||||
return {"documents": cleaned_docs}
|
||||
|
||||
|
||||
@ -125,16 +125,15 @@ class TestDocumentCleaner:
|
||||
assert result["documents"][0].text == expected_text
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_copy_id_hash_keys_and_metadata(self):
|
||||
def test_copy_metadata(self):
|
||||
cleaner = DocumentCleaner()
|
||||
documents = [
|
||||
Document(text="Text. ", metadata={"name": "doc 0"}, id_hash_keys=["name"]),
|
||||
Document(text="Text. ", metadata={"name": "doc 1"}, id_hash_keys=["name"]),
|
||||
Document(text="Text. ", metadata={"name": "doc 0"}),
|
||||
Document(text="Text. ", metadata={"name": "doc 1"}),
|
||||
]
|
||||
result = cleaner.run(documents=documents)
|
||||
assert len(result["documents"]) == 2
|
||||
assert result["documents"][0].id != result["documents"][1].id
|
||||
for doc, cleaned_doc in zip(documents, result["documents"]):
|
||||
assert doc.id_hash_keys == cleaned_doc.id_hash_keys
|
||||
assert doc.metadata == cleaned_doc.metadata
|
||||
assert cleaned_doc.text == "Text."
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user