haystack/test/components/preprocessors/test_document_cleaner.py

# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import logging

import pytest

from haystack import Document
from haystack.dataclasses import ByteStream, SparseEmbedding
from haystack.components.preprocessors import DocumentCleaner


class TestDocumentCleaner:
    def test_init(self):
        cleaner = DocumentCleaner()
        assert cleaner.remove_empty_lines is True
        assert cleaner.remove_extra_whitespaces is True
        assert cleaner.remove_repeated_substrings is False
        assert cleaner.remove_substrings is None
        assert cleaner.remove_regex is None
        assert cleaner.keep_id is False

    def test_non_text_document(self, caplog):
        with caplog.at_level(logging.WARNING):
            cleaner = DocumentCleaner()
            cleaner.run(documents=[Document()])
            assert "DocumentCleaner only cleans text documents but document.content for document ID" in caplog.text

    def test_single_document(self):
        with pytest.raises(TypeError, match="DocumentCleaner expects a List of Documents as input."):
            cleaner = DocumentCleaner()
            cleaner.run(documents=Document())

    def test_empty_list(self):
        cleaner = DocumentCleaner()
        result = cleaner.run(documents=[])
        assert result == {"documents": []}

    def test_remove_empty_lines(self):
        cleaner = DocumentCleaner(remove_extra_whitespaces=False)
        result = cleaner.run(
            documents=[
                Document(
                    content="This is a text with some words. \f"
                    ""
                    "There is a second sentence. "
                    ""
                    "And there is a third sentence."
                )
            ]
        )
        assert len(result["documents"]) == 1
        assert (
            result["documents"][0].content
            == "This is a text with some words. \fThere is a second sentence. And there is a third sentence."
        )

    def test_remove_whitespaces(self):
        cleaner = DocumentCleaner(remove_empty_lines=False)
        result = cleaner.run(
            documents=[
                Document(
                    content=" This is a text with some words. "
                    ""
                    "There is a second sentence.  "
                    ""
                    "And there  is a third sentence.\f "
                )
            ]
        )
        assert len(result["documents"]) == 1
        assert result["documents"][0].content == (
            "This is a text with some words. There is a second sentence. And there is a third sentence.\f"
        )

    def test_remove_substrings(self):
        cleaner = DocumentCleaner(remove_substrings=["This", "A", "words", "🪲"])
        result = cleaner.run(documents=[Document(content="This is a text with some words.\f🪲")])
        assert len(result["documents"]) == 1
        assert result["documents"][0].content == " is a text with some .\f"

    def test_remove_regex(self):
        cleaner = DocumentCleaner(remove_regex=r"\s\s+")
        result = cleaner.run(documents=[Document(content="This is a  text \f with   some words.")])
        assert len(result["documents"]) == 1
        assert result["documents"][0].content == "This is a text\fwith some words."

    def test_remove_repeated_substrings(self):
        cleaner = DocumentCleaner(
            remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
        )

        text = """First Page\fThis is a header.
        Page  of
        2
        4
        Lorem ipsum dolor sit amet
        This is a footer number 1
        This is footer number 2This is a header.
        Page  of
        3
        4
        Sid ut perspiciatis unde
        This is a footer number 1
        This is footer number 2This is a header.
        Page  of
        4
        4
        Sed do eiusmod tempor.
        This is a footer number 1
        This is footer number 2"""

        expected_text = """First Page\f 2
        4
        Lorem ipsum dolor sit amet 3
        4
        Sid ut perspiciatis unde 4
        4
        Sed do eiusmod tempor."""
        result = cleaner.run(documents=[Document(content=text)])
        assert result["documents"][0].content == expected_text

    def test_copy_metadata(self):
        cleaner = DocumentCleaner()
        documents = [
            Document(content="Text. ", meta={"name": "doc 0"}),
            Document(content="Text. ", meta={"name": "doc 1"}),
        ]
        result = cleaner.run(documents=documents)
        assert len(result["documents"]) == 2
        assert result["documents"][0].id != result["documents"][1].id
        for doc, cleaned_doc in zip(documents, result["documents"]):
            assert doc.meta == cleaned_doc.meta
            assert cleaned_doc.content == "Text."

    def test_keep_id_does_not_alter_document_ids(self):
        cleaner = DocumentCleaner(keep_id=True)
        documents = [Document(content="Text. ", id="1"), Document(content="Text. ", id="2")]
        result = cleaner.run(documents=documents)
        assert len(result["documents"]) == 2
        assert result["documents"][0].id == "1"
        assert result["documents"][1].id == "2"

    def test_unicode_normalization(self):
        text = """\
        ｱｲｳｴｵ
        Comment ça va
        مرحبا بالعالم
        em Space"""

        expected_text_NFC = """\
        ｱｲｳｴｵ
        Comment ça va
        مرحبا بالعالم
        em Space"""

        expected_text_NFD = """\
        ｱｲｳｴｵ
        Comment ça va
        مرحبا بالعالم
        em Space"""

        expected_text_NFKC = """\
        アイウエオ
        Comment ça va
        مرحبا بالعالم
        em Space"""

        expected_text_NFKD = """\
        アイウエオ
        Comment ça va
        مرحبا بالعالم
        em Space"""

        nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False)
        nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False)
        nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False)
        nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False)

        nfc_result = nfc_cleaner.run(documents=[Document(content=text)])
        nfd_result = nfd_cleaner.run(documents=[Document(content=text)])
        nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)])
        nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)])

        assert nfc_result["documents"][0].content == expected_text_NFC
        assert nfd_result["documents"][0].content == expected_text_NFD
        assert nfkc_result["documents"][0].content == expected_text_NFKC
        assert nfkd_result["documents"][0].content == expected_text_NFKD

    def test_ascii_only(self):
        text = """\
        ｱｲｳｴｵ
        Comment ça va
        Á
        مرحبا بالعالم
        em Space"""

        expected_text = """\
        \n\
        Comment ca va
        A
         \n\
        em Space"""

        cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
        result = cleaner.run(documents=[Document(content=text)])
        assert result["documents"][0].content == expected_text

    def test_other_document_fields_are_not_lost(self):
        cleaner = DocumentCleaner(keep_id=True)
        document = Document(
            content="This is a text with some words. \nThere is a second sentence. \nAnd there is a third sentence.\n",
            blob=ByteStream.from_string("some_data"),
            meta={"data": 1},
            score=0.1,
            embedding=[0.1, 0.2, 0.3],
            sparse_embedding=SparseEmbedding([0, 2], [0.1, 0.3]),
        )
        res = cleaner.run(documents=[document])

        assert len(res) == 1
        assert len(res["documents"])
        assert res["documents"][0].id == document.id
        assert res["documents"][0].content == (
            "This is a text with some words. There is a second sentence. And there is a third sentence."
        )
        assert res["documents"][0].blob == document.blob
        assert res["documents"][0].meta == document.meta
        assert res["documents"][0].score == document.score
        assert res["documents"][0].embedding == document.embedding
        assert res["documents"][0].sparse_embedding == document.sparse_embedding
-												chore: add license header to all modules (#7675)

* add license header to modules
* check license header at linting time
											
										
										
											2024-05-09 15:40:36 +02:00
+								# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 								#
 								# SPDX-License-Identifier: Apache-2.0
-												chore: removing unused imports from tests (#9446)


											
										
										
											2025-05-26 17:22:51 +01:00
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								import logging
 								import pytest
-												Fix all tests

											
										
										
											2023-11-24 14:48:43 +01:00
+								from haystack import Document
-												Fix DocumentCleaner not preserving Document fields (#8578)


											
										
										
											2024-11-25 13:08:59 +01:00
+								from haystack.dataclasses import ByteStream, SparseEmbedding
-												Fix all tests

											
										
										
											2023-11-24 14:48:43 +01:00
+								from haystack.components.preprocessors import DocumentCleaner
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								class TestDocumentCleaner:
 								    def test_init(self):
 								        cleaner = DocumentCleaner()
-												refactor!: rename `TextDocumentSplitter` to `DocumentSplitter` (#6223)

* rename TextDocumentSplitter to DocumentSplitter

* reno

* fix init
											
										
										
											2023-11-03 11:33:20 +01:00
+								        assert cleaner.remove_empty_lines is True
 								        assert cleaner.remove_extra_whitespaces is True
 								        assert cleaner.remove_repeated_substrings is False
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        assert cleaner.remove_substrings is None
 								        assert cleaner.remove_regex is None
-												add keep-id to DocumentCleaner (#7703)


											
										
										
											2024-05-16 19:18:48 +02:00
+								        assert cleaner.keep_id is False
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								    def test_non_text_document(self, caplog):
 								        with caplog.at_level(logging.WARNING):
 								            cleaner = DocumentCleaner()
 								            cleaner.run(documents=[Document()])
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								            assert "DocumentCleaner only cleans text documents but document.content for document ID" in caplog.text
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								    def test_single_document(self):
 								        with pytest.raises(TypeError, match="DocumentCleaner expects a List of Documents as input."):
 								            cleaner = DocumentCleaner()
 								            cleaner.run(documents=Document())
 								    def test_empty_list(self):
 								        cleaner = DocumentCleaner()
 								        result = cleaner.run(documents=[])
 								        assert result == {"documents": []}
 								    def test_remove_empty_lines(self):
 								        cleaner = DocumentCleaner(remove_extra_whitespaces=False)
 								        result = cleaner.run(
 								            documents=[
 								                Document(
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								                    content="This is a text with some words. \f"
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								                    ""
 								                    "There is a second sentence. "
 								                    ""
 								                    "And there is a third sentence."
 								                )
 								            ]
 								        )
 								        assert len(result["documents"]) == 1
 								        assert (
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								            result["documents"][0].content
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								            == "This is a text with some words. \fThere is a second sentence. And there is a third sentence."
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        )
 								    def test_remove_whitespaces(self):
 								        cleaner = DocumentCleaner(remove_empty_lines=False)
 								        result = cleaner.run(
 								            documents=[
 								                Document(
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								                    content=" This is a text with some words. "
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								                    ""
 								                    "There is a second sentence.  "
 								                    ""
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								                    "And there  is a third sentence.\f "
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								                )
 								            ]
 								        )
 								        assert len(result["documents"]) == 1
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								        assert result["documents"][0].content == (
-												CI: fix format after newly introduced formatting rules from ruff release (#8696)


											
										
										
											2025-01-09 17:25:55 +01:00
+								            "This is a text with some words. There is a second sentence. And there is a third sentence.\f"
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        )
 								    def test_remove_substrings(self):
 								        cleaner = DocumentCleaner(remove_substrings=["This", "A", "words", "🪲"])
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        result = cleaner.run(documents=[Document(content="This is a text with some words.\f🪲")])
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        assert len(result["documents"]) == 1
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        assert result["documents"][0].content == " is a text with some .\f"
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								    def test_remove_regex(self):
 								        cleaner = DocumentCleaner(remove_regex=r"\s\s+")
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        result = cleaner.run(documents=[Document(content="This is a  text \f with   some words.")])
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        assert len(result["documents"]) == 1
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        assert result["documents"][0].content == "This is a text\fwith some words."
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								    def test_remove_repeated_substrings(self):
 								        cleaner = DocumentCleaner(
 								            remove_empty_lines=False, remove_extra_whitespaces=False, remove_repeated_substrings=True
 								        )
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        text = """First Page\fThis is a header.
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
+								        Page  of
 
 
 								        Lorem ipsum dolor sit amet
 								        This is a footer number 1
 								        This is footer number 2This is a header.
 								        Page  of
 
 
 								        Sid ut perspiciatis unde
 								        This is a footer number 1
 								        This is footer number 2This is a header.
 								        Page  of
 
 
 								        Sed do eiusmod tempor.
 								        This is a footer number 1
 								        This is footer number 2"""
-												fix: DocumentCleaner: keep the \f in text (#8078)

* Keep the \f in Document Cleaner

* Add Reno

* Add Test

* Simplified _remove_empty_lines() code
											
										
										
											2024-08-07 14:50:14 +02:00
+								        expected_text = """First Page\f 2
-												feat: Add DocumentCleaner 2.0 (#5976)

* remove whitespaces, substrings, regex, empty lines

* remove repeated substrings

* reno

* return empty string as shortest common ngram

* address first half of review feedback

* address second half of review feedback

* mention \f page separator for header/footer removal

* mention \f page separator for header/footer removal

* mark example usage as python code
											
										
										
											2023-10-13 12:39:55 +02:00
 								        Lorem ipsum dolor sit amet 3
 
 								        Sid ut perspiciatis unde 4
 
 								        Sed do eiusmod tempor."""
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								        result = cleaner.run(documents=[Document(content=text)])
 								        assert result["documents"][0].content == expected_text
-												fix: DocumentSplitter and DocumentCleaner copy `id_hash_keys` to newly created Documents (#6083)

* copy id_hash_keys in splitter and cleaner

* reno
											
										
										
											2023-10-17 11:03:48 +02:00
-												Remove id_hash_keys from DocumentCleaner (#6123)


											
										
										
											2023-10-20 15:16:06 +02:00
+								    def test_copy_metadata(self):
-												fix: DocumentSplitter and DocumentCleaner copy `id_hash_keys` to newly created Documents (#6083)

* copy id_hash_keys in splitter and cleaner

* reno
											
										
										
											2023-10-17 11:03:48 +02:00
+								        cleaner = DocumentCleaner()
 								        documents = [
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								            Document(content="Text. ", meta={"name": "doc 0"}),
 								            Document(content="Text. ", meta={"name": "doc 1"}),
-												fix: DocumentSplitter and DocumentCleaner copy `id_hash_keys` to newly created Documents (#6083)

* copy id_hash_keys in splitter and cleaner

* reno
											
										
										
											2023-10-17 11:03:48 +02:00
+								        ]
 								        result = cleaner.run(documents=documents)
 								        assert len(result["documents"]) == 2
 								        assert result["documents"][0].id != result["documents"][1].id
 								        for doc, cleaned_doc in zip(documents, result["documents"]):
-												refactor: Rename `Document`'s `text` field to `content` (#6181)

* Rework Document serialisation

Make Document backward compatible

Fix InMemoryDocumentStore filters

Fix InMemoryDocumentStore.bm25_retrieval

Add release notes

Fix pylint failures

Enhance Document kwargs handling and docstrings

Rename Document's text field to content

Fix e2e tests

Fix SimilarityRanker tests

Fix typo in release notes

Rename Document's metadata field to meta (#6183)

* fix bugs

* make linters happy

* fix

* more fix

* match regex

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
											
										
										
											2023-10-31 12:44:04 +01:00
+								            assert doc.meta == cleaned_doc.meta
 								            assert cleaned_doc.content == "Text."
-												add keep-id to DocumentCleaner (#7703)


											
										
										
											2024-05-16 19:18:48 +02:00
 								    def test_keep_id_does_not_alter_document_ids(self):
 								        cleaner = DocumentCleaner(keep_id=True)
 								        documents = [Document(content="Text. ", id="1"), Document(content="Text. ", id="2")]
 								        result = cleaner.run(documents=documents)
 								        assert len(result["documents"]) == 2
 								        assert result["documents"][0].id == "1"
 								        assert result["documents"][1].id == "2"
-												feat: add unicode normalization & ascii_only mode for DocumentCleaner (#8103)

* feat: add unicode normalization & ascii_only mode for DocumentCleaner.

* feat: add unicode_normalization parameter valdiation to DocumentCleaner.

* test: fix the unit test to work after code linting.

											
										
										
											2024-08-05 21:00:39 +10:00
 								    def test_unicode_normalization(self):
 								        text = """\
 								        ｱｲｳｴｵ
 								        Comment ça va
 								        مرحبا بالعالم
 								        em Space"""
 								        expected_text_NFC = """\
 								        ｱｲｳｴｵ
 								        Comment ça va
 								        مرحبا بالعالم
 								        em Space"""
 								        expected_text_NFD = """\
 								        ｱｲｳｴｵ
 								        Comment ça va
 								        مرحبا بالعالم
 								        em Space"""
 								        expected_text_NFKC = """\
 								        アイウエオ
 								        Comment ça va
 								        مرحبا بالعالم
 								        em Space"""
 								        expected_text_NFKD = """\
 								        アイウエオ
 								        Comment ça va
 								        مرحبا بالعالم
 								        em Space"""
 								        nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False)
 								        nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False)
 								        nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False)
 								        nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False)
 								        nfc_result = nfc_cleaner.run(documents=[Document(content=text)])
 								        nfd_result = nfd_cleaner.run(documents=[Document(content=text)])
 								        nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)])
 								        nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)])
 								        assert nfc_result["documents"][0].content == expected_text_NFC
 								        assert nfd_result["documents"][0].content == expected_text_NFD
 								        assert nfkc_result["documents"][0].content == expected_text_NFKC
 								        assert nfkd_result["documents"][0].content == expected_text_NFKD
 								    def test_ascii_only(self):
 								        text = """\
 								        ｱｲｳｴｵ
 								        Comment ça va
 								        Á
 								        مرحبا بالعالم
 								        em Space"""
 								        expected_text = """\
 								        \n\
 								        Comment ca va
 								        A
 								         \n\
 								        em Space"""
 								        cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
 								        result = cleaner.run(documents=[Document(content=text)])
 								        assert result["documents"][0].content == expected_text
-												Fix DocumentCleaner not preserving Document fields (#8578)


											
										
										
											2024-11-25 13:08:59 +01:00
 								    def test_other_document_fields_are_not_lost(self):
 								        cleaner = DocumentCleaner(keep_id=True)
 								        document = Document(
-												CI: fix format after newly introduced formatting rules from ruff release (#8696)


											
										
										
											2025-01-09 17:25:55 +01:00
+								            content="This is a text with some words. \nThere is a second sentence. \nAnd there is a third sentence.\n",
-												Fix DocumentCleaner not preserving Document fields (#8578)


											
										
										
											2024-11-25 13:08:59 +01:00
+								            blob=ByteStream.from_string("some_data"),
 								            meta={"data": 1},
 								            score=0.1,
 								            embedding=[0.1, 0.2, 0.3],
 								            sparse_embedding=SparseEmbedding([0, 2], [0.1, 0.3]),
 								        )
 								        res = cleaner.run(documents=[document])
 								        assert len(res) == 1
 								        assert len(res["documents"])
 								        assert res["documents"][0].id == document.id
 								        assert res["documents"][0].content == (
 								            "This is a text with some words. There is a second sentence. And there is a third sentence."
 								        )
 								        assert res["documents"][0].blob == document.blob
 								        assert res["documents"][0].meta == document.meta
 								        assert res["documents"][0].score == document.score
 								        assert res["documents"][0].embedding == document.embedding
 								        assert res["documents"][0].sparse_embedding == document.sparse_embedding