Add test script for ChineseDocumentSplitter, remove Chinese comments, and fix lint issues

This commit is contained in:
mc112611 2025-06-05 15:54:58 +08:00
parent 7b2d038098
commit 10ddc6edc0
5 changed files with 172 additions and 66 deletions

View File

@@ -1,54 +1,41 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from copy import deepcopy
from typing import Any, Dict, List, Literal, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
import hanlp
from more_itertools import windowed
from haystack import Document, component, logging
from haystack.components.preprocessors import DocumentSplitter
from haystack.lazy_imports import LazyImport
with LazyImport("Run 'pip install hanlp'") as hanlp_import:
import hanlp
from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable
logger = logging.getLogger(__name__)
# mapping of split by character, 'function' and 'sentence' don't split by character
_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Load the Chinese sentence splitter
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
@component
class ChineseDocumentSplitter(DocumentSplitter):
class chinese_DocumentSplitter(DocumentSplitter):
def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)
# 'coarse' means coarse-grained Chinese word segmentation, 'fine' means fine-grained segmentation; the default is coarse.
# 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word
# segmentation, default is coarse granularity word segmentation
super(chinese_DocumentSplitter, self).__init__(*args, **kwargs)
self.particle_size = particle_size
# self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# # Load the Chinese sentence splitter
# self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
hanlp_import.check()
self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # load the Chinese sentence splitter
def _split_by_character(self, doc) -> List[Document]:
split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
# 'coarse' represents coarse granularity Chinese word segmentation,
# 'fine' represents fine granularity word segmentation,
# default is coarse granularity word segmentation
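# Illustrative only (actual hanlp output may differ): for a compound such as "自然语言处理",
# the coarse tokenizer tends to return fewer, longer tokens (e.g. ["自然语言处理"]),
# while the fine tokenizer returns more, shorter ones (e.g. ["自然", "语言", "处理"]).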
if self.language == "zh" and self.particle_size == "coarse":
units = self.chinese_tokenizer_coarse(doc.content)
units = chinese_tokenizer_coarse(doc.content)
if self.language == "zh" and self.particle_size == "fine":
units = self.chinese_tokenizer_fine(doc.content)
units = chinese_tokenizer_fine(doc.content)
if self.language == "en":
units = doc.content.split(split_at)
# Add the delimiter back to all units except the last one
@@ -63,18 +50,13 @@ class ChineseDocumentSplitter(DocumentSplitter):
text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
)
# Define a function to handle Chinese sentence splitting
# Define a function to handle Chinese clauses
def chinese_sentence_split(self, text: str) -> list:
"""
Segmentation of Chinese text.
"""Split Chinese text into sentences."""
# Split sentences
sentences = split_sent(text)
:param text: The Chinese text to be segmented.
:returns: A list of dictionaries, each containing a sentence and its start and end indices.
"""
# Split into sentences
sentences = self.split_sent(text)
# Organize the output format
# Organize the format of segmented sentences
results = []
start = 0
for sentence in sentences:
@@ -94,8 +76,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
return self._split_by_character(doc)
@staticmethod
def _concatenate_sentences_based_on_word_amount(
self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
) -> Tuple[List[str], List[int], List[int]]:
"""
Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -122,18 +105,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
for sentence_idx, sentence in enumerate(sentences):
current_chunk.append(sentence)
if language == "zh" and particle_size == "coarse":
chunk_word_count += len(self.chinese_tokenizer_coarse(sentence))
chunk_word_count += len(chinese_tokenizer_coarse(sentence))
next_sentence_word_count = (
len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
if sentence_idx < len(sentences) - 1
else 0
)
if language == "zh" and particle_size == "fine":
chunk_word_count += len(self.chinese_tokenizer_fine(sentence))
chunk_word_count += len(chinese_tokenizer_fine(sentence))
next_sentence_word_count = (
len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1]))
if sentence_idx < len(sentences) - 1
else 0
len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
)
# Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -145,8 +126,7 @@ class ChineseDocumentSplitter(DocumentSplitter):
split_start_indices.append(chunk_start_idx)
# Get the number of sentences that overlap with the next chunk
num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep(
self,
num_sentences_to_keep = chinese_DocumentSplitter._number_of_sentences_to_keep(
sentences=current_chunk,
split_length=split_length,
split_overlap=split_overlap,
@@ -178,12 +158,12 @@ class ChineseDocumentSplitter(DocumentSplitter):
return text_splits, split_start_page_numbers, split_start_indices
# Add Chinese sentence splitting, enabled when language == "zh"
# Add Chinese sentence segmentation and enable it using language=="zh"
def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
split_docs = []
if self.language == "zh":
result = ChineseDocumentSplitter.chinese_sentence_split(doc.content)
result = self.chinese_sentence_split(doc.content)
if self.language == "en":
result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run()
@@ -316,8 +296,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})
@staticmethod
def _number_of_sentences_to_keep(
self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
) -> int:
"""
Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
@@ -333,20 +314,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
num_sentences_to_keep = 0
num_words = 0
# chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
for sent in reversed(sentences[1:]):
if language == "zh" and particle_size == "coarse":
num_words += len(self.chinese_tokenizer_coarse(sent))
# num_words += len(sent.split())
num_words += len(chinese_tokenizer_coarse(sent))
if language == "zh" and particle_size == "fine":
num_words += len(self.chinese_tokenizer_fine(sent))
num_words += len(chinese_tokenizer_fine(sent))
# If the number of words is larger than the split_length then don't add any more sentences
if num_words > split_length:
break
num_sentences_to_keep += 1
if num_words > split_overlap:
break
return num_sentences_to_keep
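
For context, a minimal usage sketch of the splitter defined in this file; the import path and constructor arguments mirror the new test file below, and the sample sentence is made up:

from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter

# Coarse-grained word splitting: chunks of roughly 5 words, no overlap between chunks.
splitter = chinese_DocumentSplitter(
    split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
)
if hasattr(splitter, "warm_up"):
    splitter.warm_up()

result = splitter.run(documents=[Document(content="美国人说英语，葡萄牙人说葡萄牙语。")])
for doc in result["documents"]:
    print(doc.content)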

View File

@@ -293,6 +293,7 @@ warn_unused_configs = true
ignore_missing_imports = true
check_untyped_defs = true
[[tool.mypy.overrides]]
# TODO: Fix component typings
module = ["haystack.components.*", "haystack.testing.*"]

View File

@@ -2,4 +2,3 @@
enhancements:
- |
Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`. In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown.

View File

@@ -0,0 +1,129 @@
import pytest
from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter
class TestChineseDocumentSplitter:
@pytest.fixture
def sample_text(self) -> str:
return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页我们读到了这一页的第一句话继续延展出情节的发展直到这页的第二句话将整段文字温柔地收束于平静之中。"
def test_split_by_word(self, sample_text):
"""
Test splitting by word.
Note on Chinese words:
Unlike English where words are usually separated by spaces,
Chinese text is written continuously without spaces between words.
Chinese words can consist of multiple characters.
For example, the English word "America" is translated to "美国" in Chinese,
which consists of two characters but is treated as a single word.
Similarly, "Portugal" is "葡萄牙" in Chinese,
three characters but one word.
Therefore, splitting by word means splitting by these multi-character tokens,
not simply by single characters or spaces.
"""
splitter = chinese_DocumentSplitter(
split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[Document(content=sample_text)])
docs = result["documents"]
assert all(isinstance(doc, Document) for doc in docs)
assert all(len(doc.content.strip()) <= 10 for doc in docs)
def test_split_by_sentence(self, sample_text):
splitter = chinese_DocumentSplitter(
split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[Document(content=sample_text)])
docs = result["documents"]
assert all(isinstance(doc, Document) for doc in docs)
assert all(doc.content.strip() != "" for doc in docs)
assert any("" in doc.content for doc in docs), "Expected at least one chunk containing a full stop."
def test_respect_sentence_boundary(self):
"""Test that respect_sentence_boundary=True avoids splitting sentences"""
text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?"
doc = Document(content=text)
splitter = chinese_DocumentSplitter(
split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True
)
splitter.warm_up()
result = splitter.run(documents=[doc])
docs = result["documents"]
print(f"Total chunks created: {len(docs)}.")
for i, d in enumerate(docs):
print(f"\nChunk {i + 1}:\n{d.content}")
# Optional: check that sentences are not cut off
assert d.content.strip().endswith(("。", "！", "？")), "Sentence was cut off!"
def test_overlap_chunks_with_long_text(self):
"""Test split_overlap parameter to ensure there is clear overlap between chunks of long text"""
text = (
"月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。"
"树叶在微风中沙沙作响,影子在地面上摇曳不定。"
"一只猫头鹰静静地眨了眨眼,从枝头注视着四周……"
"远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。"
"“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。"
"空气中弥漫着松树与湿土的气息,令人心安。"
"一只狐狸悄然出现,又迅速消失在灌木丛中。"
"天上的星星闪烁着,仿佛在诉说古老的故事。"
"时间仿佛停滞了……"
"万物静候,聆听着夜的呼吸!"
)
doc = Document(content=text)
splitter = chinese_DocumentSplitter(
split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse"
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[doc])
docs = result["documents"]
print(f"Total chunks generated: {len(docs)}.")
for i, d in enumerate(docs):
print(f"\nChunk {i + 1}:\n{d.content}")
assert len(docs) > 1, "Expected multiple chunks to be generated"
max_len_allowed = 80 # Allow a somewhat relaxed max chunk length
assert all(len(doc.content) <= max_len_allowed for doc in docs), (
f"Some chunks exceed {max_len_allowed} characters"
)
def has_any_overlap(suffix: str, prefix: str) -> bool:
"""
Check if suffix and prefix have at least one continuous overlapping character sequence.
Tries from longest possible overlap down to 1 character.
Returns True if any overlap found.
"""
max_check_len = min(len(suffix), len(prefix))
for length in range(max_check_len, 0, -1):
if suffix[-length:] == prefix[:length]:
return True
return False
for i in range(1, len(docs)):
prev_chunk = docs[i - 1].content
curr_chunk = docs[i].content
# Take last 20 chars of prev chunk and first 20 chars of current chunk to check overlap
overlap_prev = prev_chunk[-20:]
overlap_curr = curr_chunk[:20]
assert has_any_overlap(overlap_prev, overlap_curr), (
f"Chunks {i} and {i + 1} do not overlap. "
f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'"
)
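
As an aside, the overlap check used above can be exercised on its own; a standalone sketch (the helper is copied from the test, the two sample strings are made up):

def has_any_overlap(suffix: str, prefix: str) -> bool:
    """Return True if some tail of `suffix` equals a head of `prefix`."""
    max_check_len = min(len(suffix), len(prefix))
    for length in range(max_check_len, 0, -1):
        if suffix[-length:] == prefix[:length]:
            return True
    return False

# The tail "月光轻轻洒落" of the first string reappears at the head of the second -> True.
print(has_any_overlap("夜色笼罩，月光轻轻洒落", "月光轻轻洒落，树叶沙沙作响"))
# No shared tail/head sequence -> False.
print(has_any_overlap("这是前一个片段的结尾", "而这是下一个片段的开头"))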