Add test script for ChineseDocumentSplitter, remove Chinese comments, and fix lint issues

This commit is contained in:
mc112611 2025-06-05 15:54:58 +08:00
parent 7b2d038098
commit 10ddc6edc0
5 changed files with 172 additions and 66 deletions

View File

@@ -1,54 +1,41 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
from copy import deepcopy
from typing import Any, Dict, List, Literal, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
import hanlp
from more_itertools import windowed
from haystack import Document, component, logging
from haystack.components.preprocessors import DocumentSplitter
from haystack.lazy_imports import LazyImport
with LazyImport("Run 'pip install hanlp'") as hanlp_import:
import hanlp
from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable
logger = logging.getLogger(__name__)
# mapping of split by character, 'function' and 'sentence' don't split by character
_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Load the Chinese sentence splitter
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
@component
class ChineseDocumentSplitter(DocumentSplitter):
class chinese_DocumentSplitter(DocumentSplitter):
def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)
# 'coarse' means coarse-grained Chinese word segmentation, 'fine' means fine-grained segmentation; the default is coarse.
# 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word
# segmentation, default is coarse granularity word segmentation
super(chinese_DocumentSplitter, self).__init__(*args, **kwargs)
self.particle_size = particle_size
# self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# # Load the Chinese sentence splitter
# self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
hanlp_import.check()
self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # load the Chinese sentence splitter
def _split_by_character(self, doc) -> List[Document]:
split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
# 'coarse' represents coarse granularity Chinese word segmentation,
# 'fine' represents fine granularity word segmentation,
# default is coarse granularity word segmentation
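# Illustrative only (actual hanlp output may differ): for a compound such as "自然语言处理",
# the coarse tokenizer tends to return fewer, longer tokens (e.g. ["自然语言处理"]),
# while the fine tokenizer returns more, shorter ones (e.g. ["自然", "语言", "处理"]).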
if self.language == "zh" and self.particle_size == "coarse":
units = self.chinese_tokenizer_coarse(doc.content)
units = chinese_tokenizer_coarse(doc.content)
if self.language == "zh" and self.particle_size == "fine":
units = self.chinese_tokenizer_fine(doc.content)
units = chinese_tokenizer_fine(doc.content)
if self.language == "en":
units = doc.content.split(split_at)
# Add the delimiter back to all units except the last one
@@ -63,18 +50,13 @@ class ChineseDocumentSplitter(DocumentSplitter):
text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
)
# Define a function to handle Chinese sentence splitting
# Define a function to handle Chinese clauses
def chinese_sentence_split(self, text: str) -> list:
"""
Segmentation of Chinese text.
"""Split Chinese text into sentences."""
# Split sentences
sentences = split_sent(text)
:param text: The Chinese text to be segmented.
:returns: A list of dictionaries, each containing a sentence and its start and end indices.
"""
# Split into sentences
sentences = self.split_sent(text)
# Organize the output format
# Organize the format of segmented sentences
results = []
start = 0
for sentence in sentences:
@@ -94,8 +76,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
return self._split_by_character(doc)
@staticmethod
def _concatenate_sentences_based_on_word_amount(
self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
) -> Tuple[List[str], List[int], List[int]]:
"""
Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -122,18 +105,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
for sentence_idx, sentence in enumerate(sentences):
current_chunk.append(sentence)
if language == "zh" and particle_size == "coarse":
chunk_word_count += len(self.chinese_tokenizer_coarse(sentence))
chunk_word_count += len(chinese_tokenizer_coarse(sentence))
next_sentence_word_count = (
len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
if sentence_idx < len(sentences) - 1
else 0
)
if language == "zh" and particle_size == "fine":
chunk_word_count += len(self.chinese_tokenizer_fine(sentence))
chunk_word_count += len(chinese_tokenizer_fine(sentence))
next_sentence_word_count = (
len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1]))
if sentence_idx < len(sentences) - 1
else 0
len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
)
# Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -145,8 +126,7 @@ class ChineseDocumentSplitter(DocumentSplitter):
split_start_indices.append(chunk_start_idx)
# Get the number of sentences that overlap with the next chunk
num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep(
self,
num_sentences_to_keep = chinese_DocumentSplitter._number_of_sentences_to_keep(
sentences=current_chunk,
split_length=split_length,
split_overlap=split_overlap,
@@ -178,12 +158,12 @@ class ChineseDocumentSplitter(DocumentSplitter):
return text_splits, split_start_page_numbers, split_start_indices
# Add Chinese sentence splitting, enabled when language == "zh"
# Add Chinese sentence segmentation and enable it using language=="zh"
def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
split_docs = []
if self.language == "zh":
result = ChineseDocumentSplitter.chinese_sentence_split(doc.content)
result = self.chinese_sentence_split(doc.content)
if self.language == "en":
result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run()
@@ -316,8 +296,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})
@staticmethod
def _number_of_sentences_to_keep(
self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
) -> int:
"""
Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
@@ -333,20 +314,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
num_sentences_to_keep = 0
num_words = 0
# chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
for sent in reversed(sentences[1:]):
if language == "zh" and particle_size == "coarse":
num_words += len(self.chinese_tokenizer_coarse(sent))
# num_words += len(sent.split())
num_words += len(chinese_tokenizer_coarse(sent))
if language == "zh" and particle_size == "fine":
num_words += len(self.chinese_tokenizer_fine(sent))
num_words += len(chinese_tokenizer_fine(sent))
# If the number of words is larger than the split_length then don't add any more sentences
if num_words > split_length:
break
num_sentences_to_keep += 1
if num_words > split_overlap:
break
return num_sentences_to_keep
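
For context, a minimal usage sketch of the splitter defined in this file; the import path and constructor arguments mirror the new test file below, and the sample sentence is made up:

from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter

# Coarse-grained word splitting: chunks of roughly 5 words, no overlap between chunks.
splitter = chinese_DocumentSplitter(
    split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
)
if hasattr(splitter, "warm_up"):
    splitter.warm_up()

result = splitter.run(documents=[Document(content="美国人说英语，葡萄牙人说葡萄牙语。")])
for doc in result["documents"]:
    print(doc.content)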

View File

@@ -293,6 +293,7 @@ warn_unused_configs = true
ignore_missing_imports = true
check_untyped_defs = true
[[tool.mypy.overrides]]
# TODO: Fix component typings
module = ["haystack.components.*", "haystack.testing.*"]

View File

@@ -2,4 +2,3 @@
enhancements:
- |
Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`. In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown.

View File

@@ -0,0 +1,129 @@
import pytest
from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter
class TestChineseDocumentSplitter:
@pytest.fixture
def sample_text(self) -> str:
return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页我们读到了这一页的第一句话继续延展出情节的发展直到这页的第二句话将整段文字温柔地收束于平静之中。"
def test_split_by_word(self, sample_text):
"""
Test splitting by word.
Note on Chinese words:
Unlike English where words are usually separated by spaces,
Chinese text is written continuously without spaces between words.
Chinese words can consist of multiple characters.
For example, the English word "America" is translated to "美国" in Chinese,
which consists of two characters but is treated as a single word.
Similarly, "Portugal" is "葡萄牙" in Chinese,
three characters but one word.
Therefore, splitting by word means splitting by these multi-character tokens,
not simply by single characters or spaces.
"""
splitter = chinese_DocumentSplitter(
split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[Document(content=sample_text)])
docs = result["documents"]
assert all(isinstance(doc, Document) for doc in docs)
assert all(len(doc.content.strip()) <= 10 for doc in docs)
def test_split_by_sentence(self, sample_text):
splitter = chinese_DocumentSplitter(
split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[Document(content=sample_text)])
docs = result["documents"]
assert all(isinstance(doc, Document) for doc in docs)
assert all(doc.content.strip() != "" for doc in docs)
assert any("" in doc.content for doc in docs), "Expected at least one chunk containing a full stop."
def test_respect_sentence_boundary(self):
"""Test that respect_sentence_boundary=True avoids splitting sentences"""
text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?"
doc = Document(content=text)
splitter = chinese_DocumentSplitter(
split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True
)
splitter.warm_up()
result = splitter.run(documents=[doc])
docs = result["documents"]
print(f"Total chunks created: {len(docs)}.")
for i, d in enumerate(docs):
print(f"\nChunk {i + 1}:\n{d.content}")
# Optional: check that sentences are not cut off
assert d.content.strip().endswith(("。", "！", "？")), "Sentence was cut off!"
def test_overlap_chunks_with_long_text(self):
"""Test split_overlap parameter to ensure there is clear overlap between chunks of long text"""
text = (
"月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。"
"树叶在微风中沙沙作响,影子在地面上摇曳不定。"
"一只猫头鹰静静地眨了眨眼,从枝头注视着四周……"
"远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。"
"“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。"
"空气中弥漫着松树与湿土的气息,令人心安。"
"一只狐狸悄然出现,又迅速消失在灌木丛中。"
"天上的星星闪烁着,仿佛在诉说古老的故事。"
"时间仿佛停滞了……"
"万物静候,聆听着夜的呼吸!"
)
doc = Document(content=text)
splitter = chinese_DocumentSplitter(
split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse"
)
if hasattr(splitter, "warm_up"):
splitter.warm_up()
result = splitter.run(documents=[doc])
docs = result["documents"]
print(f"Total chunks generated: {len(docs)}.")
for i, d in enumerate(docs):
print(f"\nChunk {i + 1}:\n{d.content}")
assert len(docs) > 1, "Expected multiple chunks to be generated"
max_len_allowed = 80 # Allow a somewhat relaxed max chunk length
assert all(len(doc.content) <= max_len_allowed for doc in docs), (
f"Some chunks exceed {max_len_allowed} characters"
)
def has_any_overlap(suffix: str, prefix: str) -> bool:
"""
Check if suffix and prefix have at least one continuous overlapping character sequence.
Tries from longest possible overlap down to 1 character.
Returns True if any overlap found.
"""
max_check_len = min(len(suffix), len(prefix))
for length in range(max_check_len, 0, -1):
if suffix[-length:] == prefix[:length]:
return True
return False
for i in range(1, len(docs)):
prev_chunk = docs[i - 1].content
curr_chunk = docs[i].content
# Take last 20 chars of prev chunk and first 20 chars of current chunk to check overlap
overlap_prev = prev_chunk[-20:]
overlap_curr = curr_chunk[:20]
assert has_any_overlap(overlap_prev, overlap_curr), (
f"Chunks {i} and {i + 1} do not overlap. "
f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'"
)
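
As an aside, the overlap check used above can be exercised on its own; a standalone sketch (the helper is copied from the test, the two sample strings are made up):

def has_any_overlap(suffix: str, prefix: str) -> bool:
    """Return True if some tail of `suffix` equals a head of `prefix`."""
    max_check_len = min(len(suffix), len(prefix))
    for length in range(max_check_len, 0, -1):
        if suffix[-length:] == prefix[:length]:
            return True
    return False

# The tail "月光轻轻洒落" of the first string reappears at the head of the second -> True.
print(has_any_overlap("夜色笼罩，月光轻轻洒落", "月光轻轻洒落，树叶沙沙作响"))
# No shared tail/head sequence -> False.
print(has_any_overlap("这是前一个片段的结尾", "而这是下一个片段的开头"))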