Mirror of https://github.com/deepset-ai/haystack.git
Add test script for ChineseDocumentSplitter, remove Chinese comments, and fix lint issues
commit 10ddc6edc0 (parent 7b2d038098)
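For orientation, here is a minimal usage sketch of the splitter exercised by the new tests below. The module path and class name follow the in-progress spelling used in this commit (`chinese_document_spliter` / `chinese_DocumentSplitter`), hanlp must be installed (`pip install hanlp`, per the LazyImport hint in the diff) for the pretrained models to load, and the parameters mirror the test file rather than a settled public API:

# Sketch based on the test file added in this commit; names and parameters are taken from it.
from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter

splitter = chinese_DocumentSplitter(
    split_by="word", language="zh", particle_size="coarse", split_length=30, split_overlap=10
)
splitter.warm_up()  # the tests call warm_up() before run() when it is available

result = splitter.run(documents=[Document(content="月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。")])
print([d.content for d in result["documents"]])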
@@ -1,54 +1,41 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import Any, Dict, List, Literal, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

import hanlp
from more_itertools import windowed

from haystack import Document, component, logging
from haystack.components.preprocessors import DocumentSplitter
from haystack.lazy_imports import LazyImport

with LazyImport("Run 'pip install hanlp'") as hanlp_import:
    import hanlp

from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.utils import deserialize_callable, serialize_callable

logger = logging.getLogger(__name__)

# mapping of split by character, 'function' and 'sentence' don't split by character
_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Load Chinese sentence slicer
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)


@component
class ChineseDocumentSplitter(DocumentSplitter):
class chinese_DocumentSplitter(DocumentSplitter):
    def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
        super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)

        # coarse代表粗颗粒度中文分词,fine代表细颗粒度分词,默认为粗颗粒度分词
        # 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word
        # segmentation, default is coarse granularity word segmentation
        super(chinese_DocumentSplitter, self).__init__(*args, **kwargs)
        self.particle_size = particle_size
        # self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
        # self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)

        # # 加载中文的句子切分器 (load the Chinese sentence splitter)
        # self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)

        hanlp_import.check()

        self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
        self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
        self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # 加载中文的句子切分器 (load the Chinese sentence splitter)
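To make the coarse/fine distinction in the comments above concrete, here is a small sketch of how the two pretrained hanlp tokenizers loaded in this file are expected to behave. The example sentence and the shown outputs are illustrative assumptions (actual tokens depend on the model version); only the model names and the call-the-loaded-pipeline-on-a-string usage are taken from this diff.

# Illustrative sketch only; the example outputs are assumed, not taken from this commit.
import hanlp

tok_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tok_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)

print(tok_coarse("美国人在葡萄牙"))  # e.g. ['美国人', '在', '葡萄牙'] -- coarse keeps longer multi-character words
print(tok_fine("美国人在葡萄牙"))    # e.g. ['美国', '人', '在', '葡萄牙'] -- fine splits more aggressively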

    def _split_by_character(self, doc) -> List[Document]:
        split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
        # 'coarse' represents coarse granularity Chinese word segmentation,
        # 'fine' represents fine granularity word segmentation,
        # default is coarse granularity word segmentation
        if self.language == "zh" and self.particle_size == "coarse":
            units = self.chinese_tokenizer_coarse(doc.content)
            units = chinese_tokenizer_coarse(doc.content)

        if self.language == "zh" and self.particle_size == "fine":
            units = self.chinese_tokenizer_fine(doc.content)
            units = chinese_tokenizer_fine(doc.content)
        if self.language == "en":
            units = doc.content.split(split_at)
            # Add the delimiter back to all units except the last one
@@ -63,18 +50,13 @@ class ChineseDocumentSplitter(DocumentSplitter):
            text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
        )

    # 定义一个函数用于处理中文分句
    # Define a function to handle Chinese clauses
    def chinese_sentence_split(self, text: str) -> list:
        """
        Segmentation of Chinese text.
        """Split Chinese text into sentences."""
        # Split sentences
        sentences = split_sent(text)

        :param text: The Chinese text to be segmented.
        :returns: A list of dictionaries, each containing a sentence and its start and end indices.
        """
        # 分句
        sentences = self.split_sent(text)

        # 整理格式
        # Organize the format of segmented sentences
        results = []
        start = 0
        for sentence in sentences:
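The hunk above ends inside the loop that formats these results. Going only by the docstring, each entry presumably records a sentence together with its start and end offsets in the original text; the following self-contained sketch shows one way to produce that shape. The key names and the loop body are hypothetical illustrations, not the actual code hidden by the hunk boundary.

# Hypothetical illustration of the documented return shape; not the actual loop body.
def sentences_with_offsets(sentences):
    results, start = [], 0
    for sentence in sentences:
        end = start + len(sentence)
        results.append({"sentence": sentence, "start": start, "end": end})  # assumed key names
        start = end
    return results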
@@ -94,8 +76,9 @@ class ChineseDocumentSplitter(DocumentSplitter):

        return self._split_by_character(doc)

    @staticmethod
    def _concatenate_sentences_based_on_word_amount(
        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
    ) -> Tuple[List[str], List[int], List[int]]:
        """
        Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -122,18 +105,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
        for sentence_idx, sentence in enumerate(sentences):
            current_chunk.append(sentence)
            if language == "zh" and particle_size == "coarse":
                chunk_word_count += len(self.chinese_tokenizer_coarse(sentence))
                chunk_word_count += len(chinese_tokenizer_coarse(sentence))
                next_sentence_word_count = (
                    len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
                    len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
                    if sentence_idx < len(sentences) - 1
                    else 0
                )
            if language == "zh" and particle_size == "fine":
                chunk_word_count += len(self.chinese_tokenizer_fine(sentence))
                chunk_word_count += len(chinese_tokenizer_fine(sentence))
                next_sentence_word_count = (
                    len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1]))
                    if sentence_idx < len(sentences) - 1
                    else 0
                    len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
                )

            # Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -145,8 +126,7 @@ class ChineseDocumentSplitter(DocumentSplitter):
                split_start_indices.append(chunk_start_idx)

                # Get the number of sentences that overlap with the next chunk
                num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep(
                    self,
                num_sentences_to_keep = chinese_DocumentSplitter._number_of_sentences_to_keep(
                    sentences=current_chunk,
                    split_length=split_length,
                    split_overlap=split_overlap,
@@ -178,12 +158,12 @@ class ChineseDocumentSplitter(DocumentSplitter):

        return text_splits, split_start_page_numbers, split_start_indices

    # 增加中文句子切分,通过languge == "zh",进行启用
    # Add Chinese sentence segmentation and enable it using language=="zh"
    def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
        split_docs = []

        if self.language == "zh":
            result = ChineseDocumentSplitter.chinese_sentence_split(doc.content)
            result = self.chinese_sentence_split(doc.content)
        if self.language == "en":
            result = self.sentence_splitter.split_sentences(doc.content)  # type: ignore # None check is done in run()

@@ -316,8 +296,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
            overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
            previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

    @staticmethod
    def _number_of_sentences_to_keep(
        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
    ) -> int:
        """
        Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
@@ -333,20 +314,16 @@ class ChineseDocumentSplitter(DocumentSplitter):

        num_sentences_to_keep = 0
        num_words = 0
        # chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
        # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
        # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence

        for sent in reversed(sentences[1:]):
            if language == "zh" and particle_size == "coarse":
                num_words += len(self.chinese_tokenizer_coarse(sent))
                # num_words += len(sent.split())
                num_words += len(chinese_tokenizer_coarse(sent))
            if language == "zh" and particle_size == "fine":
                num_words += len(self.chinese_tokenizer_fine(sent))
                num_words += len(chinese_tokenizer_fine(sent))
            # If the number of words is larger than the split_length then don't add any more sentences
            if num_words > split_length:
                break
            num_sentences_to_keep += 1
            if num_words > split_overlap:
                break

        return num_sentences_to_keep
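To summarize the chunking logic diffed above: sentences are appended to the current chunk until the chunk's word count plus the next sentence's word count would exceed split_length; the chunk is then closed and _number_of_sentences_to_keep decides how many of its trailing sentences seed the next chunk. For example, with split_length=10, split_overlap=3 and assumed per-sentence word counts of 6, 5 and 4, the first chunk closes after sentence one (6 + 5 > 10), no overlap is carried forward (a chunk's first sentence is never reused), and sentences two and three form the next chunk (5 + 4 <= 10). Below is a condensed, dependency-free restatement of the overlap rule for illustration, with word_count standing in for the hanlp tokenizers used in the real code:

# Sketch of the overlap rule above; word_count is a stand-in for the hanlp tokenizers.
from typing import Callable, List

def sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int,
                      word_count: Callable[[str], int]) -> int:
    kept, words = 0, 0
    for sent in reversed(sentences[1:]):  # never reuse the first sentence of the chunk
        words += word_count(sent)
        if words > split_length:  # keeping this sentence would exceed the chunk budget
            break
        kept += 1
        if words > split_overlap:  # enough overlapping words collected
            break
    return kept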
@@ -293,6 +293,7 @@ warn_unused_configs = true
ignore_missing_imports = true
check_untyped_defs = true


[[tool.mypy.overrides]]
# TODO: Fix component typings
module = ["haystack.components.*", "haystack.testing.*"]

@@ -2,4 +2,3 @@
enhancements:
  - |
    Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`. In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown.

test/components/preprocessors/test_chinese_document_splitter.py (new file, 129 lines)
@@ -0,0 +1,129 @@
import pytest
from haystack import Document
from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter


class TestChineseDocumentSplitter:
    @pytest.fixture
    def sample_text(self) -> str:
        return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页,我们读到了这一页的第一句话,继续延展出情节的发展,直到这页的第二句话将整段文字温柔地收束于平静之中。"

    def test_split_by_word(self, sample_text):
        """
        Test splitting by word.

        Note on Chinese words:
        Unlike English where words are usually separated by spaces,
        Chinese text is written continuously without spaces between words.
        Chinese words can consist of multiple characters.
        For example, the English word "America" is translated to "美国" in Chinese,
        which consists of two characters but is treated as a single word.
        Similarly, "Portugal" is "葡萄牙" in Chinese,
        three characters but one word.
        Therefore, splitting by word means splitting by these multi-character tokens,
        not simply by single characters or spaces.
        """
        splitter = chinese_DocumentSplitter(
            split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
        )
        if hasattr(splitter, "warm_up"):
            splitter.warm_up()

        result = splitter.run(documents=[Document(content=sample_text)])
        docs = result["documents"]

        assert all(isinstance(doc, Document) for doc in docs)
        assert all(len(doc.content.strip()) <= 10 for doc in docs)

    def test_split_by_sentence(self, sample_text):
        splitter = chinese_DocumentSplitter(
            split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0
        )
        if hasattr(splitter, "warm_up"):
            splitter.warm_up()

        result = splitter.run(documents=[Document(content=sample_text)])
        docs = result["documents"]

        assert all(isinstance(doc, Document) for doc in docs)
        assert all(doc.content.strip() != "" for doc in docs)
        assert any("。" in doc.content for doc in docs), "Expected at least one chunk containing a full stop."

    def test_respect_sentence_boundary(self):
        """Test that respect_sentence_boundary=True avoids splitting sentences"""
        text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?"
        doc = Document(content=text)

        splitter = chinese_DocumentSplitter(
            split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True
        )
        splitter.warm_up()
        result = splitter.run(documents=[doc])
        docs = result["documents"]

        print(f"Total chunks created: {len(docs)}.")
        for i, d in enumerate(docs):
            print(f"\nChunk {i + 1}:\n{d.content}")
            # Optional: check that sentences are not cut off
            assert d.content.strip().endswith(("。", "!", "?")), "Sentence was cut off!"

    def test_overlap_chunks_with_long_text(self):
        """Test split_overlap parameter to ensure there is clear overlap between chunks of long text"""
        text = (
            "月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。"
            "树叶在微风中沙沙作响,影子在地面上摇曳不定。"
            "一只猫头鹰静静地眨了眨眼,从枝头注视着四周……"
            "远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。"
            "“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。"
            "空气中弥漫着松树与湿土的气息,令人心安。"
            "一只狐狸悄然出现,又迅速消失在灌木丛中。"
            "天上的星星闪烁着,仿佛在诉说古老的故事。"
            "时间仿佛停滞了……"
            "万物静候,聆听着夜的呼吸!"
        )
        doc = Document(content=text)

        splitter = chinese_DocumentSplitter(
            split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse"
        )
        if hasattr(splitter, "warm_up"):
            splitter.warm_up()

        result = splitter.run(documents=[doc])
        docs = result["documents"]

        print(f"Total chunks generated: {len(docs)}.")
        for i, d in enumerate(docs):
            print(f"\nChunk {i + 1}:\n{d.content}")

        assert len(docs) > 1, "Expected multiple chunks to be generated"

        max_len_allowed = 80  # Allow a somewhat relaxed max chunk length
        assert all(len(doc.content) <= max_len_allowed for doc in docs), (
            f"Some chunks exceed {max_len_allowed} characters"
        )

        def has_any_overlap(suffix: str, prefix: str) -> bool:
            """
            Check if suffix and prefix have at least one continuous overlapping character sequence.
            Tries from longest possible overlap down to 1 character.
            Returns True if any overlap found.
            """
            max_check_len = min(len(suffix), len(prefix))
            for length in range(max_check_len, 0, -1):
                if suffix[-length:] == prefix[:length]:
                    return True
            return False

        for i in range(1, len(docs)):
            prev_chunk = docs[i - 1].content
            curr_chunk = docs[i].content

            # Take last 20 chars of prev chunk and first 20 chars of current chunk to check overlap
            overlap_prev = prev_chunk[-20:]
            overlap_curr = curr_chunk[:20]

            assert has_any_overlap(overlap_prev, overlap_curr), (
                f"Chunks {i} and {i + 1} do not overlap. "
                f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'"
            )
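Assuming a standard development checkout with hanlp installed (the splitter's LazyImport hint above suggests `pip install hanlp`), the new test module can be run on its own; a small convenience sketch, equivalent to invoking pytest on the file from the command line:

# Convenience sketch: run only the new test module added in this commit.
import pytest

pytest.main(["-v", "test/components/preprocessors/test_chinese_document_splitter.py"])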