diff --git a/haystack/components/preprocessors/chinese_document_spliter.py b/haystack/components/preprocessors/chinese_document_spliter.py
index 7306e65d7..134e48f29 100644
--- a/haystack/components/preprocessors/chinese_document_spliter.py
+++ b/haystack/components/preprocessors/chinese_document_spliter.py
@@ -1,54 +1,41 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH
-#
-# SPDX-License-Identifier: Apache-2.0
-
 from copy import deepcopy
-from typing import Any, Dict, List, Literal, Tuple
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
 
+import hanlp
 from more_itertools import windowed
 
 from haystack import Document, component, logging
 from haystack.components.preprocessors import DocumentSplitter
-from haystack.lazy_imports import LazyImport
-
-with LazyImport("Run 'pip install hanlp'") as hanlp_import:
-    import hanlp
-
+from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
+from haystack.core.serialization import default_from_dict, default_to_dict
+from haystack.utils import deserialize_callable, serialize_callable
 
 logger = logging.getLogger(__name__)
 
 # mapping of split by character, 'function' and 'sentence' don't split by character
 _CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
 
+chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
+# Load the Chinese sentence splitter
+split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
+
 
 @component
-class ChineseDocumentSplitter(DocumentSplitter):
+class chinese_DocumentSplitter(DocumentSplitter):
     def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
-        super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)
-
-        # coarse代表粗颗粒度中文分词,fine代表细颗粒度分词,默认为粗颗粒度分词
-        # 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word
-        # segmentation, default is coarse granularity word segmentation
+        super(chinese_DocumentSplitter, self).__init__(*args, **kwargs)
         self.particle_size = particle_size
 
-        # self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
-        # self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
-
-        # # 加载中文的句子切分器
-        # self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
-
-        hanlp_import.check()
-
-        self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
-        self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
-        self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # 加载中文的句子切分器
 
     def _split_by_character(self, doc) -> List[Document]:
         split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
+        # 'coarse' represents coarse granularity Chinese word segmentation,
+        # 'fine' represents fine granularity word segmentation,
+        # default is coarse granularity word segmentation
         if self.language == "zh" and self.particle_size == "coarse":
-            units = self.chinese_tokenizer_coarse(doc.content)
+            units = chinese_tokenizer_coarse(doc.content)
         if self.language == "zh" and self.particle_size == "fine":
-            units = self.chinese_tokenizer_fine(doc.content)
+            units = chinese_tokenizer_fine(doc.content)
         if self.language == "en":
             units = doc.content.split(split_at)
 
         # Add the delimiter back to all units except the last one
@@ -63,18 +50,13 @@ class ChineseDocumentSplitter(DocumentSplitter):
             text_splits=text_splits, splits_pages=splits_pages, splits_start_idxs=splits_start_idxs, meta=metadata
         )
 
-    # 定义一个函数用于处理中文分句
+    # Define a function to handle Chinese sentence splitting
     def chinese_sentence_split(self, text: str) -> list:
-        """
-        Segmentation of Chinese text.
+        """Split Chinese text into sentences."""
+        # Split sentences
+        sentences = split_sent(text)
 
-        :param text: The Chinese text to be segmented.
-        :returns: A list of dictionaries, each containing a sentence and its start and end indices.
-        """
-        # 分句
-        sentences = self.split_sent(text)
-
-        # 整理格式
+        # Organize the format of segmented sentences
         results = []
         start = 0
         for sentence in sentences:
@@ -94,8 +76,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
 
         return self._split_by_character(doc)
 
+    @staticmethod
     def _concatenate_sentences_based_on_word_amount(
-        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -122,18 +105,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
         for sentence_idx, sentence in enumerate(sentences):
             current_chunk.append(sentence)
 
             if language == "zh" and particle_size == "coarse":
-                chunk_word_count += len(self.chinese_tokenizer_coarse(sentence))
+                chunk_word_count += len(chinese_tokenizer_coarse(sentence))
                 next_sentence_word_count = (
-                    len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
+                    len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
                     if sentence_idx < len(sentences) - 1
                     else 0
                 )
             if language == "zh" and particle_size == "fine":
-                chunk_word_count += len(self.chinese_tokenizer_fine(sentence))
+                chunk_word_count += len(chinese_tokenizer_fine(sentence))
                 next_sentence_word_count = (
-                    len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1]))
-                    if sentence_idx < len(sentences) - 1
-                    else 0
+                    len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
                 )
 
             # Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -145,8 +126,7 @@ class ChineseDocumentSplitter(DocumentSplitter):
                 split_start_indices.append(chunk_start_idx)
 
                 # Get the number of sentences that overlap with the next chunk
-                num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep(
-                    self,
+                num_sentences_to_keep = chinese_DocumentSplitter._number_of_sentences_to_keep(
                     sentences=current_chunk,
                     split_length=split_length,
                     split_overlap=split_overlap,
@@ -178,12 +158,12 @@ class ChineseDocumentSplitter(DocumentSplitter):
 
         return text_splits, split_start_page_numbers, split_start_indices
 
-    # 增加中文句子切分,通过languge == "zh",进行启用
+    # Chinese sentence splitting, enabled when language == "zh"
     def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
         split_docs = []
 
         if self.language == "zh":
-            result = ChineseDocumentSplitter.chinese_sentence_split(doc.content)
+            result = self.chinese_sentence_split(doc.content)
         if self.language == "en":
             result = self.sentence_splitter.split_sentences(doc.content)  # type: ignore # None check is done in run()
@@ -316,8 +296,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
                 overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
                 previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})
 
+    @staticmethod
     def _number_of_sentences_to_keep(
-        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
     ) -> int:
         """
         Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
@@ -333,20 +314,16 @@ class ChineseDocumentSplitter(DocumentSplitter):
         num_sentences_to_keep = 0
         num_words = 0
 
-        # chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
-        # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
-
         # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
+
         for sent in reversed(sentences[1:]):
             if language == "zh" and particle_size == "coarse":
-                num_words += len(self.chinese_tokenizer_coarse(sent))
-                # num_words += len(sent.split())
+                num_words += len(chinese_tokenizer_coarse(sent))
             if language == "zh" and particle_size == "fine":
-                num_words += len(self.chinese_tokenizer_fine(sent))
+                num_words += len(chinese_tokenizer_fine(sent))
             # If the number of words is larger than the split_length then don't add any more sentences
             if num_words > split_length:
                 break
             num_sentences_to_keep += 1
             if num_words > split_overlap:
                 break
-
         return num_sentences_to_keep
diff --git a/pyproject.toml b/pyproject.toml
index ae9362d3d..c7080d6cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -293,6 +293,7 @@ warn_unused_configs = true
 ignore_missing_imports = true
 check_untyped_defs = true
 
+
 [[tool.mypy.overrides]]
 # TODO: Fix component typings
 module = ["haystack.components.*", "haystack.testing.*"]
diff --git a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml b/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml
index 07dacf43a..ca84e705c 100644
--- a/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml
+++ b/releasenotes/notes/add-current-date-promptbuilder-ff60c846f5a70dc6.yaml
@@ -2,15 +2,15 @@ enhancements:
   - |
     Allow the ability to add the current date inside a template in `PromptBuilder` using the following syntax:
-
+
     - `{% now 'UTC' %}`: Get the current date for the UTC timezone.
-
+
    - `{% now 'America/Chicago' + 'hours=2' %}`: Add two hours to the current date in the Chicago timezone.
-
+
     - `{% now 'Europe/Berlin' - 'weeks=2' %}`: Subtract two weeks from the current date in the Berlin timezone.
-
+
     - `{% now 'Pacific/Fiji' + 'hours=2', '%H' %}`: Display only the number of hours after adding two hours to the Fiji timezone.
-
+
     - `{% now 'Etc/GMT-4', '%I:%M %p' %}`: Change the date format to AM/PM for the GMT-4 timezone.
-
-    Note that if no date format is provided, the default will be `%Y-%m-%d %H:%M:%S`. Please refer to [list of tz database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) for a list of timezones.
\ No newline at end of file
+
+    Note that if no date format is provided, the default will be `%Y-%m-%d %H:%M:%S`. Please refer to [list of tz database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) for a list of timezones.
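
For reviewers, a minimal sketch of how the `{% now %}` syntax described in the release note above can be used with `PromptBuilder`; the template text and the `question` variable are illustrative examples, not part of this patch:

    from haystack.components.builders import PromptBuilder

    # The template uses the `{% now %}` tag documented in the release note above.
    # When no format string is given, the date renders as %Y-%m-%d %H:%M:%S.
    template = """
    Current date: {% now 'UTC' %}
    Answer the following question: {{ question }}
    """

    builder = PromptBuilder(template=template)
    result = builder.run(question="What day is it today?")
    print(result["prompt"])  # rendered prompt containing the current UTC date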
diff --git a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml b/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml
index ccc967500..6be80ce80 100644
--- a/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml
+++ b/releasenotes/notes/single-meta-in-azureconverter-ce1cc196a9b161f3.yaml
@@ -2,4 +2,3 @@ enhancements:
   - |
     Adds support for single metadata dictionary input in `AzureOCRDocumentConverter`.
     In this way, additional metadata can be added to all files processed by this component even when the length of the list of sources is unknown.
-
diff --git a/test/components/preprocessors/test_chinese_document_splitter.py b/test/components/preprocessors/test_chinese_document_splitter.py
new file mode 100644
index 000000000..8837419e3
--- /dev/null
+++ b/test/components/preprocessors/test_chinese_document_splitter.py
@@ -0,0 +1,129 @@
+import pytest
+from haystack import Document
+from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter
+
+
+class TestChineseDocumentSplitter:
+    @pytest.fixture
+    def sample_text(self) -> str:
+        return "这是第一句话,也是故事的开端,紧接着是第二句话,渐渐引出了背景;随后,翻开新/f的一页,我们读到了这一页的第一句话,继续延展出情节的发展,直到这页的第二句话将整段文字温柔地收束于平静之中。"
+
+    def test_split_by_word(self, sample_text):
+        """
+        Test splitting by word.
+
+        Note on Chinese words:
+        Unlike English where words are usually separated by spaces,
+        Chinese text is written continuously without spaces between words.
+        Chinese words can consist of multiple characters.
+        For example, the English word "America" is translated to "美国" in Chinese,
+        which consists of two characters but is treated as a single word.
+        Similarly, "Portugal" is "葡萄牙" in Chinese,
+        three characters but one word.
+        Therefore, splitting by word means splitting by these multi-character tokens,
+        not simply by single characters or spaces.
+        """
+        splitter = chinese_DocumentSplitter(
+            split_by="word", language="zh", particle_size="coarse", split_length=5, split_overlap=0
+        )
+        if hasattr(splitter, "warm_up"):
+            splitter.warm_up()
+
+        result = splitter.run(documents=[Document(content=sample_text)])
+        docs = result["documents"]
+
+        assert all(isinstance(doc, Document) for doc in docs)
+        assert all(len(doc.content.strip()) <= 10 for doc in docs)
+
+    def test_split_by_sentence(self, sample_text):
+        splitter = chinese_DocumentSplitter(
+            split_by="sentence", language="zh", particle_size="coarse", split_length=10, split_overlap=0
+        )
+        if hasattr(splitter, "warm_up"):
+            splitter.warm_up()
+
+        result = splitter.run(documents=[Document(content=sample_text)])
+        docs = result["documents"]
+
+        assert all(isinstance(doc, Document) for doc in docs)
+        assert all(doc.content.strip() != "" for doc in docs)
+        assert any("。" in doc.content for doc in docs), "Expected at least one chunk containing a full stop."
+
+    def test_respect_sentence_boundary(self):
+        """Test that respect_sentence_boundary=True avoids splitting sentences"""
+        text = "这是第一句话,这是第二句话,这是第三句话。这是第四句话,这是第五句话,这是第六句话!这是第七句话,这是第八句话,这是第九句话?"
+        doc = Document(content=text)
+
+        splitter = chinese_DocumentSplitter(
+            split_by="word", split_length=10, split_overlap=3, language="zh", respect_sentence_boundary=True
+        )
+        splitter.warm_up()
+        result = splitter.run(documents=[doc])
+        docs = result["documents"]
+
+        print(f"Total chunks created: {len(docs)}.")
+        for i, d in enumerate(docs):
+            print(f"\nChunk {i + 1}:\n{d.content}")
+            # Optional: check that sentences are not cut off
+            assert d.content.strip().endswith(("。", "!", "?")), "Sentence was cut off!"
+
+    def test_overlap_chunks_with_long_text(self):
+        """Test split_overlap parameter to ensure there is clear overlap between chunks of long text"""
+        text = (
+            "月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。"
+            "树叶在微风中沙沙作响,影子在地面上摇曳不定。"
+            "一只猫头鹰静静地眨了眨眼,从枝头注视着四周……"
+            "远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。"
+            "“咔嚓”一声,某处的树枝突然断裂,然后恢复了寂静。"
+            "空气中弥漫着松树与湿土的气息,令人心安。"
+            "一只狐狸悄然出现,又迅速消失在灌木丛中。"
+            "天上的星星闪烁着,仿佛在诉说古老的故事。"
+            "时间仿佛停滞了……"
+            "万物静候,聆听着夜的呼吸!"
+        )
+        doc = Document(content=text)
+
+        splitter = chinese_DocumentSplitter(
+            split_by="word", language="zh", split_length=30, split_overlap=10, particle_size="coarse"
+        )
+        if hasattr(splitter, "warm_up"):
+            splitter.warm_up()
+
+        result = splitter.run(documents=[doc])
+        docs = result["documents"]
+
+        print(f"Total chunks generated: {len(docs)}.")
+        for i, d in enumerate(docs):
+            print(f"\nChunk {i + 1}:\n{d.content}")
+
+        assert len(docs) > 1, "Expected multiple chunks to be generated"
+
+        max_len_allowed = 80  # Allow a somewhat relaxed max chunk length
+        assert all(len(doc.content) <= max_len_allowed for doc in docs), (
+            f"Some chunks exceed {max_len_allowed} characters"
+        )
+
+        def has_any_overlap(suffix: str, prefix: str) -> bool:
+            """
+            Check if suffix and prefix have at least one continuous overlapping character sequence.
+            Tries from longest possible overlap down to 1 character.
+            Returns True if any overlap found.
+            """
+            max_check_len = min(len(suffix), len(prefix))
+            for length in range(max_check_len, 0, -1):
+                if suffix[-length:] == prefix[:length]:
+                    return True
+            return False
+
+        for i in range(1, len(docs)):
+            prev_chunk = docs[i - 1].content
+            curr_chunk = docs[i].content
+
+            # Take the last 20 chars of the previous chunk and the first 20 chars of the current chunk to check overlap
+            overlap_prev = prev_chunk[-20:]
+            overlap_curr = curr_chunk[:20]
+
+            assert has_any_overlap(overlap_prev, overlap_curr), (
+                f"Chunks {i} and {i + 1} do not overlap. "
+                f"Tail (up to 20 chars): '{overlap_prev}' vs Head (up to 20 chars): '{overlap_curr}'"
+            )
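
For reviewers, a minimal end-to-end usage sketch of the new component as exercised by the tests above; the settings and sample sentence are illustrative, and the guarded `warm_up()` call mirrors the tests rather than a documented requirement:

    from haystack import Document
    from haystack.components.preprocessors.chinese_document_spliter import chinese_DocumentSplitter

    # Coarse-grained word splitting with a 10-word overlap between chunks,
    # mirroring the configuration used in test_overlap_chunks_with_long_text.
    splitter = chinese_DocumentSplitter(
        split_by="word", language="zh", particle_size="coarse", split_length=30, split_overlap=10
    )
    if hasattr(splitter, "warm_up"):
        splitter.warm_up()

    result = splitter.run(documents=[Document(content="月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。")])
    for doc in result["documents"]:
        print(doc.content)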