From 34552f2e5d6231da0c9b6a0600f23315793a9f5f Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Wed, 4 Jun 2025 17:36:55 +0200
Subject: [PATCH] cleaning up

---
 .../preprocessors/chinese_document_spliter.py | 142 +++++++++---------
 1 file changed, 70 insertions(+), 72 deletions(-)

diff --git a/haystack/components/preprocessors/chinese_document_spliter.py b/haystack/components/preprocessors/chinese_document_spliter.py
index 321664195..0fba3a7e9 100644
--- a/haystack/components/preprocessors/chinese_document_spliter.py
+++ b/haystack/components/preprocessors/chinese_document_spliter.py
@@ -2,42 +2,39 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-
-from haystack.components.preprocessors import DocumentSplitter
 from copy import deepcopy
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
-from haystack.lazy_imports import LazyImport
+from typing import Any, Dict, List, Literal, Tuple
+
 from more_itertools import windowed
-from haystack import Document, component, logging
-from haystack.components.preprocessors.sentence_tokenizer import Language, SentenceSplitter, nltk_imports
-from haystack.core.serialization import default_from_dict, default_to_dict
-from haystack.utils import deserialize_callable, serialize_callable
-with LazyImport("Run 'pip install hanlp'") as hanlp:
-    import hanlp
+
+from haystack import Document, component, logging
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install hanlp'") as hanlp_import:
+    import hanlp

 logger = logging.getLogger(__name__)

 # mapping of split by character, 'function' and 'sentence' don't split by character
-_CHARACTER_SPLIT_BY_MAPPING = {
-    "page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
-chinese_tokenizer_coarse = hanlp.load(
-    hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
+
+hanlp_import.check()
+
+chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
 chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
-# 加载中文的句子切分器
-split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
+split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # load the Chinese sentence splitter


 @component
-class chinese_DocumentSpliter(DocumentSplitter):
-
+class ChineseDocumentSplitter(DocumentSplitter):
     def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
-        super(chinese_DocumentSpliter, self).__init__(*args, **kwargs)
+        super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)
         # coarse代表粗颗粒度中文分词,fine代表细颗粒度分词,默认为粗颗粒度分词
-        # 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word segmentation, default is coarse granularity word segmentation
+        # 'coarse' selects coarse-grained Chinese word segmentation, 'fine' selects fine-grained word
+        # segmentation; the default is coarse-grained segmentation
         self.particle_size = particle_size
         # self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
         # self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
@@ -47,12 +44,12 @@ class chinese_DocumentSpliter(DocumentSplitter):

     def _split_by_character(self, doc) -> List[Document]:
         split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
-        if self.language == 'zh' and self.particle_size == "coarse":
+        if self.language == "zh" and self.particle_size == "coarse":
             units = chinese_tokenizer_coarse(doc.content)
-        if self.language == 'zh' and self.particle_size == "fine":
+        if self.language == "zh" and self.particle_size == "fine":
             units = chinese_tokenizer_fine(doc.content)
-        if self.language == 'en':
+        if self.language == "en":
             units = doc.content.split(split_at)
         # Add the delimiter back to all units except the last one
         for i in range(len(units) - 1):
@@ -67,7 +64,14 @@ class chinese_DocumentSpliter(DocumentSplitter):
         )

     # 定义一个函数用于处理中文分句
-    def chinese_sentence_split(self, text: str) -> list:
+    @staticmethod
+    def chinese_sentence_split(text: str) -> list:
+        """
+        Splits Chinese text into sentences.
+
+        :param text: The Chinese text to be segmented.
+        :returns: A list of dictionaries, each containing a sentence and its start and end indices.
+        """
         # 分句
         sentences = split_sent(text)

@@ -77,11 +81,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
         for sentence in sentences:
             start = text.find(sentence, start)
             end = start + len(sentence)
-            results.append({
-                'sentence': sentence + '\n',
-                'start': start,
-                'end': end
-            })
+            results.append({"sentence": sentence + "\n", "start": start, "end": end})
             start = end

         return results
@@ -123,17 +123,17 @@ class chinese_DocumentSpliter(DocumentSplitter):
         # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
         for sentence_idx, sentence in enumerate(sentences):
             current_chunk.append(sentence)
-            if language == 'zh' and particle_size == "coarse":
+            if language == "zh" and particle_size == "coarse":
                 chunk_word_count += len(chinese_tokenizer_coarse(sentence))
                 next_sentence_word_count = (
-                    len(chinese_tokenizer_coarse(
-                        sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
+                    len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
+                    if sentence_idx < len(sentences) - 1
+                    else 0
                 )
-            if language == 'zh' and particle_size == "fine":
+            if language == "zh" and particle_size == "fine":
                 chunk_word_count += len(chinese_tokenizer_fine(sentence))
                 next_sentence_word_count = (
-                    len(chinese_tokenizer_fine(
-                        sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
+                    len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
                 )

             # Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -145,25 +145,25 @@ class chinese_DocumentSpliter(DocumentSplitter):
                 split_start_indices.append(chunk_start_idx)

                 # Get the number of sentences that overlap with the next chunk
-                num_sentences_to_keep = chinese_DocumentSpliter._number_of_sentences_to_keep(
-                    sentences=current_chunk, split_length=split_length, split_overlap=split_overlap, language=language, particle_size=particle_size
+                num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep(
+                    sentences=current_chunk,
+                    split_length=split_length,
+                    split_overlap=split_overlap,
+                    language=language,
+                    particle_size=particle_size,
                 )
                 # Set up information for the new chunk
                 if num_sentences_to_keep > 0:
                     # Processed sentences are the ones that are not overlapping with the next chunk
-                    processed_sentences = current_chunk[:-
-                                                        num_sentences_to_keep]
-                    chunk_starting_page_number += sum(sent.count("\f")
-                                                      for sent in processed_sentences)
+                    processed_sentences = current_chunk[:-num_sentences_to_keep]
+                    chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences)
                     chunk_start_idx += len("".join(processed_sentences))
                     # Next chunk starts with the sentences that were overlapping with the previous chunk
                     current_chunk = current_chunk[-num_sentences_to_keep:]
-                    chunk_word_count = sum(len(s.split())
-                                           for s in current_chunk)
+                    chunk_word_count = sum(len(s.split()) for s in current_chunk)
                 else:
                     # Here processed_sentences is the same as current_chunk since there is no overlap
-                    chunk_starting_page_number += sum(sent.count("\f")
-                                                      for sent in current_chunk)
+                    chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk)
                     chunk_start_idx += len("".join(current_chunk))
                     current_chunk = []
                     chunk_word_count = 0
@@ -181,18 +181,21 @@ class chinese_DocumentSpliter(DocumentSplitter):

     def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
         split_docs = []
-        if self.language == 'zh':
-            result = self.chinese_sentence_split(doc.content)
-        if self.language == 'en':
-            result = self.sentence_splitter.split_sentences(
-                doc.content)  # type: ignore # None check is done in run()
+        if self.language == "zh":
+            result = ChineseDocumentSplitter.chinese_sentence_split(doc.content)
+        if self.language == "en":
+            result = self.sentence_splitter.split_sentences(doc.content)  # type: ignore # None check is done in run()

         units = [sentence["sentence"] for sentence in result]

         if self.respect_sentence_boundary:
             text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount(
-                sentences=units, split_length=self.split_length, split_overlap=self.split_overlap, language=self.language,
-                particle_size=self.particle_size)
+                sentences=units,
+                split_length=self.split_length,
+                split_overlap=self.split_overlap,
+                language=self.language,
+                particle_size=self.particle_size,
+            )
         else:
             text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
                 elements=units,
@@ -224,8 +227,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
         splits_start_idxs: List[int] = []
         cur_start_idx = 0
         cur_page = 1
-        segments = windowed(elements, n=split_length,
-                            step=split_length - split_overlap)
+        segments = windowed(elements, n=split_length, step=split_length - split_overlap)

         for seg in segments:
             current_units = [unit for unit in seg if unit is not None]
@@ -248,8 +250,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
             if self.split_by == "page":
                 num_page_breaks = len(processed_units)
             else:
-                num_page_breaks = sum(processed_unit.count("\f")
-                                      for processed_unit in processed_units)
+                num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)

             cur_page += num_page_breaks

@@ -282,11 +283,10 @@ class chinese_DocumentSpliter(DocumentSplitter):
                 doc_start_idx = splits_start_idxs[i]
                 previous_doc = documents[i - 1]
                 previous_doc_start_idx = splits_start_idxs[i - 1]
-                self._add_split_overlap_information(
-                    doc, doc_start_idx, previous_doc, previous_doc_start_idx)
+                self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

         for d in documents:
-            d.content=d.content.replace(" ","")
+            d.content = d.content.replace(" ", "")
         return documents

     @staticmethod
@@ -301,26 +301,24 @@ class chinese_DocumentSpliter(DocumentSplitter):
         :param previous_doc: The Document that was split before the current Document.
         :param previous_doc_start_idx: The starting index of the previous Document.
         """
-        overlapping_range = (current_doc_start_idx - previous_doc_start_idx,
-                             len(previous_doc.content))  # type: ignore
+        overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content))  # type: ignore

         if overlapping_range[0] < overlapping_range[1]:  # type: ignore
-            overlapping_str = previous_doc.content[overlapping_range[0]: overlapping_range[1]]
+            overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]]

             if current_doc.content.startswith(overlapping_str):  # type: ignore
                 # add split overlap information to this Document regarding the previous Document
-                current_doc.meta["_split_overlap"].append(
-                    {"doc_id": previous_doc.id, "range": overlapping_range})
+                current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})

                 # add split overlap information to previous Document regarding this Document
-                overlapping_range = (
-                    0, overlapping_range[1] - overlapping_range[0])
-                previous_doc.meta["_split_overlap"].append(
-                    {"doc_id": current_doc.id, "range": overlapping_range})
+                overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
+                previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

     @staticmethod
-    def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str) -> int:
+    def _number_of_sentences_to_keep(
+        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+    ) -> int:
         """
         Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.

@@ -339,10 +337,10 @@ class chinese_DocumentSpliter(DocumentSplitter):
         # chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
         # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
         for sent in reversed(sentences[1:]):
-            if language == 'zh' and particle_size == "coarse":
+            if language == "zh" and particle_size == "coarse":
                 num_words += len(chinese_tokenizer_coarse(sent))
                 # num_words += len(sent.split())
-            if language == 'zh' and particle_size == "fine":
+            if language == "zh" and particle_size == "fine":
                 num_words += len(chinese_tokenizer_fine(sent))
             # If the number of words is larger than the split_length then don't add any more sentences
             if num_words > split_length:
@@ -350,5 +348,5 @@ class chinese_DocumentSpliter(DocumentSplitter):
             num_sentences_to_keep += 1
             if num_words > split_overlap:
                 break
-        return num_sentences_to_keep
+        return num_sentences_to_keep