From 7b2d0380985b8293d68d2c07fba821178e22a0fa Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Wed, 4 Jun 2025 18:00:57 +0200
Subject: [PATCH] fixing lazy import

---
 .../preprocessors/chinese_document_spliter.py | 50 +++++++++----------
 pyproject.toml                                |  4 +-
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/haystack/components/preprocessors/chinese_document_spliter.py b/haystack/components/preprocessors/chinese_document_spliter.py
index 0fba3a7e9..7306e65d7 100644
--- a/haystack/components/preprocessors/chinese_document_spliter.py
+++ b/haystack/components/preprocessors/chinese_document_spliter.py
@@ -20,17 +20,11 @@ logger = logging.getLogger(__name__)
 # mapping of split by character, 'function' and 'sentence' don't split by character
 _CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
 
-hanlp_import.check()
-
-chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
-chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
-split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # 加载中文的句子切分器
-
 
 @component
-class ChineseDocumentspliter(DocumentSplitter):
+class ChineseDocumentSplitter(DocumentSplitter):
     def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
-        super(ChineseDocumentspliter, self).__init__(*args, **kwargs)
+        super(ChineseDocumentSplitter, self).__init__(*args, **kwargs)
 
         # coarse代表粗颗粒度中文分词,fine代表细颗粒度分词,默认为粗颗粒度分词
         # 'coarse' represents coarse granularity Chinese word segmentation, 'fine' represents fine granularity word
@@ -42,13 +36,19 @@ class ChineseDocumentspliter(DocumentSplitter):
         # # 加载中文的句子切分器
         # self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
 
+        hanlp_import.check()
+
+        self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+        self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
+        self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)  # 加载中文的句子切分器
+
     def _split_by_character(self, doc) -> List[Document]:
         split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
         if self.language == "zh" and self.particle_size == "coarse":
-            units = chinese_tokenizer_coarse(doc.content)
+            units = self.chinese_tokenizer_coarse(doc.content)
         if self.language == "zh" and self.particle_size == "fine":
-            units = chinese_tokenizer_fine(doc.content)
+            units = self.chinese_tokenizer_fine(doc.content)
         if self.language == "en":
             units = doc.content.split(split_at)
 
         # Add the delimiter back to all units except the last one
@@ -64,8 +64,7 @@ class ChineseDocumentspliter(DocumentSplitter):
         )
 
     # 定义一个函数用于处理中文分句
-    @staticmethod
-    def chinese_sentence_split(text: str) -> list:
+    def chinese_sentence_split(self, text: str) -> list:
         """
         Segmentation of Chinese text.
 
@@ -73,7 +72,7 @@ class ChineseDocumentspliter(DocumentSplitter):
         :param text: The Chinese text to be segmented.
         :returns: A list of dictionaries, each containing a sentence and its start and end indices.
""" # 分句 - sentences = split_sent(text) + sentences = self.split_sent(text) # 整理格式 results = [] @@ -95,9 +94,8 @@ class ChineseDocumentspliter(DocumentSplitter): return self._split_by_character(doc) - @staticmethod def _concatenate_sentences_based_on_word_amount( - sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str + self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str ) -> Tuple[List[str], List[int], List[int]]: """ Groups the sentences into chunks of `split_length` words while respecting sentence boundaries. @@ -124,16 +122,18 @@ class ChineseDocumentspliter(DocumentSplitter): for sentence_idx, sentence in enumerate(sentences): current_chunk.append(sentence) if language == "zh" and particle_size == "coarse": - chunk_word_count += len(chinese_tokenizer_coarse(sentence)) + chunk_word_count += len(self.chinese_tokenizer_coarse(sentence)) next_sentence_word_count = ( - len(chinese_tokenizer_coarse(sentences[sentence_idx + 1])) + len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0 ) if language == "zh" and particle_size == "fine": - chunk_word_count += len(chinese_tokenizer_fine(sentence)) + chunk_word_count += len(self.chinese_tokenizer_fine(sentence)) next_sentence_word_count = ( - len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0 + len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1])) + if sentence_idx < len(sentences) - 1 + else 0 ) # Number of words in the current chunk plus the next sentence is larger than the split_length, @@ -145,7 +145,8 @@ class ChineseDocumentspliter(DocumentSplitter): split_start_indices.append(chunk_start_idx) # Get the number of sentences that overlap with the next chunk - num_sentences_to_keep = ChineseDocumentspliter._number_of_sentences_to_keep( + num_sentences_to_keep = ChineseDocumentSplitter._number_of_sentences_to_keep( + self, sentences=current_chunk, split_length=split_length, split_overlap=split_overlap, @@ -182,7 +183,7 @@ class ChineseDocumentspliter(DocumentSplitter): split_docs = [] if self.language == "zh": - result = ChineseDocumentspliter.chinese_sentence_split(doc.content) + result = ChineseDocumentSplitter.chinese_sentence_split(doc.content) if self.language == "en": result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run() @@ -315,9 +316,8 @@ class ChineseDocumentspliter(DocumentSplitter): overlapping_range = (0, overlapping_range[1] - overlapping_range[0]) previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range}) - @staticmethod def _number_of_sentences_to_keep( - sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str + self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str ) -> int: """ Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`. 
@@ -338,10 +338,10 @@ class ChineseDocumentspliter(DocumentSplitter):
         # Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
         for sent in reversed(sentences[1:]):
             if language == "zh" and particle_size == "coarse":
-                num_words += len(chinese_tokenizer_coarse(sent))
+                num_words += len(self.chinese_tokenizer_coarse(sent))
                 # num_words += len(sent.split())
             if language == "zh" and particle_size == "fine":
-                num_words += len(chinese_tokenizer_fine(sent))
+                num_words += len(self.chinese_tokenizer_fine(sent))
             # If the number of words is larger than the split_length then don't add any more sentences
             if num_words > split_length:
                 break
diff --git a/pyproject.toml b/pyproject.toml
index 07f84e6f4..ae9362d3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -114,8 +114,8 @@ dependencies = [
   "python-oxmsg",   # MSGToDocument
 
   "nltk>=3.9.1",    # NLTKDocumentSplitter, RecursiveDocumentSplitter
-  "tiktoken",  # RecursiveDocumentSplitter
-  "hanlp", # ChineseDocumentSplitter
+  "tiktoken",       # RecursiveDocumentSplitter
+  "hanlp",          # ChineseDocumentSplitter
 
   # OpenAPI
   "jsonref",        # OpenAPIServiceConnector, OpenAPIServiceToFunctions
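
Note on the change above: the patch moves every hanlp model load out of module scope and into ChineseDocumentSplitter.__init__, guarded by hanlp_import.check(), so importing Haystack no longer requires hanlp to be installed. Below is a minimal sketch of that deferred-import pattern, assuming the haystack.lazy_imports.LazyImport helper that defines hanlp_import near the top of this module; the class name is illustrative only, not the real component.

    from haystack.lazy_imports import LazyImport

    # The import is attempted here, but a missing package is tolerated and the
    # error is only raised later by hanlp_import.check().
    with LazyImport("Run 'pip install hanlp'") as hanlp_import:
        import hanlp

    class LazyHanlpUser:  # hypothetical name, for illustration only
        def __init__(self) -> None:
            # Fails with a helpful ImportError when the component is built,
            # not when the package is imported.
            hanlp_import.check()
            # Model loading is likewise deferred to construction time.
            self.tokenizer = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)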