From a1dea7f7ae7cb122d64461e57342939e6067cecf Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Thu, 5 Jun 2025 11:14:17 +0200
Subject: [PATCH] fixing linting issues

---
 .../chinese_document_splitter.py | 33 ++++++++++---------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/haystack/components/preprocessors/chinese_document_splitter.py b/haystack/components/preprocessors/chinese_document_splitter.py
index 57b9aee86..c2fbb11c7 100644
--- a/haystack/components/preprocessors/chinese_document_splitter.py
+++ b/haystack/components/preprocessors/chinese_document_splitter.py
@@ -19,10 +19,11 @@ logger = logging.getLogger(__name__)
 # mapping of split by character, 'function' and 'sentence' don't split by character
 _CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
-chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
-chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
+
+# chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
+# chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
 
 # Load Chinese sentence slicer
-split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
+# split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
 
 
 @component
@@ -85,7 +86,7 @@ class ChineseDocumentSplitter(DocumentSplitter):
     def chinese_sentence_split(self, text: str) -> list:
         """Split Chinese text into sentences."""
         # Split sentences
-        sentences = split_sent(text)
+        sentences = self.split_sent(text)
 
         # Organize the format of segmented sentences
         results = []
@@ -107,9 +108,8 @@ class ChineseDocumentSplitter(DocumentSplitter):
 
         return self._split_by_character(doc)
 
-    @staticmethod
-    def _concatenate_sentences_based_on_word_amount(
-        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+    def _concatenate_sentences_based_on_word_amount(  # pylint: disable=too-many-positional-arguments
+        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
     ) -> Tuple[List[str], List[int], List[int]]:
         """
         Groups the sentences into chunks of `split_length` words while respecting sentence boundaries.
@@ -136,16 +136,18 @@ class ChineseDocumentSplitter(DocumentSplitter):
         for sentence_idx, sentence in enumerate(sentences):
             current_chunk.append(sentence)
             if language == "zh" and particle_size == "coarse":
-                chunk_word_count += len(chinese_tokenizer_coarse(sentence))
+                chunk_word_count += len(self.chinese_tokenizer_coarse(sentence))
                 next_sentence_word_count = (
-                    len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
+                    len(self.chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
                     if sentence_idx < len(sentences) - 1
                     else 0
                 )
             if language == "zh" and particle_size == "fine":
-                chunk_word_count += len(chinese_tokenizer_fine(sentence))
+                chunk_word_count += len(self.chinese_tokenizer_fine(sentence))
                 next_sentence_word_count = (
-                    len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
+                    len(self.chinese_tokenizer_fine(sentences[sentence_idx + 1]))
+                    if sentence_idx < len(sentences) - 1
+                    else 0
                 )
 
             # Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -327,9 +329,8 @@ class ChineseDocumentSplitter(DocumentSplitter):
                     overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
                     previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})
 
-    @staticmethod
-    def _number_of_sentences_to_keep(
-        sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
+    def _number_of_sentences_to_keep(  # pylint: disable=too-many-positional-arguments
+        self, sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
    ) -> int:
         """
         Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.
@@ -348,9 +349,9 @@ class ChineseDocumentSplitter(DocumentSplitter):
 
         for sent in reversed(sentences[1:]):
             if language == "zh" and particle_size == "coarse":
-                num_words += len(chinese_tokenizer_coarse(sent))
+                num_words += len(self.chinese_tokenizer_coarse(sent))
             if language == "zh" and particle_size == "fine":
-                num_words += len(chinese_tokenizer_fine(sent))
+                num_words += len(self.chinese_tokenizer_fine(sent))
             # If the number of words is larger than the split_length then don't add any more sentences
             if num_words > split_length:
                 break
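
Note (sketch, not part of the patch): the hunks above replace the module-level hanlp models with
instance attributes (self.chinese_tokenizer_coarse, self.chinese_tokenizer_fine, self.split_sent),
but the diff itself does not show where those attributes are created. Below is a minimal sketch of
how they could be loaded lazily on the instance; the warm_up() placement and the DocumentSplitter
import path are assumptions, not taken from this diff.

    # Sketch under assumptions: load the hanlp models on the instance (e.g. in warm_up())
    # so that importing the module no longer triggers model downloads at import time.
    import hanlp

    from haystack.components.preprocessors import DocumentSplitter


    class ChineseDocumentSplitter(DocumentSplitter):
        def warm_up(self) -> None:
            """Load the tokenizers and the sentence splitter used by the splitting helpers."""
            self.chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
            self.chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
            self.split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)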