docs: Fix the word length splitting; should be set to 100 not 1,000 (#3133)

* Fix the word length splitting; should be set to 100 not 1,000 due to limitations of transformer models
* Update documentation for tutorial change
2025-11-10 23:04:02 +00:00 · 2022-09-07 09:57:54 +01:00 · 2022-09-07 09:57:54 +01:00 · 9a750f7032
commit 9a750f7032
parent 84acb6584f
3 changed files with 4 additions and 4 deletions
--- a/docs/_src/tutorials/tutorials/8.md
+++ b/docs/_src/tutorials/tutorials/8.md
@ -118,7 +118,7 @@ and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages
 # This is a default usage of the PreProcessor.
 # Here, it performs cleaning of consecutive whitespaces
 # and splits a single large document into smaller documents.
-# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
 # Note how the single document passed into the document gets split into 5 smaller documents

 preprocessor = PreProcessor(
--- a/tutorials/Tutorial8_Preprocessing.ipynb
+++ b/tutorials/Tutorial8_Preprocessing.ipynb
@ -286,7 +286,7 @@
    "# This is a default usage of the PreProcessor.\n",
    "# Here, it performs cleaning of consecutive whitespaces\n",
    "# and splits a single large document into smaller documents.\n",
-    "# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences\n",
+    "# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n",
    "# Note how the single document passed into the document gets split into 5 smaller documents\n",
    "\n",
    "preprocessor = PreProcessor(\n",
--- a/tutorials/Tutorial8_Preprocessing.py
+++ b/tutorials/Tutorial8_Preprocessing.py
@ -77,7 +77,7 @@ def tutorial8_preprocessing():
    # This is a default usage of the PreProcessor.
    # Here, it performs cleaning of consecutive whitespaces
    # and splits a single large document into smaller documents.
-    # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+    # Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
    # Note how the single document passed into the document gets split into 5 smaller documents

    preprocessor = PreProcessor(
@ -85,7 +85,7 @@ def tutorial8_preprocessing():
        clean_whitespace=True,
        clean_header_footer=False,
        split_by="word",
-        split_length=1000,
+        split_length=100,
        split_respect_sentence_boundary=True,
    )
    docs_default = preprocessor.process([doc_txt])