diff --git a/docs/_src/tutorials/tutorials/8.md b/docs/_src/tutorials/tutorials/8.md index bae35a5a4..cabe68e9e 100644 --- a/docs/_src/tutorials/tutorials/8.md +++ b/docs/_src/tutorials/tutorials/8.md @@ -118,7 +118,7 @@ and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages # This is a default usage of the PreProcessor. # Here, it performs cleaning of consecutive whitespaces # and splits a single large document into smaller documents. -# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences +# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences # Note how the single document passed into the document gets split into 5 smaller documents preprocessor = PreProcessor( diff --git a/tutorials/Tutorial8_Preprocessing.ipynb b/tutorials/Tutorial8_Preprocessing.ipynb index dd34847bb..5056df911 100644 --- a/tutorials/Tutorial8_Preprocessing.ipynb +++ b/tutorials/Tutorial8_Preprocessing.ipynb @@ -286,7 +286,7 @@ "# This is a default usage of the PreProcessor.\n", "# Here, it performs cleaning of consecutive whitespaces\n", "# and splits a single large document into smaller documents.\n", - "# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences\n", + "# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n", "# Note how the single document passed into the document gets split into 5 smaller documents\n", "\n", "preprocessor = PreProcessor(\n", diff --git a/tutorials/Tutorial8_Preprocessing.py b/tutorials/Tutorial8_Preprocessing.py index 9659e1a37..99392cdf4 100644 --- a/tutorials/Tutorial8_Preprocessing.py +++ b/tutorials/Tutorial8_Preprocessing.py @@ -77,7 +77,7 @@ def tutorial8_preprocessing(): # This is a default usage of the PreProcessor. # Here, it performs cleaning of consecutive whitespaces # and splits a single large document into smaller documents. - # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences + # Each document is up to 100 words long and document breaks cannot fall in the middle of sentences # Note how the single document passed into the document gets split into 5 smaller documents preprocessor = PreProcessor( @@ -85,7 +85,7 @@ def tutorial8_preprocessing(): clean_whitespace=True, clean_header_footer=False, split_by="word", - split_length=1000, + split_length=100, split_respect_sentence_boundary=True, ) docs_default = preprocessor.process([doc_txt])