docs: Fix the word-length splitting; should be set to 100, not 1,000 (#3133)

* Fix the word-length splitting: split_length should be set to 100, not 1,000, due to the input-length limitations of transformer models

* Update documentation for tutorial change
This commit is contained in:
Steven Haley 2022-09-07 09:57:54 +01:00 committed by GitHub
parent 84acb6584f
commit 9a750f7032
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 4 additions and 4 deletions

View File

@ -118,7 +118,7 @@ and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages
# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the document gets split into 5 smaller documents
preprocessor = PreProcessor(

View File

@ -286,7 +286,7 @@
"# This is a default usage of the PreProcessor.\n",
"# Here, it performs cleaning of consecutive whitespaces\n",
"# and splits a single large document into smaller documents.\n",
"# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences\n",
"# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n",
"# Note how the single document passed into the document gets split into 5 smaller documents\n",
"\n",
"preprocessor = PreProcessor(\n",

View File

@ -77,7 +77,7 @@ def tutorial8_preprocessing():
# This is a default usage of the PreProcessor.
# Here, it performs cleaning of consecutive whitespaces
# and splits a single large document into smaller documents.
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
# Note how the single document passed into the document gets split into 5 smaller documents
preprocessor = PreProcessor(
@ -85,7 +85,7 @@ def tutorial8_preprocessing():
clean_whitespace=True,
clean_header_footer=False,
split_by="word",
split_length=1000,
split_length=100,
split_respect_sentence_boundary=True,
)
docs_default = preprocessor.process([doc_txt])