mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-19 14:08:19 +00:00
docs: Fix the word length splitting; should be set to 100 not 1,000 (#3133)
* Fix the word length splitting; should be set to 100, not 1,000, due to limitations of transformer models
* Update documentation for tutorial change
This commit is contained in:
parent
84acb6584f
commit
9a750f7032
@ -118,7 +118,7 @@ and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages
|
||||
# This is a default usage of the PreProcessor.
|
||||
# Here, it performs cleaning of consecutive whitespaces
|
||||
# and splits a single large document into smaller documents.
|
||||
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
|
||||
# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
|
||||
# Note how the single document passed into the PreProcessor gets split into 5 smaller documents
|
||||
|
||||
preprocessor = PreProcessor(
|
||||
|
@ -286,7 +286,7 @@
|
||||
"# This is a default usage of the PreProcessor.\n",
|
||||
"# Here, it performs cleaning of consecutive whitespaces\n",
|
||||
"# and splits a single large document into smaller documents.\n",
|
||||
"# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences\n",
|
||||
"# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n",
|
||||
"# Note how the single document passed into the document gets split into 5 smaller documents\n",
|
||||
"\n",
|
||||
"preprocessor = PreProcessor(\n",
|
||||
|
@ -77,7 +77,7 @@ def tutorial8_preprocessing():
|
||||
# This is a default usage of the PreProcessor.
|
||||
# Here, it performs cleaning of consecutive whitespaces
|
||||
# and splits a single large document into smaller documents.
|
||||
# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
|
||||
# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
|
||||
# Note how the single document passed into the PreProcessor gets split into 5 smaller documents
|
||||
|
||||
preprocessor = PreProcessor(
|
||||
@ -85,7 +85,7 @@ def tutorial8_preprocessing():
|
||||
clean_whitespace=True,
|
||||
clean_header_footer=False,
|
||||
split_by="word",
|
||||
split_length=1000,
|
||||
split_length=100,
|
||||
split_respect_sentence_boundary=True,
|
||||
)
|
||||
docs_default = preprocessor.process([doc_txt])
|
||||
|
Loading…
x
Reference in New Issue
Block a user