mirror of https://github.com/deepset-ai/haystack.git, synced 2025-08-19 22:18:11 +00:00
docs: Fix the word length splitting; should be set to 100 not 1,000 (#3133)
* Fix the word length splitting; should be set to 100, not 1,000, due to limitations of transformer models
* Update documentation for tutorial change
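For context, here is a minimal sketch of the corrected configuration, assuming the Haystack v1.x import path (haystack.nodes) that this tutorial uses. Roughly 100 words usually tokenizes to well under the 512-token input limit of BERT-style models, whereas 1,000-word documents would be truncated.

from haystack.nodes import PreProcessor  # import path assumed from Haystack v1.x

# Corrected tutorial settings: split into ~100-word documents so each split
# fits within typical transformer input limits.
preprocessor = PreProcessor(
    clean_whitespace=True,                 # collapse runs of consecutive whitespace
    clean_header_footer=False,
    split_by="word",
    split_length=100,                      # 100, not 1,000
    split_respect_sentence_boundary=True,  # never cut a document mid-sentence
)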
This commit is contained in:
parent 84acb6584f
commit 9a750f7032
@@ -118,7 +118,7 @@ and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages
 # This is a default usage of the PreProcessor.
 # Here, it performs cleaning of consecutive whitespaces
 # and splits a single large document into smaller documents.
-# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
 # Note how the single document passed into the document gets split into 5 smaller documents
 
 preprocessor = PreProcessor(
@@ -286,7 +286,7 @@
 "# This is a default usage of the PreProcessor.\n",
 "# Here, it performs cleaning of consecutive whitespaces\n",
 "# and splits a single large document into smaller documents.\n",
-"# Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences\n",
+"# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences\n",
 "# Note how the single document passed into the document gets split into 5 smaller documents\n",
 "\n",
 "preprocessor = PreProcessor(\n",
@@ -77,7 +77,7 @@ def tutorial8_preprocessing():
     # This is a default usage of the PreProcessor.
     # Here, it performs cleaning of consecutive whitespaces
     # and splits a single large document into smaller documents.
-    # Each document is up to 1000 words long and document breaks cannot fall in the middle of sentences
+    # Each document is up to 100 words long and document breaks cannot fall in the middle of sentences
     # Note how the single document passed into the document gets split into 5 smaller documents
 
     preprocessor = PreProcessor(
@@ -85,7 +85,7 @@ def tutorial8_preprocessing():
         clean_whitespace=True,
         clean_header_footer=False,
         split_by="word",
-        split_length=1000,
+        split_length=100,
         split_respect_sentence_boundary=True,
     )
     docs_default = preprocessor.process([doc_txt])
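A quick sanity check after the fix, sketched under the assumption of Haystack v1.x Document objects (which expose the text as .content) and the tutorial's doc_txt variable:

# Hypothetical check: confirm the splits match the tutorial's expectations.
docs_default = preprocessor.process([doc_txt])
print(len(docs_default))  # the tutorial comments expect 5 smaller documents
print(max(len(d.content.split()) for d in docs_default))  # each split near or under the 100-word cap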