mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 07:59:27 +00:00
feat: hard document length limit at max_chars_check (#5191)
* implement hard cut at max_chars_check * regenerate ids * black * docstring * black
This commit is contained in:
parent
36192eca72
commit
31664627eb
@ -103,7 +103,10 @@ class PreProcessor(BasePreProcessor):
|
||||
field `"page"`. Page boundaries are determined by `"\f"` character which is added
|
||||
in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
|
||||
`AzureConverter`.
|
||||
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
|
||||
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than
|
||||
max_chars_check in characters after pre-processing will raise a warning and is going to be split at the
|
||||
`max_chars_check`-th char, regardless of any other constraint. If the resulting documents are still too long,
|
||||
they'll be cut again until all fragments are below the maximum allowed length.
|
||||
"""
|
||||
if remove_substrings is None:
|
||||
remove_substrings = []
|
||||
@ -186,7 +189,9 @@ class PreProcessor(BasePreProcessor):
|
||||
|
||||
def _long_documents(self, documents: List[Document], max_chars_check=10_000):
|
||||
"""
|
||||
Function that tries to detect unusually long documents.
|
||||
Function that tries to detect unusually long documents. When detected, such documents are going to be
|
||||
split at the `max_chars_check`-th char, regardless of any other constraint. If the resulting documents
|
||||
are still too long, they'll be cut again until all fragments are below the maximum allowed length.
|
||||
|
||||
NOTE: this function is a heuristic that is in place only because a proper fix that prevents such documents from forming
|
||||
would imply a complete revamp of this class, including better definitions of what the various units (word, sentence, passage) mean exactly.
|
||||
@ -195,11 +200,23 @@ class PreProcessor(BasePreProcessor):
|
||||
if len(document.content) > max_chars_check:
|
||||
logger.warning(
|
||||
"Document %s is %s characters long after preprocessing, where the maximum length should be %s. "
|
||||
"Something might be wrong with the splitting, check the document affected to prevent issues at query time.",
|
||||
"Something might be wrong with the splitting, check the document affected to prevent issues at "
|
||||
"query time. This document will be now hard-split at %s chars recursively.",
|
||||
document.id,
|
||||
len(document.content),
|
||||
max_chars_check,
|
||||
max_chars_check,
|
||||
)
|
||||
fields = document.to_dict()
|
||||
document.content = document.content[:max_chars_check]
|
||||
fields.pop("id")
|
||||
fields["content"] = fields["content"][max_chars_check:]
|
||||
# recursively check if tail_document is still too long
|
||||
tail_documents = self._long_documents(
|
||||
documents=[Document.from_dict(fields)], max_chars_check=max_chars_check
|
||||
)
|
||||
documents += tail_documents
|
||||
return documents
|
||||
|
||||
def _process_single(
|
||||
self,
|
||||
@ -250,7 +267,7 @@ class PreProcessor(BasePreProcessor):
|
||||
id_hash_keys=id_hash_keys,
|
||||
)
|
||||
|
||||
self._long_documents(split_documents, max_chars_check=self.max_chars_check)
|
||||
split_documents = self._long_documents(split_documents, max_chars_check=self.max_chars_check)
|
||||
|
||||
return split_documents
|
||||
|
||||
|
||||
@ -528,13 +528,12 @@ def test_preprocessor_very_long_document(caplog):
|
||||
preproc = PreProcessor(
|
||||
clean_empty_lines=False, clean_header_footer=False, clean_whitespace=False, split_by=None, max_chars_check=10
|
||||
)
|
||||
documents = [
|
||||
Document(content=f"this is a test document with more than max_char characters: {'1'*i}") for i in range(9)
|
||||
]
|
||||
documents = [Document(content=str(i) + (f"." * i)) for i in range(0, 30, 3)]
|
||||
results = preproc.process(documents)
|
||||
assert results == documents
|
||||
for i in range(5):
|
||||
assert f"is 6{i} characters long after preprocessing, where the maximum length should be 10." in caplog.text
|
||||
assert len(results) == 19
|
||||
assert any(d.content.startswith(".") for d in results)
|
||||
assert any(not d.content.startswith(".") for d in results)
|
||||
assert f"characters long after preprocessing, where the maximum length should be 10." in caplog.text
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user