feat: hard document length limit at max_chars_check (#5191)

* implement hard cut at max_chars_check

* regenerate ids

* black

* docstring

* black
This commit is contained in:
ZanSara 2023-06-23 12:34:19 +02:00 committed by GitHub
parent 36192eca72
commit 31664627eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 10 deletions

View File

@ -103,7 +103,10 @@ class PreProcessor(BasePreProcessor):
field `"page"`. Page boundaries are determined by `"\f"` character which is added
in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
`AzureConverter`.
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than max_chars_check in characters after pre-processing will raise a warning.
:param max_chars_check: the maximum length a document is expected to have. Each document that is longer than
`max_chars_check` characters after pre-processing will trigger a warning and will be split at the
`max_chars_check`-th char, regardless of any other constraint. If the resulting documents are still too long,
they'll be cut again until no fragment exceeds the maximum allowed length.
"""
if remove_substrings is None:
remove_substrings = []
@ -186,7 +189,9 @@ class PreProcessor(BasePreProcessor):
def _long_documents(self, documents: List[Document], max_chars_check=10_000):
"""
Function that tries to detect unusually long documents.
Function that tries to detect unusually long documents. When detected, such documents are
split at the `max_chars_check`-th char, regardless of any other constraint. If the resulting documents
are still too long, they'll be cut again until no fragment exceeds the maximum allowed length.
NOTE: this function is a heuristic that is in place only because a proper fix that prevents such documents from forming
would imply a complete revamp of this class, including better definitions of what the various units (word, sentence, passage) mean exactly.
@ -195,11 +200,23 @@ class PreProcessor(BasePreProcessor):
if len(document.content) > max_chars_check:
logger.warning(
"Document %s is %s characters long after preprocessing, where the maximum length should be %s. "
"Something might be wrong with the splitting, check the document affected to prevent issues at query time.",
"Something might be wrong with the splitting, check the document affected to prevent issues at "
"query time. This document will be now hard-split at %s chars recursively.",
document.id,
len(document.content),
max_chars_check,
max_chars_check,
)
fields = document.to_dict()
document.content = document.content[:max_chars_check]
fields.pop("id")
fields["content"] = fields["content"][max_chars_check:]
# recursively check if tail_document is still too long
tail_documents = self._long_documents(
documents=[Document.from_dict(fields)], max_chars_check=max_chars_check
)
documents += tail_documents
return documents
def _process_single(
self,
@ -250,7 +267,7 @@ class PreProcessor(BasePreProcessor):
id_hash_keys=id_hash_keys,
)
self._long_documents(split_documents, max_chars_check=self.max_chars_check)
split_documents = self._long_documents(split_documents, max_chars_check=self.max_chars_check)
return split_documents

View File

@ -528,13 +528,12 @@ def test_preprocessor_very_long_document(caplog):
preproc = PreProcessor(
clean_empty_lines=False, clean_header_footer=False, clean_whitespace=False, split_by=None, max_chars_check=10
)
documents = [
Document(content=f"this is a test document with more than max_char characters: {'1'*i}") for i in range(9)
]
documents = [Document(content=str(i) + (f"." * i)) for i in range(0, 30, 3)]
results = preproc.process(documents)
assert results == documents
for i in range(5):
assert f"is 6{i} characters long after preprocessing, where the maximum length should be 10." in caplog.text
assert len(results) == 19
assert any(d.content.startswith(".") for d in results)
assert any(not d.content.startswith(".") for d in results)
assert f"characters long after preprocessing, where the maximum length should be 10." in caplog.text
@pytest.mark.unit