fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
...should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
# pyright: reportPrivateUsage=false
|
|
|
|
|
2024-02-23 10:22:44 -08:00
|
|
|
"""Test suite for the `unstructured.chunking.title` module."""
|
2024-02-21 15:16:13 -08:00
|
|
|
|
|
|
|
from __future__ import annotations
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
...should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
|
2024-03-15 11:48:07 -07:00
|
|
|
from typing import Any, Optional
|
2024-02-23 10:22:44 -08:00
|
|
|
|
2023-09-11 16:00:14 -05:00
|
|
|
import pytest
|
|
|
|
|
2024-03-15 11:48:07 -07:00
|
|
|
from test_unstructured.unit_utils import FixtureRequest, Mock, function_mock
|
2024-03-18 12:27:39 -07:00
|
|
|
from unstructured.chunking.base import CHUNK_MULTI_PAGE_DEFAULT
|
2024-02-23 10:22:44 -08:00
|
|
|
from unstructured.chunking.title import _ByTitleChunkingOptions, chunk_by_title
|
2023-09-14 13:10:03 +03:00
|
|
|
from unstructured.documents.coordinates import CoordinateSystem
|
2023-08-29 12:04:57 -04:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
CheckBox,
|
|
|
|
CompositeElement,
|
2023-09-14 13:10:03 +03:00
|
|
|
CoordinatesMetadata,
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
...should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
Element,
|
2023-08-29 12:04:57 -04:00
|
|
|
ElementMetadata,
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
ListItem,
|
2023-08-29 12:04:57 -04:00
|
|
|
Table,
|
|
|
|
Text,
|
|
|
|
Title,
|
|
|
|
)
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.partition.html import partition_html
|
2023-08-29 12:04:57 -04:00
|
|
|
|
2024-02-23 10:22:44 -08:00
|
|
|
# ================================================================================================
|
|
|
|
# INTEGRATION-TESTS
|
|
|
|
# ================================================================================================
|
|
|
|
# These test `chunk_by_title()` as an integrated whole, calling `chunk_by_title()` and inspecting
|
|
|
|
# the outputs.
|
|
|
|
# ================================================================================================
|
|
|
|
|
2023-08-29 12:04:57 -04:00
|
|
|
|
2023-12-13 15:13:57 -08:00
|
|
|
def test_it_splits_a_large_element_into_multiple_chunks():
|
2024-02-21 15:16:13 -08:00
|
|
|
elements: list[Element] = [
|
fix: split-chunks appear out-of-order (#1824)
**Executive Summary.** Code inspection in preparation for adding the
chunk-overlap feature revealed a bug causing split-chunks to be inserted
out-of-order. For example, elements like this:
```
Text("One" + 400 chars)
Text("Two" + 400 chars)
Text("Three" + 600 chars)
Text("Four" + 400 chars)
Text("Five" + 600 chars)
```
Should produce chunks:
```
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("Four") # (400 chars)
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
```
but produced this instead:
```
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Four") # (400 chars)
```
This PR fixes that behavior that was introduced on Oct 9 this year in
commit: f98d5e65 when adding chunk splitting.
**Technical Summary**
The essential transformation of chunking is:
```
elements sections chunks
List[Element] -> List[List[Element]] -> List[CompositeElement]
```
1. The _sectioner_ (`_split_elements_by_title_and_table()`) _groups_
semantically-related elements into _sections_ (`List[Element]`), in the
best case, that would be a title (heading) and the text that follows it
(until the next title). A heading and its text is often referred to as a
_section_ in publishing parlance, hence the name.
2. The _chunker_ (`chunk_by_title()` currently) does two things:
1. first it _consolidates_ the elements of each section into a single
`ConsolidatedElement` object (a "chunk"). This includes both joining the
element text into a single string as well as consolidating the metadata
of the section elements.
2. then if necessary it _splits_ the chunk into two or more
`ConsolidatedElement` objects when the consolidated text is too long to
fit in the specified window (`max_characters`).
Chunk splitting is only required when a single element (like a big
paragraph) has text longer than the specified window. Otherwise a
section and the chunk that derives from it reflects an even element
boundary.
`chunk_by_title()` was elaborated in commit f98d5e65 to add this
"chunk-splitting" behavior.
At the time there was some notion of wanting to "split from the end
backward" such that any small remainder chunk would appear first, and
could possibly be combined with a small prior chunk. To accomplish this,
split chunks were _inserted_ at the beginning of the list instead of
_appended_ to the end.
The `chunked_elements` variable (`List[CompositeElement]`) holds the
sequence of chunks that result from the chunking operation and is the
returned value for `chunk_by_title()`. This was the list
"split-from-the-end" chunks were inserted at the beginning of and that
unfortunately produces this out-of-order behavior because the insertion
was at the beginning of this "all-chunks-in-document" list, not a
sublist just for this chunk.
Further, the "split-from-the-end" behavior can produce no benefit
because chunks are never combined, only _elements_ are combined (across
semantic boundaries into a single section when a section is small) and
sectioning occurs _prior_ to chunking.
The fix is to rework the chunk-splitting passage to a straightforward
iterative algorithm that works both when a chunk must be split and when
it doesn't. This algorithm is also very easily extended to implement
split-chunk-overlap which is coming up in an immediately following PR.
```python
# -- split chunk into CompositeElements objects maxlen or smaller --
text_len = len(text)
start = 0
remaining = text_len
while remaining > 0:
end = min(start + max_characters, text_len)
chunked_elements.append(CompositeElement(text=text[start:end], metadata=chunk_meta))
start = end - overlap
remaining = text_len - end
```
*Forensic analysis*
The out-of-order-chunks behavior was introduced in commit 4ea71683 on
10/09/2023 in the same PR in which chunk-splitting was introduced.
---------
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2023-10-20 18:37:34 -07:00
|
|
|
Title("Introduction"),
|
|
|
|
Text(
|
|
|
|
"Lorem ipsum dolor sit amet consectetur adipiscing elit. In rhoncus ipsum sed lectus"
|
2023-10-23 17:11:53 -07:00
|
|
|
" porta volutpat.",
|
fix: split-chunks appear out-of-order (#1824)
**Executive Summary.** Code inspection in preparation for adding the
chunk-overlap feature revealed a bug causing split-chunks to be inserted
out-of-order. For example, elements like this:
```
Text("One" + 400 chars)
Text("Two" + 400 chars)
Text("Three" + 600 chars)
Text("Four" + 400 chars)
Text("Five" + 600 chars)
```
Should produce chunks:
```
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("Four") # (400 chars)
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
```
but produced this instead:
```
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Four") # (400 chars)
```
This PR fixes that behavior that was introduced on Oct 9 this year in
commit: f98d5e65 when adding chunk splitting.
**Technical Summary**
The essential transformation of chunking is:
```
elements sections chunks
List[Element] -> List[List[Element]] -> List[CompositeElement]
```
1. The _sectioner_ (`_split_elements_by_title_and_table()`) _groups_
semantically-related elements into _sections_ (`List[Element]`), in the
best case, that would be a title (heading) and the text that follows it
(until the next title). A heading and its text is often referred to as a
_section_ in publishing parlance, hence the name.
2. The _chunker_ (`chunk_by_title()` currently) does two things:
1. first it _consolidates_ the elements of each section into a single
`ConsolidatedElement` object (a "chunk"). This includes both joining the
element text into a single string as well as consolidating the metadata
of the section elements.
2. then if necessary it _splits_ the chunk into two or more
`ConsolidatedElement` objects when the consolidated text is too long to
fit in the specified window (`max_characters`).
Chunk splitting is only required when a single element (like a big
paragraph) has text longer than the specified window. Otherwise a
section and the chunk that derives from it reflects an even element
boundary.
`chunk_by_title()` was elaborated in commit f98d5e65 to add this
"chunk-splitting" behavior.
At the time there was some notion of wanting to "split from the end
backward" such that any small remainder chunk would appear first, and
could possibly be combined with a small prior chunk. To accomplish this,
split chunks were _inserted_ at the beginning of the list instead of
_appended_ to the end.
The `chunked_elements` variable (`List[CompositeElement]`) holds the
sequence of chunks that result from the chunking operation and is the
returned value for `chunk_by_title()`. This was the list
"split-from-the-end" chunks were inserted at the beginning of and that
unfortunately produces this out-of-order behavior because the insertion
was at the beginning of this "all-chunks-in-document" list, not a
sublist just for this chunk.
Further, the "split-from-the-end" behavior can produce no benefit
because chunks are never combined, only _elements_ are combined (across
semantic boundaries into a single section when a section is small) and
sectioning occurs _prior_ to chunking.
The fix is to rework the chunk-splitting passage to a straightforward
iterative algorithm that works both when a chunk must be split and when
it doesn't. This algorithm is also very easily extended to implement
split-chunk-overlap which is coming up in an immediately following PR.
```python
# -- split chunk into CompositeElements objects maxlen or smaller --
text_len = len(text)
start = 0
remaining = text_len
while remaining > 0:
end = min(start + max_characters, text_len)
chunked_elements.append(CompositeElement(text=text[start:end], metadata=chunk_meta))
start = end - overlap
remaining = text_len - end
```
*Forensic analysis*
The out-of-order-chunks behavior was introduced in commit 4ea71683 on
10/09/2023 in the same PR in which chunk-splitting was introduced.
---------
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2023-10-20 18:37:34 -07:00
|
|
|
),
|
|
|
|
]
|
|
|
|
|
fix: chunk_by_title() interface is rude (#1844)
### `chunk_by_title()` interface is "rude"
**Executive Summary.** Perhaps the most commonly specified option for
`chunk_by_title()` is `max_characters` (default: 500), which specifies
the chunk window size.
When a user specifies this value, they get an error message:
```python
>>> chunks = chunk_by_title(elements, max_characters=100)
ValueError: Invalid values for combine_text_under_n_chars, new_after_n_chars, and/or max_characters.
```
A few of the things that might reasonably pass through a user's mind at
such a moment are:
* "Is `100` not a valid value for `max_characters`? Why would that be?"
* "I didn't specify a value for `combine_text_under_n_chars` or
`new_after_n_chars`, in fact I don't know what they are because I
haven't studied the documentation and would prefer not to; I just want
smaller chunks! How could I supply an invalid value when I haven't
supplied any value at all for these?"
* "Which of these values is the problem? Why are you making me figure
that out for myself? I'm sure the code knows which one is not valid, why
doesn't it share that information with me? I'm busy here!"
In this particular case, the problem is that
`combine_text_under_n_chars` (defaults to 500) is greater than
`max_characters`, which means it would never take effect (which is
actually not a problem in itself).
To fix this, once figuring out that was the problem, probably after
opening an issue and maybe reading the source code, the user would need
to specify:
```python
>>> chunks = chunk_by_title(
... elements, max_characters=100, combine_text_under_n_chars=100
... )
```
This and other stressful user scenarios can be remedied by:
* Using "active" defaults for the `combine_text_under_n_chars` and
`new_after_n_chars` options.
* Providing a specific error message for each way a constraint may be
violated, such that direction to remedy the problem is immediately clear
to the user.
An *active default* is for example:
* Make the default for `combine_text_under_n_chars: int | None = None`
such that the code can detect when it has not been specified.
* When not specified, set its value to `max_characters`, the same as its
current (static) default.
This particular change would avoid the behavior in the motivating
example above.
Another alternative for this argument is simply:
```python
combine_text_under_n_chars = min(max_characters, combine_text_under_n_chars)
```
### Fix
1. Add constraint-specific error messages.
2. Use "active" defaults for `combine_text_under_n_chars` and
`new_after_n_chars`.
3. Improve docstring to describe active defaults, and explain other
argument behaviors, in particular identifying suppression options like
`combine_text_under_n_chars = 0` to disable chunk combining.
2023-10-24 16:22:38 -07:00
|
|
|
chunks = chunk_by_title(elements, max_characters=50)
|
fix: split-chunks appear out-of-order (#1824)
**Executive Summary.** Code inspection in preparation for adding the
chunk-overlap feature revealed a bug causing split-chunks to be inserted
out-of-order. For example, elements like this:
```
Text("One" + 400 chars)
Text("Two" + 400 chars)
Text("Three" + 600 chars)
Text("Four" + 400 chars)
Text("Five" + 600 chars)
```
Should produce chunks:
```
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("Four") # (400 chars)
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
```
but produced this instead:
```
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Four") # (400 chars)
```
This PR fixes that behavior that was introduced on Oct 9 this year in
commit: f98d5e65 when adding chunk splitting.
**Technical Summary**
The essential transformation of chunking is:
```
elements sections chunks
List[Element] -> List[List[Element]] -> List[CompositeElement]
```
1. The _sectioner_ (`_split_elements_by_title_and_table()`) _groups_
semantically-related elements into _sections_ (`List[Element]`), in the
best case, that would be a title (heading) and the text that follows it
(until the next title). A heading and its text is often referred to as a
_section_ in publishing parlance, hence the name.
2. The _chunker_ (`chunk_by_title()` currently) does two things:
1. first it _consolidates_ the elements of each section into a single
`ConsolidatedElement` object (a "chunk"). This includes both joining the
element text into a single string as well as consolidating the metadata
of the section elements.
2. then if necessary it _splits_ the chunk into two or more
`ConsolidatedElement` objects when the consolidated text is too long to
fit in the specified window (`max_characters`).
Chunk splitting is only required when a single element (like a big
paragraph) has text longer than the specified window. Otherwise a
section and the chunk that derives from it reflects an even element
boundary.
`chunk_by_title()` was elaborated in commit f98d5e65 to add this
"chunk-splitting" behavior.
At the time there was some notion of wanting to "split from the end
backward" such that any small remainder chunk would appear first, and
could possibly be combined with a small prior chunk. To accomplish this,
split chunks were _inserted_ at the beginning of the list instead of
_appended_ to the end.
The `chunked_elements` variable (`List[CompositeElement]`) holds the
sequence of chunks that result from the chunking operation and is the
returned value for `chunk_by_title()`. This was the list
"split-from-the-end" chunks were inserted at the beginning of and that
unfortunately produces this out-of-order behavior because the insertion
was at the beginning of this "all-chunks-in-document" list, not a
sublist just for this chunk.
Further, the "split-from-the-end" behavior can produce no benefit
because chunks are never combined, only _elements_ are combined (across
semantic boundaries into a single section when a section is small) and
sectioning occurs _prior_ to chunking.
The fix is to rework the chunk-splitting passage to a straighforward
iterative algorithm that works both when a chunk must be split and when
it doesn't. This algorithm is also very easily extended to implement
split-chunk-overlap which is coming up in an immediately following PR.
```python
# -- split chunk into CompositeElements objects maxlen or smaller --
text_len = len(text)
start = 0
remaining = text_len
while remaining > 0:
end = min(start + max_characters, text_len)
chunked_elements.append(CompositeElement(text=text[start:end], metadata=chunk_meta))
start = end - overlap
remaining = text_len - end
```
*Forensic analysis*
The out-of-order-chunks behavior was introduced in commit 4ea71683 on
10/09/2023 in the same PR in which chunk-splitting was introduced.
---------
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2023-10-20 18:37:34 -07:00
|
|
|
|
|
|
|
assert chunks == [
|
|
|
|
CompositeElement("Introduction"),
|
rfctr(chunking): split oversized chunks on word boundary (#2297)
The text of an oversized chunk is split on an arbitrary character
boundary (mid-word). The `chunk_by_character()` strategy introduces the
idea of allowing the user to specify a separator to use for
chunk-splitting. For `langchain` this is typically "\n\n", "\n", or " ";
blank-line, newline, or word boundaries respectively.
Even if the user is allowed to specify a separator, we must provide
fall-back for when a chunk contains no such character. This can be done
incrementally, like blank-line is preferable to newline, newline is
preferable to word, and word is preferable to arbitrary character.
Further, there is nothing particular to `chunk_by_character()` in
providing such a fall-back text-splitting strategy. It would be
preferable for all strategies to split oversized chunks on even-word
boundaries for example.
Note that while a "blank-line" ("\n\n") may be common in plain text, it
is unlikely to appear in the text of an element because it would have
been interpreted as an element boundary during partitioning.
Add _TextSplitter with basic separator preferences and fall-back and
apply it to chunk-splitting for all strategies. The `by_character`
chunking strategy may enhance this behavior by adding the option for a
user to specify a particular separator suited to their use case.
2023-12-20 21:45:36 -08:00
|
|
|
CompositeElement("Lorem ipsum dolor sit amet consectetur adipiscing"),
|
fix: split-chunks appear out-of-order (#1824)
**Executive Summary.** Code inspection in preparation for adding the
chunk-overlap feature revealed a bug causing split-chunks to be inserted
out-of-order. For example, elements like this:
```
Text("One" + 400 chars)
Text("Two" + 400 chars)
Text("Three" + 600 chars)
Text("Four" + 400 chars)
Text("Five" + 600 chars)
```
Should produce chunks:
```
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("Four") # (400 chars)
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
```
but produced this instead:
```
CompositeElement("Five ...") # (500 chars)
CompositeElement("rest of Five ...") # (100 chars)
CompositeElement("Three ...") # (500 chars)
CompositeElement("rest of Three ...") # (100 chars)
CompositeElement("One ...") # (400 chars)
CompositeElement("Two ...") # (400 chars)
CompositeElement("Four") # (400 chars)
```
This PR fixes that behavior that was introduced on Oct 9 this year in
commit: f98d5e65 when adding chunk splitting.
**Technical Summary**
The essential transformation of chunking is:
```
elements sections chunks
List[Element] -> List[List[Element]] -> List[CompositeElement]
```
1. The _sectioner_ (`_split_elements_by_title_and_table()`) _groups_
semantically-related elements into _sections_ (`List[Element]`), in the
best case, that would be a title (heading) and the text that follows it
(until the next title). A heading and its text is often referred to as a
_section_ in publishing parlance, hence the name.
2. The _chunker_ (`chunk_by_title()` currently) does two things:
1. first it _consolidates_ the elements of each section into a single
`ConsolidatedElement` object (a "chunk"). This includes both joining the
element text into a single string as well as consolidating the metadata
of the section elements.
2. then if necessary it _splits_ the chunk into two or more
`ConsolidatedElement` objects when the consolidated text is too long to
fit in the specified window (`max_characters`).
Chunk splitting is only required when a single element (like a big
paragraph) has text longer than the specified window. Otherwise a
section and the chunk that derives from it reflects an even element
boundary.
`chunk_by_title()` was elaborated in commit f98d5e65 to add this
"chunk-splitting" behavior.
At the time there was some notion of wanting to "split from the end
backward" such that any small remainder chunk would appear first, and
could possibly be combined with a small prior chunk. To accomplish this,
split chunks were _inserted_ at the beginning of the list instead of
_appended_ to the end.
The `chunked_elements` variable (`List[CompositeElement]`) holds the
sequence of chunks that result from the chunking operation and is the
returned value for `chunk_by_title()`. This was the list
"split-from-the-end" chunks were inserted at the beginning of and that
unfortunately produces this out-of-order behavior because the insertion
was at the beginning of this "all-chunks-in-document" list, not a
sublist just for this chunk.
Further, the "split-from-the-end" behavior can produce no benefit
because chunks are never combined, only _elements_ are combined (across
semantic boundaries into a single section when a section is small) and
sectioning occurs _prior_ to chunking.
The fix is to rework the chunk-splitting passage to a straighforward
iterative algorithm that works both when a chunk must be split and when
it doesn't. This algorithm is also very easily extended to implement
split-chunk-overlap which is coming up in an immediately following PR.
```python
# -- split chunk into CompositeElements objects maxlen or smaller --
text_len = len(text)
start = 0
remaining = text_len
while remaining > 0:
end = min(start + max_characters, text_len)
chunked_elements.append(CompositeElement(text=text[start:end], metadata=chunk_meta))
start = end - overlap
remaining = text_len - end
```
*Forensic analysis*
The out-of-order-chunks behavior was introduced in commit 4ea71683 on
10/09/2023 in the same PR in which chunk-splitting was introduced.
---------
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
2023-10-20 18:37:34 -07:00
|
|
|
CompositeElement("elit. In rhoncus ipsum sed lectus porta volutpat."),
|
|
|
|
]
|
|
|
|
|
|
|
|
|
2024-03-18 12:27:39 -07:00
|
|
|
def test_it_splits_elements_by_title_and_table():
|
2024-02-21 15:16:13 -08:00
|
|
|
elements: list[Element] = [
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("A Great Day"),
|
|
|
|
Text("Today is a great day."),
|
|
|
|
Text("It is sunny outside."),
|
2023-11-16 08:22:50 -08:00
|
|
|
Table("Heading\nCell text"),
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("An Okay Day"),
|
|
|
|
Text("Today is an okay day."),
|
|
|
|
Text("It is rainy outside."),
|
|
|
|
Title("A Bad Day"),
|
|
|
|
Text("Today is a bad day."),
|
|
|
|
Text("It is storming outside."),
|
|
|
|
CheckBox(),
|
|
|
|
]
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
|
2024-03-18 12:27:39 -07:00
|
|
|
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=True)
|
2023-08-29 12:04:57 -04:00
|
|
|
|
2024-03-18 12:27:39 -07:00
|
|
|
assert len(chunks) == 4
|
|
|
|
# --
|
|
|
|
chunk = chunks[0]
|
|
|
|
assert isinstance(chunk, CompositeElement)
|
|
|
|
assert chunk.metadata.orig_elements == [
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
Title("A Great Day"),
|
|
|
|
Text("Today is a great day."),
|
|
|
|
Text("It is sunny outside."),
|
|
|
|
]
|
|
|
|
# --
|
2024-03-18 12:27:39 -07:00
|
|
|
chunk = chunks[1]
|
|
|
|
assert isinstance(chunk, Table)
|
|
|
|
assert chunk.metadata.orig_elements == [Table("Heading\nCell text")]
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
# ==
|
2024-03-18 12:27:39 -07:00
|
|
|
chunk = chunks[2]
|
|
|
|
assert isinstance(chunk, CompositeElement)
|
|
|
|
assert chunk.metadata.orig_elements == [
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
Title("An Okay Day"),
|
|
|
|
Text("Today is an okay day."),
|
|
|
|
Text("It is rainy outside."),
|
|
|
|
]
|
|
|
|
# --
|
2024-03-18 12:27:39 -07:00
|
|
|
chunk = chunks[3]
|
|
|
|
assert isinstance(chunk, CompositeElement)
|
|
|
|
assert chunk.metadata.orig_elements == [
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
Title("A Bad Day"),
|
|
|
|
Text("Today is a bad day."),
|
|
|
|
Text("It is storming outside."),
|
2023-12-13 12:22:25 -08:00
|
|
|
CheckBox(),
|
2023-08-29 12:04:57 -04:00
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_chunk_by_title():
|
2024-02-21 15:16:13 -08:00
|
|
|
elements: list[Element] = [
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
|
|
|
|
Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
|
|
|
|
Text("It is sunny outside."),
|
2023-11-16 08:22:50 -08:00
|
|
|
Table("Heading\nCell text"),
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("An Okay Day"),
|
|
|
|
Text("Today is an okay day."),
|
|
|
|
Text("It is rainy outside."),
|
|
|
|
Title("A Bad Day"),
|
2024-09-24 10:33:25 -07:00
|
|
|
Text("Today is a bad day."),
|
2023-08-29 12:04:57 -04:00
|
|
|
Text("It is storming outside."),
|
|
|
|
CheckBox(),
|
|
|
|
]
|
Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static, meaning only
predefined fields can appear in the metadata. We would like the
flexibility for end-users, at their own discretion, to define and use
additional metadata fields that make sense for their particular
use-case.
### Concepts
A key concept for dynamic metadata is _known field_. A known-field is
one of those explicitly defined on `ElementMetadata`. Each of these has
a type and can be specified when _constructing_ a new `ElementMetadata`
instance. This is in contrast to an _end-user defined_ (or _ad-hoc_)
metadata field, one not known at "compile" time and added at the
discretion of an end-user to suit the purposes of their application.
An ad-hoc field can only be added by _assignment_ on an already
constructed instance.
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but
entails the proviso that if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
those. We could consider using a default consolidation strategy like
`FIRST` or possibly allow a user to register a strategy (although that
gets hairy in non-private and multiple-memory-space situations.)
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original (which only loaded known-fields), we'll rehydrate anything
that we find there.
- No real type-safety is possible on ad-hoc fields but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use-case for the second one).
2023-11-15 13:22:15 -08:00
|
|
|
|
2024-03-18 12:27:39 -07:00
|
|
|
chunks = chunk_by_title(elements, combine_text_under_n_chars=0, include_orig_elements=False)
|
2023-08-29 12:04:57 -04:00
|
|
|
|
|
|
|
assert chunks == [
|
|
|
|
CompositeElement(
|
|
|
|
"A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
|
|
|
|
),
|
2023-11-16 08:22:50 -08:00
|
|
|
Table("Heading\nCell text"),
|
2023-08-29 12:04:57 -04:00
|
|
|
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
|
|
|
|
CompositeElement(
|
|
|
|
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
|
|
|
|
),
|
|
|
|
]
|
|
|
|
assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
|
|
|
|
|
|
|
|
|
|
|
|
def test_chunk_by_title_separates_by_page_number():
|
2024-02-21 15:16:13 -08:00
|
|
|
elements: list[Element] = [
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
|
|
|
|
Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
|
|
|
|
Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
|
2023-11-16 08:22:50 -08:00
|
|
|
Table("Heading\nCell text"),
|
2023-08-29 12:04:57 -04:00
|
|
|
Title("An Okay Day"),
|
|
|
|
Text("Today is an okay day."),
|
|
|
|
Text("It is rainy outside."),
|
|
|
|
Title("A Bad Day"),
|
2024-09-24 10:33:25 -07:00
|
|
|
Text("Today is a bad day."),
|
2023-08-29 12:04:57 -04:00
|
|
|
Text("It is storming outside."),
|
|
|
|
CheckBox(),
|
|
|
|
]
|
2023-10-03 09:40:34 -07:00
|
|
|
chunks = chunk_by_title(elements, multipage_sections=False, combine_text_under_n_chars=0)
|
2023-08-29 12:04:57 -04:00
|
|
|
|
|
|
|
assert chunks == [
|
|
|
|
CompositeElement(
|
|
|
|
"A Great Day",
|
|
|
|
),
|
|
|
|
CompositeElement(
|
|
|
|
"Today is a great day.\n\nIt is sunny outside.",
|
|
|
|
),
|
2023-11-16 08:22:50 -08:00
|
|
|
Table("Heading\nCell text"),
|
2023-08-29 12:04:57 -04:00
|
|
|
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
|
|
|
|
CompositeElement(
|
|
|
|
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
|
|
|
|
),
|
|
|
|
]
|
|
|
|
|
|
|
|
|
2024-04-30 11:20:26 -04:00
|
|
|
def test_chuck_by_title_respects_multipage():
    """With `multipage_sections=True`, a page-number change does not start a new chunk."""
    # NOTE(review): "chuck" in the test name looks like a typo for "chunk"; the identifier is
    # left unchanged here to avoid altering the test id.
    elements: list[Element] = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)

    # -- the title on page 1 is combined with its body text on page 2 --
    expected = [
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
        Table("Heading\nCell text"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
    ]
    assert chunks == expected
|
|
|
|
|
|
|
|
|
2023-08-29 12:04:57 -04:00
|
|
|
def test_chunk_by_title_groups_across_pages():
    """Elements on different pages are grouped into one chunk when multipage is allowed."""
    # NOTE(review): fixture and expectations appear identical to
    # `test_chuck_by_title_respects_multipage` above — possible duplicate, TODO confirm.
    elements: list[Element] = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("Heading\nCell text"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)

    expected = [
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
        Table("Heading\nCell text"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement("A Bad Day\n\nToday is a bad day.\n\nIt is storming outside."),
    ]
    assert chunks == expected
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_html():
    """`chunking_strategy="by_title"` on partition_html() equals partitioning then chunking."""
    filename = "example-docs/example-10k-1p.html"

    pre_chunked = partition_html(filename, chunking_strategy="by_title")
    unchunked = partition_html(filename)

    # -- chunking changed the elements, and the one-step result matches the two-step result --
    assert pre_chunked != unchunked
    assert pre_chunked == chunk_by_title(unchunked)
|
|
|
|
|
|
|
|
|
2023-10-09 12:42:36 -07:00
|
|
|
def test_add_chunking_strategy_respects_max_characters():
    """Chunking keyword arguments are passed through from partition_html() to the chunker."""
    filename = "example-docs/example-10k-1p.html"
    chunking_kwargs = {
        "combine_text_under_n_chars": 0,
        "new_after_n_chars": 50,
        "max_characters": 100,
    }

    chunk_elements = partition_html(filename, chunking_strategy="by_title", **chunking_kwargs)
    elements = partition_html(filename)
    chunks = chunk_by_title(elements, **chunking_kwargs)

    # -- every chunk, whichever way it was produced, respects the 100-char hard maximum --
    for chunk in [*chunks, *chunk_elements]:
        assert isinstance(chunk, Text)
        assert len(chunk.text) <= 100

    assert chunk_elements != elements
    assert chunk_elements == chunks
|
|
|
|
|
|
|
|
|
2023-10-04 15:14:21 -07:00
|
|
|
def test_chunk_by_title_drops_detection_class_prob():
    """`.metadata.detection_class_prob` is not forwarded to the chunks."""
    # -- (element-class, text, detection_class_prob) --
    specs = [
        (Title, "A Great Day", 0.5),
        (Text, "Today is a great day.", 0.62),
        (Text, "It is sunny outside.", 0.73),
        (Title, "An Okay Day", 0.84),
        (Text, "Today is an okay day.", 0.95),
    ]
    elements: list[Element] = [
        ElementCls(text, metadata=ElementMetadata(detection_class_prob=prob))
        for ElementCls, text, prob in specs
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    assert str(chunks[0]) == str(
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside.")
    )
    assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
|
|
|
|
|
|
|
|
|
2023-09-14 13:10:03 +03:00
|
|
|
def test_chunk_by_title_drops_extra_metadata():
    """Metadata fields like coordinates that cannot be consolidated do not reach the chunk."""
    # -- (element-class, text, bounding-box points, coordinate-system width/height) --
    coordinate_specs = [
        (Title, "A Great Day", ((0.1, 0.1), (0.2, 0.1), (0.1, 0.2), (0.2, 0.2)), 0.1),
        (Text, "Today is a great day.", ((0.2, 0.2), (0.3, 0.2), (0.2, 0.3), (0.3, 0.3)), 0.2),
        (Text, "It is sunny outside.", ((0.3, 0.3), (0.4, 0.3), (0.3, 0.4), (0.4, 0.4)), 0.3),
        (Title, "An Okay Day", ((0.3, 0.3), (0.4, 0.3), (0.3, 0.4), (0.4, 0.4)), 0.3),
        (Text, "Today is an okay day.", ((0.4, 0.4), (0.5, 0.4), (0.4, 0.5), (0.5, 0.5)), 0.4),
    ]
    elements: list[Element] = [
        ElementCls(
            text,
            metadata=ElementMetadata(
                coordinates=CoordinatesMetadata(
                    points=points,
                    system=CoordinateSystem(width=dimension, height=dimension),
                )
            ),
        )
        for ElementCls, text, points, dimension in coordinate_specs
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    assert str(chunks[0]) == str(
        CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside.")
    )
    assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
|
fix: sectioner does not consider separator length (#1858)
### sectioner-does-not-consider-separator-length
**Executive Summary.** A primary responsibility of the sectioner is to
minimize the number of chunks that need to be split mid-text. It does
this by computing text-length of the section being formed and
"finishing" the section when adding another element would extend its
text beyond the window size.
When element-text is consolidated into a chunk, the text of each element
is joined, separated by a "blank-line" (`"\n\n"`). The sectioner does
not currently consider the added length of separators (2-chars each) and
so forms sections that need to be split mid-text when chunked.
Chunk-splitting should only be necessary when the text of a single
element is longer than the chunking window.
**Example**
```python
elements: List[Element] = [
Title("Chunking Priorities"), # 19 chars
ListItem("Divide text into manageable chunks"), # 34 chars
ListItem("Preserve semantic boundaries"), # 28 chars
ListItem("Minimize mid-text chunk-splitting"), # 33 chars
] # 114 chars total but 120 chars with separators
chunks = chunk_by_title(elements, max_characters=115)
```
Want:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
),
CompositeElement("Minimize mid-text chunk-splitting"),
]
```
Got:
```python
[
CompositeElement(
"Chunking Priorities"
"\n\nDivide text into manageable chunks"
"\n\nPreserve semantic boundaries"
"\n\nMinimize mid-text chunk-spli"),
)
CompositeElement("tting")
```
### Technical Summary
Because the sectioner does not consider separator (`"\n\n"`) length when
it computes the space remaining in the section, it over-populates the
section and when the chunker concatenates the element text (each
separated by the separator) the text exceeds the window length and the
chunk must be split mid-text, even though there was an even element
boundary it could have been split on.
### Fix
Consider separator length in the space-remaining computation.
The solution here extracts both the `section.text_length` and
`section.space_remaining` computations to a `_TextSectionBuilder` object
which removes the need for the sectioner
(`_split_elements_by_title_and_table()`) to deal with primitives
(List[Element], running text length, separator length, etc.) and allows
it to focus on the rules of when to start a new section.
This solution may seem like overkill at the moment and indeed it would
be except it forms the foundation for adding section-level chunk
combination (fix: dissociated title elements) in the next PR. The
objects introduced here will gain several additional responsibilities in
the next few chunking PRs in the pipeline and will earn their place.
2023-10-26 14:34:15 -07:00
|
|
|
|
|
|
|
|
2023-12-13 15:13:57 -08:00
|
|
|
def test_it_considers_separator_length_when_pre_chunking():
    """PreChunker includes length of separators when computing remaining space."""
    elements: list[Element] = [
        Title("Chunking Priorities"),  # 19 chars
        ListItem("Divide text into manageable chunks"),  # 34 chars
        ListItem("Preserve semantic boundaries"),  # 28 chars
        ListItem("Minimize mid-text chunk-splitting"),  # 33 chars
    ]  # 114 chars total but 120 chars with separators

    chunks = chunk_by_title(elements, max_characters=115)

    # -- the first three elements plus two "\n\n" separators total 85 chars; adding the fourth
    # -- (2 + 33 = 35 more) would reach 120 and overflow the 115-char window, so the fourth
    # -- element starts a new chunk instead of being split mid-text --
    expected = [
        CompositeElement(
            "Chunking Priorities"
            "\n\nDivide text into manageable chunks"
            "\n\nPreserve semantic boundaries",
        ),
        CompositeElement("Minimize mid-text chunk-splitting"),
    ]
    assert chunks == expected
|
2024-02-23 10:22:44 -08:00
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
# UNIT-TESTS
|
|
|
|
# ================================================================================================
|
|
|
|
# These test individual components in isolation so can exercise all edge cases while still
|
|
|
|
# performing well.
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
2024-03-15 11:48:07 -07:00
|
|
|
class Describe_chunk_by_title:
    """Unit-test suite for `unstructured.chunking.title.chunk_by_title()` function."""

    @pytest.mark.parametrize(
        ("kwargs", "expected_value"),
        [
            ({"include_orig_elements": True}, True),
            ({"include_orig_elements": False}, False),
            ({"include_orig_elements": None}, True),
            ({}, True),
        ],
    )
    def it_supports_the_include_orig_elements_option(
        self, kwargs: dict[str, Any], expected_value: bool, _chunk_by_title_: Mock
    ):
        # -- this call would raise if `chunk_by_title()` did not accept an
        # -- `include_orig_elements` parameter --
        chunk_by_title([], **kwargs)

        # -- the options object is the second positional arg passed to `_chunk_by_title()` --
        opts = _chunk_by_title_.call_args.args[1]
        assert opts.include_orig_elements is expected_value

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _chunk_by_title_(self, request: FixtureRequest):
        return function_mock(request, "unstructured.chunking.title._chunk_by_title")
|
|
|
|
|
|
|
|
|
2024-02-23 10:22:44 -08:00
|
|
|
class Describe_ByTitleChunkingOptions:
|
|
|
|
"""Unit-test suite for `unstructured.chunking.title._ByTitleChunkingOptions` objects."""
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("n_chars", [-1, -42])
|
|
|
|
def it_rejects_combine_text_under_n_chars_for_n_less_than_zero(self, n_chars: int):
|
|
|
|
with pytest.raises(
|
|
|
|
ValueError,
|
|
|
|
match=f"'combine_text_under_n_chars' argument must be >= 0, got {n_chars}",
|
|
|
|
):
|
|
|
|
_ByTitleChunkingOptions.new(combine_text_under_n_chars=n_chars)
|
|
|
|
|
|
|
|
def it_accepts_0_for_combine_text_under_n_chars_to_disable_chunk_combining(self):
|
|
|
|
"""Specifying `combine_text_under_n_chars=0` is how a caller disables chunk-combining."""
|
|
|
|
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=0)
|
|
|
|
assert opts.combine_text_under_n_chars == 0
|
|
|
|
|
|
|
|
def it_does_not_complain_when_specifying_combine_text_under_n_chars_by_itself(self):
|
|
|
|
"""Caller can specify `combine_text_under_n_chars` arg without specifying other options."""
|
|
|
|
try:
|
|
|
|
opts = _ByTitleChunkingOptions(combine_text_under_n_chars=50)
|
|
|
|
except ValueError:
|
|
|
|
pytest.fail("did not accept `combine_text_under_n_chars` as option by itself")
|
|
|
|
|
|
|
|
assert opts.combine_text_under_n_chars == 50
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
("combine_text_under_n_chars", "max_characters", "expected_hard_max"),
|
|
|
|
[(600, None, 500), (600, 450, 450)],
|
|
|
|
)
|
|
|
|
def it_rejects_combine_text_under_n_chars_greater_than_maxchars(
|
|
|
|
self, combine_text_under_n_chars: int, max_characters: Optional[int], expected_hard_max: int
|
|
|
|
):
|
|
|
|
"""`combine_text_under_n_chars` > `max_characters` can produce behavior confusing to users.
|
|
|
|
|
|
|
|
The behavior is no different from `combine_text_under_n_chars == max_characters`, but if
|
|
|
|
`max_characters` is left to default (500) and `combine_text_under_n_chars` is set to a
|
|
|
|
larger number like 1500 then it can look like chunk-combining isn't working.
|
|
|
|
"""
|
|
|
|
with pytest.raises(
|
|
|
|
ValueError,
|
|
|
|
match=(
|
|
|
|
"'combine_text_under_n_chars' argument must not exceed `max_characters` value,"
|
|
|
|
f" got {combine_text_under_n_chars} > {expected_hard_max}"
|
|
|
|
),
|
|
|
|
):
|
|
|
|
_ByTitleChunkingOptions.new(
|
|
|
|
max_characters=max_characters, combine_text_under_n_chars=combine_text_under_n_chars
|
|
|
|
)
|
|
|
|
|
|
|
|
def it_does_not_complain_when_specifying_new_after_n_chars_by_itself(self):
|
2024-03-19 12:48:23 -05:00
|
|
|
"""Caller can specify `new_after_n_chars` arg without specifying any other options."""
|
2024-02-23 10:22:44 -08:00
|
|
|
try:
|
2024-03-19 12:48:23 -05:00
|
|
|
opts = _ByTitleChunkingOptions.new(new_after_n_chars=200)
|
2024-02-23 10:22:44 -08:00
|
|
|
except ValueError:
|
|
|
|
pytest.fail("did not accept `new_after_n_chars` as option by itself")
|
|
|
|
|
|
|
|
assert opts.soft_max == 200
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
("multipage_sections", "expected_value"),
|
|
|
|
[(True, True), (False, False), (None, CHUNK_MULTI_PAGE_DEFAULT)],
|
|
|
|
)
|
|
|
|
def it_knows_whether_to_break_chunks_on_page_boundaries(
|
|
|
|
self, multipage_sections: bool, expected_value: bool
|
|
|
|
):
|
|
|
|
opts = _ByTitleChunkingOptions(multipage_sections=multipage_sections)
|
|
|
|
assert opts.multipage_sections is expected_value
|