2023-08-11 07:02:37 -04:00
|
|
|
import json
|
2022-12-15 17:19:02 -05:00
|
|
|
from functools import partial
|
2023-02-27 17:30:54 +01:00
|
|
|
|
2022-12-15 17:19:02 -05:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
from unstructured.cleaners.core import clean_prefix
|
|
|
|
from unstructured.cleaners.translate import translate_text
|
2023-06-20 11:19:55 -05:00
|
|
|
from unstructured.documents.coordinates import (
|
|
|
|
CoordinateSystem,
|
|
|
|
Orientation,
|
|
|
|
RelativeCoordinateSystem,
|
|
|
|
)
|
2023-08-09 15:32:20 -07:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
UUID,
|
|
|
|
CoordinatesMetadata,
|
|
|
|
Element,
|
2023-09-27 14:40:56 -04:00
|
|
|
ElementMetadata,
|
2023-08-09 15:32:20 -07:00
|
|
|
NoID,
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
..should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
RegexMetadata,
|
2023-08-09 15:32:20 -07:00
|
|
|
Text,
|
|
|
|
)
|
2022-06-29 14:35:19 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_text_id():
    """A `Text` element derives a stable hash id from its text."""
    element = Text(text="hello there!")
    assert element.id == "c69509590d81db2f37f9d75480c8efed"
|
|
|
|
|
|
|
|
|
2023-08-09 15:32:20 -07:00
|
|
|
def test_text_uuid():
    """A UUID element-id is a 36-character hyphenated string and serializes cleanly."""
    element = Text(text="hello there!", element_id=UUID())

    # -- canonical UUID form: 8-4-4-4-12 hex groups --
    assert len(element.id) == 36
    assert element.id.count("-") == 4
    # -- the element must remain JSON serializable; this should not raise --
    json.dumps(element.to_dict())
|
2023-08-09 15:32:20 -07:00
|
|
|
|
|
|
|
|
2022-06-29 14:35:19 -04:00
|
|
|
def test_element_defaults_to_blank_id():
    """An `Element` constructed without an id gets a `NoID` placeholder."""
    assert isinstance(Element().id, NoID)
|
2022-12-15 17:19:02 -05:00
|
|
|
|
|
|
|
|
2023-08-09 15:32:20 -07:00
|
|
|
def test_element_uuid():
    """An explicit `UUID()` element-id is preserved on the element."""
    uuid_element = Element(element_id=UUID())
    assert isinstance(uuid_element.id, UUID)
|
|
|
|
|
|
|
|
|
2022-12-15 17:19:02 -05:00
|
|
|
def test_text_element_apply_cleaners():
    """`Text.apply()` runs a cleaner over the element text in place."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(partial(clean_prefix, pattern=r"\[\d{1,2}\]"))

    assert str(element) == "A Textbook on Crocodile Habitats"
|
|
|
|
|
|
|
|
|
|
|
|
def test_text_element_apply_multiple_cleaners():
    """`Text.apply()` accepts several cleaners and applies them in order."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    # -- strip the citation prefix first, then translate the remainder to Russian --
    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    assert str(element) == "Учебник по крокодильным средам обитания"
|
|
|
|
|
|
|
|
|
|
|
|
def test_apply_raises_if_func_does_not_produce_string():
    """A cleaner that returns a non-str value is rejected with `ValueError`."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(lambda s: 1)
|
2023-06-20 11:19:55 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("coordinates", "orientation1", "orientation2", "expected_coords"),
    [
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.CARTESIAN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.SCREEN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.CARTESIAN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.SCREEN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
    ],
)
def test_convert_coordinates_to_new_system(
    coordinates,
    orientation1,
    orientation2,
    expected_coords,
):
    """Coordinates convert correctly between systems, out-of-place and in-place."""
    coord1 = CoordinateSystem(100, 200)
    coord1.orientation = orientation1
    coord2 = CoordinateSystem(1000, 2000)
    coord2.orientation = orientation2
    element = Element(coordinates=coordinates, coordinate_system=coord1)

    new_coords = element.convert_coordinates_to_new_system(coord2)

    # -- fix: this comparison was a bare expression (no `assert`), so out-of-place
    # -- conversion results were never actually verified --
    for new_coord, expected_coord in zip(new_coords, expected_coords):
        assert new_coord == pytest.approx(expected_coord)

    # -- in-place conversion updates the element's own coordinate metadata --
    element.convert_coordinates_to_new_system(coord2, in_place=True)
    for new_coord, expected_coord in zip(element.metadata.coordinates.points, expected_coords):
        assert new_coord == pytest.approx(expected_coord)
    assert element.metadata.coordinates.system == coord2
|
2023-06-20 11:19:55 -05:00
|
|
|
|
|
|
|
|
2023-07-05 11:25:11 -07:00
|
|
|
def test_convert_coordinate_to_new_system_none():
    """Converting an element that has no coordinates yields `None`."""
    element = Element(coordinates=None, coordinate_system=None)

    target_system = CoordinateSystem(100, 200)
    target_system.orientation = Orientation.SCREEN

    assert element.convert_coordinates_to_new_system(target_system) is None
|
|
|
|
|
|
|
|
|
2023-07-05 11:25:11 -07:00
|
|
|
def test_element_constructor_coordinates_all_present():
    """Supplying both points and system populates `metadata.coordinates`."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()

    element = Element(coordinates=points, coordinate_system=system)

    assert element.metadata.coordinates == CoordinatesMetadata(points=points, system=system)
|
|
|
|
|
|
|
|
|
|
|
|
def test_element_constructor_coordinates_points_absent():
    """A coordinate system without points is rejected with `ValueError`."""
    with pytest.raises(ValueError) as exc_info:
        Element(coordinate_system=RelativeCoordinateSystem())
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )
    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
|
|
|
|
|
|
def test_element_constructor_coordinates_system_absent():
    """Coordinate points without a coordinate system are rejected with `ValueError`."""
    with pytest.raises(ValueError) as exc_info:
        Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )
    assert str(exc_info.value) == expected_message
|
|
|
|
|
|
|
|
|
|
|
|
def test_coordinate_metadata_serdes():
    """`CoordinatesMetadata` round-trips through its dict representation."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    metadata = CoordinatesMetadata(points=points, system=RelativeCoordinateSystem())

    serialized = metadata.to_dict()

    assert serialized == {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }
    # -- deserializing the dict reproduces an equal metadata object --
    assert CoordinatesMetadata.from_dict(serialized) == metadata
|
|
|
|
|
|
|
|
|
|
|
|
def test_element_to_dict():
    """`Element.to_dict()` emits id, type, and nested coordinate metadata."""
    element = Element(
        element_id="awt32t1",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=RelativeCoordinateSystem(),
    )

    assert element.to_dict() == {
        "metadata": {
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
        },
        "type": None,
        "element_id": "awt32t1",
    }
|
2023-09-27 14:40:56 -04:00
|
|
|
|
|
|
|
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
..should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
def test_regex_metadata_round_trips_through_JSON():
    """metadata.regex_metadata should appear at full depth in JSON."""
    metadata = ElementMetadata(
        regex_metadata={
            "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
            "version": [
                RegexMetadata(text="current=v1.7.2", start=7, end=21),
                RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
            ],
        }
    )

    # -- serialize, deserialize, and serialize again --
    first_json = json.dumps(metadata.to_dict())
    round_tripped = ElementMetadata.from_dict(json.loads(first_json))
    second_json = json.dumps(round_tripped.to_dict())

    # -- a lossless round trip produces byte-identical JSON --
    assert second_json == first_json
|
|
|
|
|
|
|
|
|
2023-09-27 14:40:56 -04:00
|
|
|
def test_metadata_from_dict_extra_fields():
    """Unknown fields are ignored by the metadata classes, at every nesting level.

    This can be an issue when `elements_from_json` receives a schema from a newer
    version of the library than the one doing the parsing.
    """
    serialized_metadata = {
        "new_field": "hello",
        "data_source": {"new_field": "world"},
        "coordinates": {"new_field": "foo"},
    }

    round_tripped = ElementMetadata.from_dict(serialized_metadata).to_dict()

    assert "new_field" not in round_tripped
    assert "new_field" not in round_tripped["coordinates"]
    assert "new_field" not in round_tripped["data_source"]
|