2023-10-12 20:28:46 -05:00
|
|
|
from dataclasses import dataclass
|
|
|
|
from unittest import mock
|
|
|
|
|
2023-08-10 16:28:57 -07:00
|
|
|
import pytest
|
2023-08-24 17:46:19 -07:00
|
|
|
from PIL import Image
|
|
|
|
from unstructured_inference.inference import layout
|
2023-10-30 13:13:29 -07:00
|
|
|
from unstructured_inference.inference.elements import TextRegion
|
2023-10-12 20:28:46 -05:00
|
|
|
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
2023-01-19 09:29:28 -05:00
|
|
|
|
2023-07-05 11:25:11 -07:00
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
2023-01-20 08:55:11 -05:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
CheckBox,
|
2023-10-30 13:13:29 -07:00
|
|
|
CoordinatesMetadata,
|
Feat: Create a naive hierarchy for elements (#1268)
## **Summary**
By adding hierarchy to unstructured elements, users will have more
information for implementing vector db/LLM chunking strategies. For
example, text elements could be queried by their preceding title
element. The hierarchy is implemented by a parent_id tag in the
element's metadata.
### Features
- Introduces a parent_id to ElementMetadata (The id of the parent
element, not a pointer)
- Creates a rule set for assigning hierarchies. Sensible default is
assigned, with an optional override parameter
- Sets element parent ids if there isn't an existing parent id or
matches the ruleset
### How it works
Hierarchies are assigned via a parent id field in element metadata.
Elements are read sequentially and evaluated against a ruleset. For
example take the following elements:
1. Title, "This is the Title"
2. Text, "this is the text"
And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of
2 will be the id of 1. The algorithm for determining this is more
complex and resolves several edge cases, so please read the code for
further details.
### Schema Changes
```
@dataclass
class ElementMetadata:
coordinates: Optional[CoordinatesMetadata] = None
data_source: Optional[DataSourceMetadata] = None
filename: Optional[str] = None
file_directory: Optional[str] = None
last_modified: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
+ parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
+ category_depth: Optional[int] = None
...
```
### Testing
```
from unstructured.partition.auto import partition
from typing import List
elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto")
for element in elements:
print(
f"Category: {getattr(element, 'category', '')}\n"\
f"Text: {getattr(element, 'text', '')}\n"
f"ID: {element.id}\n" \
f"Parent ID: {element.metadata.parent_id}\n"\
f"Depth: {element.metadata.category_depth}\n" \
)
```
### Additional Notes
Implementing this feature revealed a possibly undesired side-effect in
how element metadata are processed. In
`unstructured/partition/common.py` the `_add_element_metadata` is
invoked as part of the `add_metadata_with_filetype` decorator for
filetype partitioning. This method is intended to add additional
information to the metadata generated with the element including
filename and filetype, however the existing metadata is merged into a
newly created metadata object rather than the other way around. Because
of the way it's structured, new metadata fields can easily be forgotten
and pose debugging challenges to developers. This likely warrants a new
issue.
I'm guessing that the implementation is done this way to avoid issues
with deserializing elements, but could be wrong.
---------
Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
|
|
|
ElementMetadata,
|
2023-01-20 08:55:11 -05:00
|
|
|
FigureCaption,
|
Feat: Create a naive hierarchy for elements (#1268)
## **Summary**
By adding hierarchy to unstructured elements, users will have more
information for implementing vector db/LLM chunking strategies. For
example, text elements could be queried by their preceding title
element. The hierarchy is implemented by a parent_id tag in the
element's metadata.
### Features
- Introduces a parent_id to ElementMetadata (The id of the parent
element, not a pointer)
- Creates a rule set for assigning hierarchies. Sensible default is
assigned, with an optional override parameter
- Sets element parent ids if there isn't an existing parent id or
matches the ruleset
### How it works
Hierarchies are assigned via a parent id field in element metadata.
Elements are read sequentially and evaluated against a ruleset. For
example take the following elements:
1. Title, "This is the Title"
2. Text, "this is the text"
And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of
2 will be the id of 1. The algorithm for determining this is more
complex and resolves several edge cases, so please read the code for
further details.
### Schema Changes
```
@dataclass
class ElementMetadata:
coordinates: Optional[CoordinatesMetadata] = None
data_source: Optional[DataSourceMetadata] = None
filename: Optional[str] = None
file_directory: Optional[str] = None
last_modified: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
+ parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
+ category_depth: Optional[int] = None
...
```
### Testing
```
from unstructured.partition.auto import partition
from typing import List
elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto")
for element in elements:
print(
f"Category: {getattr(element, 'category', '')}\n"\
f"Text: {getattr(element, 'text', '')}\n"
f"ID: {element.id}\n" \
f"Parent ID: {element.metadata.parent_id}\n"\
f"Depth: {element.metadata.category_depth}\n" \
)
```
### Additional Notes
Implementing this feature revealed a possibly undesired side-effect in
how element metadata are processed. In
`unstructured/partition/common.py` the `_add_element_metadata` is
invoked as part of the `add_metadata_with_filetype` decorator for
filetype partitioning. This method is intended to add additional
information to the metadata generated with the element including
filename and filetype, however the existing metadata is merged into a
newly created metadata object rather than the other way around. Because
of the way it's structured, new metadata fields can easily be forgotten
and pose debugging challenges to developers. This likely warrants a new
issue.
I'm guessing that the implementation is done this way to avoid issues
with deserializing elements, but could be wrong.
---------
Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
|
|
|
Header,
|
2023-01-20 08:55:11 -05:00
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
|
|
|
Text,
|
|
|
|
Title,
|
|
|
|
)
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.partition import common
|
2023-10-12 20:28:46 -05:00
|
|
|
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
|
2023-08-24 17:46:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
class MockPageLayout(layout.PageLayout):
    """Test double for ``layout.PageLayout`` that serves a fixed list of elements.

    ``__init__`` does not call the parent initializer; it only stores the two
    attributes it is given.
    """

    def __init__(self, number: int, image: Image.Image):
        # ``Image`` is the PIL module imported at file level; the image class
        # itself is ``Image.Image``, so that is what the annotation names.
        self.number = number
        self.image = image

    @property
    def elements(self):
        """Return four ``LayoutElement`` objects, each with ``bbox=None``.

        Their types are, in order, "Headline", "Subheadline", "Text" and "Title".
        """
        return [
            LayoutElement(
                type="Headline",
                text="Charlie Brown and the Great Pumpkin",
                bbox=None,
            ),
            LayoutElement(
                type="Subheadline",
                text="The Beginning",
                bbox=None,
            ),
            LayoutElement(
                type="Text",
                text="This time Charlie Brown had it really tricky...",
                bbox=None,
            ),
            LayoutElement(
                type="Title",
                text="Another book title in the same page",
                bbox=None,
            ),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
class MockDocumentLayout(layout.DocumentLayout):
    """Test double for ``layout.DocumentLayout`` exposing a single mocked page."""

    @property
    def pages(self):
        """Return a one-item list: page number 1 backed by a 1x1 mode-"1" image."""
        only_page = MockPageLayout(number=1, image=Image.new("1", (1, 1)))
        return [only_page]
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_dict():
    """A dict of type "Title" normalizes to an equal ``Title`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Title",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "coordinate_system": None,
        "text": "Some lovely text",
    }
    # A separate coordinates literal keeps the expectation independent of the input.
    expected = Title(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    assert actual == expected
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_dict_caption():
    """A dict of type "Figure" normalizes to an equal ``FigureCaption`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Figure",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }
    # A separate coordinates literal keeps the expectation independent of the input.
    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    assert actual == expected
|
|
|
|
|
|
|
|
|
2023-09-15 07:43:17 -07:00
|
|
|
@pytest.mark.parametrize(
    ("element_type", "expected_type", "expected_depth"),
    [
        ("Title", Title, None),
        ("Headline", Title, 1),
        ("Subheadline", Title, 2),
        ("Header", Header, None),
    ],
)
def test_normalize_layout_element_headline(element_type, expected_type, expected_depth):
    """Check the element class and ``category_depth`` produced for each layout type."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": element_type,
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }

    normalized = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    # Depth is asserted before the class, matching the original failure order.
    actual_depth = normalized.metadata.category_depth
    assert actual_depth == expected_depth
    assert isinstance(normalized, expected_type)
|
|
|
|
|
|
|
|
|
2023-02-28 10:36:08 -05:00
|
|
|
def test_normalize_layout_element_dict_figure_caption():
    """A dict of type "FigureCaption" normalizes to an equal ``FigureCaption``."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "FigureCaption",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }
    # A separate coordinates literal keeps the expectation independent of the input.
    expected = FigureCaption(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    assert actual == expected
|
|
|
|
|
|
|
|
|
2023-01-19 09:29:28 -05:00
|
|
|
def test_normalize_layout_element_dict_misc():
    """A dict of type "Misc" normalizes to an equal plain ``Text`` element."""
    pixel_space = PixelSpace(width=10, height=20)
    raw_element = {
        "type": "Misc",
        "coordinates": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "text": "Some lovely text",
    }
    # A separate coordinates literal keeps the expectation independent of the input.
    expected = Text(
        text="Some lovely text",
        coordinates=[[1, 2], [3, 4], [5, 6], [7, 8]],
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(raw_element, coordinate_system=pixel_space)

    assert actual == expected
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_layout_element():
    """A ``LayoutElement`` of type "Text" normalizes to an equal ``NarrativeText``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="Some lovely text",
        type="Text",
    )
    # The expectation lists the box corners in the order
    # (x1, y1), (x1, y2), (x2, y2), (x2, y1).
    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    assert actual == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_layout_element_narrative_text():
    """A ``LayoutElement`` of type "NarrativeText" normalizes to an equal ``NarrativeText``."""
    pixel_space = PixelSpace(width=10, height=20)
    source_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="Some lovely text",
        type="NarrativeText",
    )
    # The expectation lists the box corners in the order
    # (x1, y1), (x1, y2), (x2, y2), (x2, y1).
    expected = NarrativeText(
        text="Some lovely text",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )

    actual = common.normalize_layout_element(source_element, coordinate_system=pixel_space)

    assert actual == expected
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_checked_box():
    """A layout element of type "Checked" normalizes to a ``CheckBox`` with ``checked=True``."""
    pixel_space = PixelSpace(width=10, height=20)
    checked_layout_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="",
        type="Checked",
    )

    normalized = common.normalize_layout_element(
        checked_layout_element,
        coordinate_system=pixel_space,
    )

    # The expected corner points are built from the (x1, y1, x2, y2) values passed above.
    expected_checkbox = CheckBox(
        checked=True,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected_checkbox
|
2023-01-19 09:29:28 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_unchecked_box():
    """A layout element of type "Unchecked" normalizes to a ``CheckBox`` with ``checked=False``."""
    pixel_space = PixelSpace(width=10, height=20)
    unchecked_layout_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="",
        type="Unchecked",
    )

    normalized = common.normalize_layout_element(
        unchecked_layout_element,
        coordinate_system=pixel_space,
    )

    # The expected corner points are built from the (x1, y1, x2, y2) values passed above.
    expected_checkbox = CheckBox(
        checked=False,
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=pixel_space,
    )
    assert normalized == expected_checkbox
|
2023-01-20 08:55:11 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_enumerated_list():
    """A numbered "List" layout element normalizes to one ``ListItem`` per numbered entry."""
    pixel_space = PixelSpace(width=10, height=20)
    box_corners = ((1, 2), (1, 4), (3, 4), (3, 2))
    numbered_list_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="1. I'm so cool! 2. You're cool too. 3. We're all cool!",
        type="List",
    )

    list_items = common.normalize_layout_element(
        numbered_list_element,
        coordinate_system=pixel_space,
    )

    # Every expected item shares the coordinates and coordinate system of the source element.
    expected_texts = ("I'm so cool!", "You're cool too.", "We're all cool!")
    assert list_items == [
        ListItem(text=item_text, coordinates=box_corners, coordinate_system=pixel_space)
        for item_text in expected_texts
    ]
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalize_layout_element_bulleted_list():
    """A bulleted "List" layout element normalizes to one ``ListItem`` per bullet."""
    pixel_space = PixelSpace(width=10, height=20)
    box_corners = ((1, 2), (1, 4), (3, 4), (3, 2))
    bulleted_list_element = LayoutElement.from_coords(
        x1=1,
        y1=2,
        x2=3,
        y2=4,
        text="* I'm so cool! * You're cool too. * We're all cool!",
        type="List",
    )

    list_items = common.normalize_layout_element(
        bulleted_list_element,
        coordinate_system=pixel_space,
    )

    # Every expected item shares the coordinates and coordinate system of the source element.
    expected_texts = ("I'm so cool!", "You're cool too.", "We're all cool!")
    assert list_items == [
        ListItem(text=item_text, coordinates=box_corners, coordinate_system=pixel_space)
        for item_text in expected_texts
    ]
|
2023-06-08 12:33:06 -04:00
|
|
|
|
|
|
|
|
|
|
|
class MockPopenWithError:
    """Stand-in for ``subprocess.Popen`` whose ``communicate`` always returns an error message.

    The constructor accepts and ignores any arguments, so the class can be patched in place of
    ``subprocess.Popen`` regardless of how the code under test invokes it.
    """

    def __init__(self, *args, **kwargs):
        # Nothing to set up: no process is ever started.
        pass

    def communicate(self, input=None, timeout=None):  # noqa: A002
        """Return a fixed pair of byte strings: empty first item, error message second.

        ``input`` and ``timeout`` mirror the optional parameters of the real
        ``Popen.communicate`` so callers that pass them keep working; both are ignored.
        """
        return b"", b"an error occurred"
|
|
|
|
|
|
|
|
|
|
|
|
def test_convert_office_doc_captures_errors(monkeypatch, caplog):
    """The error bytes returned by the patched ``Popen`` show up in the captured log text."""
    import subprocess

    # Replace Popen so no real conversion process is launched.
    monkeypatch.setattr(subprocess, "Popen", MockPopenWithError)

    common.convert_office_doc("no-real.docx", "fake-directory", target_format="docx")

    captured_log = caplog.text
    assert "an error occurred" in captured_log
|
2023-07-27 11:07:27 -04:00
|
|
|
|
|
|
|
|
|
|
|
class MockDocxEmptyTable:
    """Table double that exposes only a ``rows`` attribute, initialised to an empty list."""

    def __init__(self) -> None:
        # Each instance gets its own empty list of rows.
        self.rows: list = []
|
|
|
|
|
|
|
|
|
|
|
|
def test_convert_ms_office_table_to_text_works_with_empty_tables():
    """A table without rows converts to an empty string with ``as_html`` both on and off."""
    empty_table = MockDocxEmptyTable()
    for html_mode in (True, False):
        converted = common.convert_ms_office_table_to_text(empty_table, as_html=html_mode)
        assert converted == ""
|
2023-08-10 16:28:57 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("<table><tbody><tr><td>👨\\U+1F3FB🔧</td></tr></tbody></table>", True),
        ("<table><tbody><tr><td>Hello!</td></tr></tbody></table>", False),
    ],
)
def test_contains_emoji(text, expected):
    """``contains_emoji`` returns exactly ``True`` or ``False`` for the given table markup."""
    detected = common.contains_emoji(text)
    # Identity check: the result must be the bool singleton, not merely truthy/falsy.
    assert detected is expected
|
2023-08-24 17:46:19 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
    """With every layout element's ``bbox`` cleared, the first element has no coordinates."""
    doc_layout = MockDocumentLayout()

    # Remove the bounding box from every element on every page before converting.
    every_layout_element = (
        layout_element for page in doc_layout.pages for layout_element in page.elements
    )
    for layout_element in every_layout_element:
        layout_element.bbox = None

    converted = common.document_to_element_list(doc_layout)

    first_element = converted[0]
    assert first_element.metadata.coordinates is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_page_image_metadata_and_coordinate_system():
    """``_get_page_image_metadata`` returns a dict for the first page of the mock document."""
    mock_document = MockDocumentLayout()
    first_page = mock_document.pages[0]

    page_image_metadata = common._get_page_image_metadata(first_page)

    assert isinstance(page_image_metadata, dict)
|
Feat: Create a naive hierarchy for elements (#1268)
## **Summary**
By adding hierarchy to unstructured elements, users will have more
information for implementing vector db/LLM chunking strategies. For
example, text elements could be queried by their preceding title
element. The hierarchy is implemented by a parent_id tag in the
element's metadata.
### Features
- Introduces a parent_id to ElementMetadata (The id of the parent
element, not a pointer)
- Creates a rule set for assigning hierarchies. Sensible default is
assigned, with an optional override parameter
- Sets element parent ids if there isn't an existing parent id or
matches the ruleset
### How it works
Hierarchies are assigned via a parent id field in element metadata.
Elements are read sequentially and evaluated against a ruleset. For
example take the following elements:
1. Title, "This is the Title"
2. Text, "this is the text"
And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of
2 will be the id of 1. The algorithm for determining this is more
complex and resolves several edge cases, so please read the code for
further details.
### Schema Changes
```
@dataclass
class ElementMetadata:
coordinates: Optional[CoordinatesMetadata] = None
data_source: Optional[DataSourceMetadata] = None
filename: Optional[str] = None
file_directory: Optional[str] = None
last_modified: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
+ parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
+ category_depth: Optional[int] = None
...
```
### Testing
```
from unstructured.partition.auto import partition
from typing import List
elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto")
for element in elements:
print(
f"Category: {getattr(element, 'category', '')}\n"\
f"Text: {getattr(element, 'text', '')}\n"
f"ID: {element.id}\n" \
f"Parent ID: {element.metadata.parent_id}\n"\
f"Depth: {element.metadata.category_depth}\n" \
)
```
### Additional Notes
Implementing this feature revealed a possibly undesired side-effect in
how element metadata are processed. In
`unstructured/partition/common.py` the `_add_element_metadata` is
invoked as part of the `add_metadata_with_filetype` decorator for
filetype partitioning. This method is intended to add additional
information to the metadata generated with the element including
filename and filetype, however the existing metadata is merged into a
newly created metadata object rather than the other way around. Because
of the way it's structured, new metadata fields can easily be forgotten
and pose debugging challenges to developers. This likely warrants a new
issue.
I'm guessing that the implementation is done this way to avoid issues
with deserializing elements, but could be wrong.
---------
Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_set_element_hierarchy():
    """Check the ``parent_id`` values assigned by ``set_element_hierarchy`` with no ruleset given.

    The trailing index comments on the input list match the indices used in the assertions.
    """
    elements_to_set = [
        Title(text="Title"),  # 0
        NarrativeText(text="NarrativeText"),  # 1
        FigureCaption(text="FigureCaption"),  # 2
        ListItem(text="ListItem"),  # 3
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 4
        ListItem(text="ListItem", metadata=ElementMetadata(category_depth=1)),  # 5
        ListItem(text="ListItem"),  # 6
        CheckBox(element_id="some-id-1", checked=True),  # 7
        Title(text="Title 2"),  # 8
        ListItem(text="ListItem"),  # 9
        ListItem(text="ListItem"),  # 10
        Text(text="Text"),  # 11
    ]
    elements = common.set_element_hierarchy(elements_to_set)

    assert (
        elements[1].metadata.parent_id == elements[0].id
    ), "NarrativeText should be child of Title"
    assert (
        elements[2].metadata.parent_id == elements[0].id
    ), "FigureCaption should be child of Title"
    assert elements[3].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    # Items 4 and 5 carry category_depth=1 and are expected to hang off the ListItem at
    # index 3, so the failure message names that ListItem rather than the Title.
    assert (
        elements[4].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of the top-level ListItem"
    assert (
        elements[5].metadata.parent_id == elements[3].id
    ), "Nested ListItem should be child of the top-level ListItem"
    assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
    assert (
        elements[7].metadata.parent_id is None
    ), "CheckBox should be None, as it's not a Text based element"
    assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
    assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
    assert elements[11].metadata.parent_id == elements[8].id, "Text should be child of Title 2"
|
|
|
|
|
|
|
|
|
|
|
|
def test_set_element_hierarchy_custom_rule_set():
    """Check the ``parent_id`` values assigned when an explicit ``ruleset`` is supplied."""
    input_elements = [
        Header(text="Header"),  # 0
        Title(text="Title"),  # 1
        NarrativeText(text="NarrativeText"),  # 2
        Text(text="Text"),  # 3
        Title(text="Title 2"),  # 4
        FigureCaption(text="FigureCaption"),  # 5
    ]
    parent_to_children_rules = {
        "Header": ["Title", "Text"],
        "Title": ["NarrativeText", "UncategorizedText", "FigureCaption"],
    }

    result = common.set_element_hierarchy(
        elements=input_elements,
        ruleset=parent_to_children_rules,
    )

    # (child index, expected parent index, message shown when the expectation fails)
    expected_parents = (
        (1, 0, "Title should be child of Header"),
        (2, 1, "NarrativeText should be child of Title"),
        (3, 1, "Text should be child of Title"),
        (4, 0, "Title 2 should be child of Header"),
        (5, 4, "FigureCaption should be child of Title 2"),
    )
    for child_idx, parent_idx, failure_message in expected_parents:
        assert result[child_idx].metadata.parent_id == result[parent_idx].id, failure_message
|
2023-10-05 10:51:06 -07:00
|
|
|
|
|
|
|
|
2023-10-12 20:28:46 -05:00
|
|
|
@dataclass
class MockImage:
    """Minimal image stand-in exposing ``width``, ``height`` and ``format`` attributes."""

    # The annotations are what make ``@dataclass`` treat these as fields; without them the
    # decorator generates no fields and the values cannot be overridden per instance.
    width: int = 640
    height: int = 480
    format: str = "JPG"
|
|
|
|
|
|
|
|
|
|
|
|
def test_document_to_element_list_handles_parent():
    """A layout element's ``parent`` is reflected as ``parent_id`` on the converted element."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1,
        2,
        3,
        4,
        text="block 2",
        parent=parent_block,
        type="NarrativeText",
    )

    single_page = PageLayout(number=1, image=MockImage(), layout=None)
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])

    parent_element, child_element = common.document_to_element_list(document)

    assert child_element.metadata.parent_id == parent_element.id
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("sort_mode", "call_count"),
    [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
)
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
    """``sort_page_elements`` is invoked the expected number of times for each sort mode."""
    parent_block = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
    child_block = LayoutElement.from_coords(
        1,
        2,
        3,
        4,
        text="block 2",
        parent=parent_block,
        type="NarrativeText",
    )

    single_page = PageLayout(number=1, image=MockImage(), layout=None)
    single_page.elements = [parent_block, child_block]
    document = DocumentLayout.from_pages([single_page])

    # Patch the sorter on the module under test so its invocations can be counted.
    with mock.patch.object(common, "sort_page_elements") as sorter_spy:
        common.document_to_element_list(document, sortable=True, sort_mode=sort_mode)
        assert sorter_spy.call_count == call_count
|
|
|
|
|
|
|
|
|
2023-10-05 10:51:06 -07:00
|
|
|
def test_document_to_element_list_sets_category_depth_titles():
    """The first four converted elements carry category depths 1, 2, ``None`` and 0."""
    mock_document = MockDocumentLayout()
    converted = common.document_to_element_list(mock_document)

    first_depth, second_depth, third_depth, fourth_depth = (
        converted[position].metadata.category_depth for position in range(4)
    )

    assert first_depth == 1
    assert second_depth == 2
    assert third_depth is None
    assert fourth_depth == 0
|
2023-10-30 13:13:29 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_ocr_data_to_elements(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """Convert two OCR layout elements and check their count, category and coordinates.

    ``filename`` is the image whose pixel dimensions define the expected coordinate system.
    """
    text_regions = [
        TextRegion.from_coords(
            163.0,
            115.0,
            452.0,
            129.0,
            text="LayoutParser: A Unified Toolkit for Deep",
        ),
        TextRegion.from_coords(
            156.0,
            132.0,
            457.0,
            147.0,
            text="Learning Based Document Image Analysis",
        ),
    ]
    ocr_data = [
        LayoutElement(
            bbox=r.bbox,
            text=r.text,
            source=r.source,
            type="UncategorizedText",
        )
        for r in text_regions
    ]

    # Only the dimensions are needed, so read them inside a context manager that closes
    # the underlying file instead of leaving the handle open for the rest of the test run.
    with Image.open(filename) as image:
        image_size = image.size

    elements = common.ocr_data_to_elements(
        ocr_data=ocr_data,
        image_size=image_size,
    )

    assert len(ocr_data) == len(elements)
    assert {el.category for el in elements} == {"UncategorizedText"}

    # check coordinates metadata
    image_width, image_height = image_size
    coordinate_system = PixelSpace(width=image_width, height=image_height)
    for el, layout_el in zip(elements, ocr_data):
        assert el.metadata.coordinates == CoordinatesMetadata(
            points=layout_el.bbox.coordinates,
            system=coordinate_system,
        )
|