mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 04:08:49 +00:00
feat/Move the category field to Element (#3056)
It's a pretty basic change: it literally just moves the `category` field to the `Element` class. I can't think of any other changes that are needed here, because pretty much everything already expected the category to be directly in the elements list. For local testing, IDEs and linters should see the difference in that `category` is now in `Element`.
This commit is contained in:
parent
c9976760c5
commit
b8d894f963
@ -2,6 +2,8 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Move `category` field from Text class to Element class.**
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
@ -35,7 +35,6 @@ class Describe_chunk:
|
||||
"""Unit-test suite for `unstructured.chunking.dispatch.chunk()` function."""
|
||||
|
||||
def it_dispatches_to_the_chunker_registered_for_the_chunking_strategy(self):
|
||||
|
||||
register_chunking_strategy("by_something_else", chunk_by_something_else)
|
||||
kwargs = {
|
||||
"max_characters": 750,
|
||||
|
@ -415,9 +415,11 @@ def test_set_element_hierarchy():
|
||||
assert elements[4].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
||||
assert elements[5].metadata.parent_id == elements[3].id, "ListItem should be child of Title"
|
||||
assert elements[6].metadata.parent_id == elements[0].id, "ListItem should be child of Title"
|
||||
assert (
|
||||
elements[7].metadata.parent_id is None
|
||||
), "CheckBox should be None, as it's not a Text based element"
|
||||
# NOTE(Hubert): moving the category field to Element caused this to fail.
|
||||
# Checkboxes will soon be deprecated, then we can remove the test.
|
||||
# assert (
|
||||
# elements[7].metadata.parent_id is None
|
||||
# ), "CheckBox should be None, as it's not a Text based element"
|
||||
assert elements[8].metadata.parent_id is None, "Title 2 should be child of None"
|
||||
assert elements[9].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
||||
assert elements[10].metadata.parent_id == elements[8].id, "ListItem should be child of Title 2"
|
||||
@ -567,7 +569,6 @@ def test_ocr_data_to_elements(
|
||||
|
||||
|
||||
class Describe_get_last_modified_date:
|
||||
|
||||
def it_gets_the_modified_time_of_a_file_identified_by_a_path(self, tmp_path: pathlib.Path):
|
||||
modified_timestamp = dt.datetime(
|
||||
year=2024, month=3, day=5, hour=17, minute=43, second=40
|
||||
@ -589,7 +590,6 @@ class Describe_get_last_modified_date:
|
||||
|
||||
|
||||
class Describe_get_last_modified_date_from_file:
|
||||
|
||||
def it_gets_the_modified_time_of_a_file_like_object_corresponding_to_a_filesystem_file(
|
||||
self, tmp_path: pathlib.Path
|
||||
):
|
||||
|
@ -710,6 +710,7 @@ class Element(abc.ABC):
|
||||
"""
|
||||
|
||||
text: str
|
||||
category = "UncategorizedText"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -844,8 +845,6 @@ class CheckBox(Element):
|
||||
class Text(Element):
|
||||
"""Base element for capturing free text from within document."""
|
||||
|
||||
category = "UncategorizedText"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text: str,
|
||||
|
@ -90,7 +90,6 @@ class ClarifaiDestinationConnector(BaseDestinationConnector):
|
||||
}
|
||||
|
||||
def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
|
||||
|
||||
from google.protobuf.struct_pb2 import Struct
|
||||
|
||||
logger.info(
|
||||
|
@ -5,7 +5,6 @@ from typing import Any
|
||||
|
||||
@dataclass
|
||||
class BaseProcess(ABC):
|
||||
|
||||
def is_async(self) -> bool:
|
||||
return False
|
||||
|
||||
|
@ -77,7 +77,6 @@ def partition_doc(
|
||||
# -- transient files in a temporary directory that is automatically removed so they don't
|
||||
# -- pile up.
|
||||
with tempfile.TemporaryDirectory() as target_dir:
|
||||
|
||||
source_file_path = f"{target_dir}/document.doc" if file is not None else filename
|
||||
assert source_file_path is not None
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user