mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
fix: remove unused ElementMetadata.section (#2921)
**Summary** The `.section` field in `ElementMetadata` is dead code, possibly a remainder from a prior iteration of `partition_epub()`. In any case, it is not populated by any partitioner. Remove it and any code that uses it.
This commit is contained in:
parent
305247b4e1
commit
05ff975081
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.13.4-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Remove ElementMetadata.section field.** This field was unused, not populated by any partitioners.
|
||||
|
||||
## 0.13.3
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -152,13 +152,6 @@ following behaviors:
|
||||
``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve
|
||||
section boundaries" contract.
|
||||
|
||||
- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is
|
||||
considered to start a new section. When a change in this value is encountered a new chunk is
|
||||
started. This implements the second aspect of preserving section boundaries. This metadata is not
|
||||
present in all document formats so is not used alone. An element having ``None`` for this metadata
|
||||
field is considered to be part of the prior section; a section break is only detected on an
|
||||
explicit change in value.
|
||||
|
||||
- **Respect page boundaries.** Page boundaries can optionally also be respected using the
|
||||
``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not*
|
||||
start a new chunk. Setting this to ``False`` will separate elements that occur on different pages
|
||||
|
||||
@ -17,7 +17,6 @@ from unstructured.chunking.base import (
|
||||
TextPreChunk,
|
||||
TextPreChunkAccumulator,
|
||||
_TextSplitter,
|
||||
is_in_next_section,
|
||||
is_on_next_page,
|
||||
is_title,
|
||||
)
|
||||
@ -1514,68 +1513,6 @@ class DescribeTextPreChunkAccumulator:
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class Describe_is_in_next_section:
|
||||
"""Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function.
|
||||
|
||||
`is_in_next_section()` is not itself a predicate, rather it returns a predicate on Element
|
||||
(`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an
|
||||
element stream.
|
||||
"""
|
||||
|
||||
def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self):
|
||||
"""This is an explicit first-section; first-section does not represent a section break."""
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
|
||||
|
||||
def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self):
|
||||
"""This is an anonymous first-section; still doesn't represent a section break."""
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd"))
|
||||
|
||||
def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self):
|
||||
"""A `None` section element is considered to continue the prior section."""
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
|
||||
assert not pred(Text("efgh"))
|
||||
assert not pred(Text("ijkl"))
|
||||
|
||||
def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self):
|
||||
"""A `None` section element is considered to continue the prior section."""
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd"))
|
||||
assert not pred(Text("efgh"))
|
||||
assert not pred(Text("ijkl"))
|
||||
|
||||
def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self):
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
|
||||
assert not pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
|
||||
assert not pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
|
||||
|
||||
def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self):
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd"))
|
||||
assert not pred(Text("efgh"))
|
||||
assert pred(Text("ijkl", metadata=ElementMetadata(section="Introduction")))
|
||||
|
||||
def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self):
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction")))
|
||||
assert pred(Text("efgh", metadata=ElementMetadata(section="Summary")))
|
||||
|
||||
def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self):
|
||||
pred = is_in_next_section()
|
||||
assert not pred(Text("abcd"))
|
||||
assert pred(Text("efgh", metadata=ElementMetadata(section="Introduction")))
|
||||
assert not pred(Text("ijkl"))
|
||||
assert not pred(Text("mnop", metadata=ElementMetadata(section="Introduction")))
|
||||
assert not pred(Text("qrst"))
|
||||
assert pred(Text("uvwx", metadata=ElementMetadata(section="Summary")))
|
||||
assert not pred(Text("yzab", metadata=ElementMetadata(section="Summary")))
|
||||
assert not pred(Text("cdef"))
|
||||
assert pred(Text("ghij", metadata=ElementMetadata(section="Appendix")))
|
||||
|
||||
|
||||
class Describe_is_on_next_page:
|
||||
"""Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function.
|
||||
|
||||
|
||||
@ -139,43 +139,6 @@ def test_chunk_by_title():
|
||||
)
|
||||
|
||||
|
||||
def test_chunk_by_title_respects_section_change():
|
||||
elements: list[Element] = [
|
||||
Title("A Great Day", metadata=ElementMetadata(section="first")),
|
||||
Text("Today is a great day.", metadata=ElementMetadata(section="second")),
|
||||
Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
|
||||
Table("Heading\nCell text"),
|
||||
Title("An Okay Day"),
|
||||
Text("Today is an okay day."),
|
||||
Text("It is rainy outside."),
|
||||
Title("A Bad Day"),
|
||||
Text(
|
||||
"Today is a bad day.",
|
||||
metadata=ElementMetadata(
|
||||
regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
|
||||
),
|
||||
),
|
||||
Text("It is storming outside."),
|
||||
CheckBox(),
|
||||
]
|
||||
|
||||
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
|
||||
|
||||
assert chunks == [
|
||||
CompositeElement(
|
||||
"A Great Day",
|
||||
),
|
||||
CompositeElement(
|
||||
"Today is a great day.\n\nIt is sunny outside.",
|
||||
),
|
||||
Table("Heading\nCell text"),
|
||||
CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
|
||||
CompositeElement(
|
||||
"A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def test_chunk_by_title_separates_by_page_number():
|
||||
elements: list[Element] = [
|
||||
Title("A Great Day", metadata=ElementMetadata(page_number=1)),
|
||||
|
||||
@ -77,7 +77,6 @@ def test_partition_epub_from_filename_exclude_metadata():
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
assert elements[0].metadata.filename is None
|
||||
assert elements[0].metadata.section is None
|
||||
|
||||
|
||||
def test_partition_epub_from_file_exlcude_metadata():
|
||||
@ -87,7 +86,6 @@ def test_partition_epub_from_file_exlcude_metadata():
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
assert elements[0].metadata.filename is None
|
||||
assert elements[0].metadata.section is None
|
||||
|
||||
|
||||
def test_partition_epub_metadata_date(
|
||||
|
||||
@ -166,7 +166,6 @@ def test_default_pandas_dtypes():
|
||||
sent_from=["sent", "from"],
|
||||
sent_to=["sent", "to"],
|
||||
subject="subject",
|
||||
section="section",
|
||||
header_footer_type="header_footer_type",
|
||||
emphasized_text_contents=["emphasized", "text", "contents"],
|
||||
emphasized_text_tags=["emphasized", "text", "tags"],
|
||||
@ -321,7 +320,6 @@ def test_convert_to_coco():
|
||||
sent_from=["sent", "from"],
|
||||
sent_to=["sent", "to"],
|
||||
subject="subject",
|
||||
section="section",
|
||||
header_footer_type="header_footer_type",
|
||||
emphasized_text_contents=["emphasized", "text", "contents"],
|
||||
emphasized_text_tags=["emphasized", "text", "tags"],
|
||||
@ -366,7 +364,6 @@ def test_convert_to_coco():
|
||||
sent_from=["sent", "from"],
|
||||
sent_to=["sent", "to"],
|
||||
subject="subject",
|
||||
section="section",
|
||||
header_footer_type="header_footer_type",
|
||||
emphasized_text_contents=["emphasized", "text", "contents"],
|
||||
emphasized_text_tags=["emphasized", "text", "tags"],
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.13.3" # pragma: no cover
|
||||
__version__ = "0.13.4-dev0" # pragma: no cover
|
||||
|
||||
@ -1022,51 +1022,6 @@ class TextPreChunkAccumulator:
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
def is_in_next_section() -> BoundaryPredicate:
|
||||
"""Not a predicate itself, calling this returns a predicate that triggers on each new section.
|
||||
|
||||
The lifetime of the returned callable cannot extend beyond a single element-stream because it
|
||||
stores current state (current section) that is particular to that element stream.
|
||||
|
||||
A "section" of this type is particular to the EPUB format (so far) and not to be confused with
|
||||
a "section" composed of a section-heading (`Title` element) followed by content elements.
|
||||
|
||||
The returned predicate tracks the current section, starting at `None`. Calling with an element
|
||||
with a different value for `metadata.section` returns True, indicating the element starts a new
|
||||
section boundary, and updates the enclosed section name ready for the next transition.
|
||||
"""
|
||||
current_section: Optional[str] = None
|
||||
is_first: bool = True
|
||||
|
||||
def section_changed(element: Element) -> bool:
|
||||
nonlocal current_section, is_first
|
||||
|
||||
section = element.metadata.section
|
||||
|
||||
# -- The first element never reports a section break, it starts the first section of the
|
||||
# -- document. That section could be named (section is non-None) or anonymous (section is
|
||||
# -- None). We don't really have to care.
|
||||
if is_first:
|
||||
current_section = section
|
||||
is_first = False
|
||||
return False
|
||||
|
||||
# -- An element with a `None` section is assumed to continue the current section. It never
|
||||
# -- updates the current-section because once set, the current-section is "sticky" until
|
||||
# -- replaced by another explicit section.
|
||||
if section is None:
|
||||
return False
|
||||
|
||||
# -- another element with the same section continues that section --
|
||||
if section == current_section:
|
||||
return False
|
||||
|
||||
current_section = section
|
||||
return True
|
||||
|
||||
return section_changed
|
||||
|
||||
|
||||
def is_on_next_page() -> BoundaryPredicate:
|
||||
"""Not a predicate itself, calling this returns a predicate that triggers on each new page.
|
||||
|
||||
|
||||
@ -13,7 +13,6 @@ from unstructured.chunking.base import (
|
||||
ChunkingOptions,
|
||||
PreChunkCombiner,
|
||||
PreChunker,
|
||||
is_in_next_section,
|
||||
is_on_next_page,
|
||||
is_title,
|
||||
)
|
||||
@ -121,7 +120,6 @@ class _ByTitleChunkingOptions(ChunkingOptions):
|
||||
|
||||
def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:
|
||||
yield is_title
|
||||
yield is_in_next_section()
|
||||
if not self.multipage_sections:
|
||||
yield is_on_next_page()
|
||||
|
||||
|
||||
@ -191,8 +191,6 @@ class ElementMetadata:
|
||||
parent_id: Optional[str]
|
||||
# -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
|
||||
regex_metadata: Optional[dict[str, list[RegexMetadata]]]
|
||||
# -- EPUB document section --
|
||||
section: Optional[str]
|
||||
|
||||
# -- e-mail specific metadata fields --
|
||||
sent_from: Optional[list[str]]
|
||||
@ -235,7 +233,6 @@ class ElementMetadata:
|
||||
page_number: Optional[int] = None,
|
||||
parent_id: Optional[str] = None,
|
||||
regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
|
||||
section: Optional[str] = None,
|
||||
sent_from: Optional[list[str]] = None,
|
||||
sent_to: Optional[list[str]] = None,
|
||||
signature: Optional[str] = None,
|
||||
@ -275,7 +272,6 @@ class ElementMetadata:
|
||||
self.page_number = page_number
|
||||
self.parent_id = parent_id
|
||||
self.regex_metadata = regex_metadata
|
||||
self.section = section
|
||||
self.sent_from = sent_from
|
||||
self.sent_to = sent_to
|
||||
self.signature = signature
|
||||
@ -488,7 +484,6 @@ class ConsolidationStrategy(enum.Enum):
|
||||
"page_number": cls.FIRST,
|
||||
"parent_id": cls.DROP,
|
||||
"regex_metadata": cls.REGEX,
|
||||
"section": cls.FIRST,
|
||||
"sent_from": cls.FIRST,
|
||||
"sent_to": cls.FIRST,
|
||||
"signature": cls.FIRST,
|
||||
@ -671,7 +666,7 @@ class ElementType:
|
||||
|
||||
|
||||
class Element(abc.ABC):
|
||||
"""An element is a section of a page in the document.
|
||||
"""An element is a semantically-coherent component of a document, often a paragraph.
|
||||
|
||||
There are a few design principles that are followed when creating an element:
|
||||
1. It will always have an ID, which by default is a random UUID.
|
||||
@ -694,7 +689,9 @@ class Element(abc.ABC):
|
||||
metadata: Optional[ElementMetadata] = None,
|
||||
detection_origin: Optional[str] = None,
|
||||
):
|
||||
if element_id is not None and not isinstance(element_id, str):
|
||||
if element_id is not None and not isinstance(
|
||||
element_id, str
|
||||
): # pyright: ignore[reportUnnecessaryIsInstance]
|
||||
raise ValueError("element_id must be of type str or None.")
|
||||
|
||||
self._element_id = element_id
|
||||
@ -885,7 +882,12 @@ class Formula(Text):
|
||||
|
||||
|
||||
class CompositeElement(Text):
|
||||
"""A section of text consisting of a combination of elements."""
|
||||
"""A chunk formed from text (non-Table) elements.
|
||||
|
||||
Only produced by chunking. An instance may be formed by combining one or more sequential
|
||||
elements produced by partitioning. It is also used when text-splitting an "oversized" element,
|
||||
a single element that by itself is larger than the requested chunk size.
|
||||
"""
|
||||
|
||||
category = "CompositeElement"
|
||||
|
||||
|
||||
@ -272,7 +272,6 @@ def add_element_metadata(
|
||||
text_as_html: Optional[str] = None,
|
||||
coordinates: Optional[tuple[tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
section: Optional[str] = None,
|
||||
image_path: Optional[str] = None,
|
||||
detection_origin: Optional[str] = None,
|
||||
languages: Optional[List[str]] = None,
|
||||
@ -324,7 +323,6 @@ def add_element_metadata(
|
||||
link_start_indexes=link_start_indexes,
|
||||
emphasized_text_contents=emphasized_text_contents,
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
section=section,
|
||||
category_depth=depth,
|
||||
image_path=image_path,
|
||||
languages=languages,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user