mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 07:33:36 +00:00
fix: properly handle the case when an element's text is None (#3995)
Some elements, like `Image`, can have `None` as the value of their `text` attribute. In that case the current chunking logic fails because it expects the field to always have a length and to be splittable. The fix is to use `element.text or ""` when checking the length, and to add flow control that skips such elements early so that `split` is never called on `None`.
This commit is contained in:
parent
604c4a7c5e
commit
b814ece39f
@ -1,4 +1,4 @@
|
||||
## 0.17.6-dev2
|
||||
## 0.17.6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@ Two executions of the same code, on the same file, produce different results. Th
|
||||
This makes it impossible to write stable unit tests, for example, or to obtain reproducible results.
|
||||
- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
|
||||
- Resolve open CVEs
|
||||
- Properly handle the case when an element's `text` attribute is None
|
||||
|
||||
|
||||
## 0.17.5
|
||||
@ -48,7 +49,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when they contained files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively; they are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
|
||||
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when they contained files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively; they are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
|
||||
|
||||
## 0.17.2
|
||||
|
||||
|
||||
@ -31,6 +31,7 @@ from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Image,
|
||||
PageBreak,
|
||||
Table,
|
||||
TableChunk,
|
||||
@ -234,6 +235,10 @@ class DescribePreChunkBuilder:
|
||||
assert builder._text_length == 112
|
||||
assert builder._remaining_space == 36
|
||||
|
||||
def it_will_fit_when_element_has_none_as_text(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||
assert builder.will_fit(Image(None))
|
||||
|
||||
def it_will_fit_an_oversized_element_when_empty(self):
|
||||
builder = PreChunkBuilder(opts=ChunkingOptions())
|
||||
assert builder.will_fit(Text("abcd " * 200))
|
||||
@ -405,6 +410,12 @@ class DescribePreChunk:
|
||||
pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions())
|
||||
assert pre_chunk != 42
|
||||
|
||||
def it_can_handle_element_with_none_as_text(self):
|
||||
pre_chunk = PreChunk(
|
||||
[Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions()
|
||||
)
|
||||
assert pre_chunk._text == "hello"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("max_characters", "combine_text_under_n_chars", "expected_value"),
|
||||
[
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.17.6-dev2" # pragma: no cover
|
||||
__version__ = "0.17.6" # pragma: no cover
|
||||
|
||||
@ -387,7 +387,7 @@ class PreChunkBuilder:
|
||||
if self._text_length > self._opts.soft_max:
|
||||
return False
|
||||
# -- don't add an element if it would increase total size beyond the hard-max --
|
||||
return not self._remaining_space < len(element.text)
|
||||
return not self._remaining_space < len(element.text or "")
|
||||
|
||||
@property
|
||||
def _remaining_space(self) -> int:
|
||||
@ -503,6 +503,8 @@ class PreChunk:
|
||||
if self._overlap_prefix:
|
||||
yield self._overlap_prefix
|
||||
for e in self._elements:
|
||||
if e.text is None:
|
||||
continue
|
||||
text = " ".join(e.text.strip().split())
|
||||
if not text:
|
||||
continue
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user