mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
bug: empty-elements (#1252)
- This PR adds a function to check if a piece of text only contains a bullet (no text) to prevent creating an empty element. - Also fixed a test that had a typo.
This commit is contained in:
parent
69265685ea
commit
d07baed4a1
@ -13,6 +13,8 @@
|
||||
* **Allow setting table crop parameter** In certain circumstances, adjusting the table crop padding may improve table extraction.
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fixes `partition_text` to prevent empty elements** Adds a check to filter out empty bullets.
|
||||
* **Handle empty string for `ocr_languages` with values for `languages`** Some API users ran into an issue with sending `languages` params because the API defaulted to also using an empty string for `ocr_languages`. This update handles situations where `languages` is defined and `ocr_languages` is an empty string.
|
||||
* **Fix PDF tried to loop through None** Previously, the PDF annotation extraction tried to loop through `annots` that resolved to None. A logical check was added to avoid this error.
|
||||
* **Ingest session handler not being shared correctly** All ingest docs that leverage the session handler should only need to set it once per process. It was being recreated each time because the right values were neither set nor available, given how dataclasses work in Python.
|
||||
|
@ -14,7 +14,8 @@ This is a test email to use for unit tests.
|
||||
Important points:
|
||||
|
||||
- Roses are red
|
||||
- Violets are blue
|
||||
- Violets are blue
|
||||
-
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
|
@ -15,7 +15,7 @@ from unstructured.staging.base import elements_to_json
|
||||
"fake-email.txt",
|
||||
{
|
||||
("UncategorizedText", None): 6,
|
||||
("ListItem", None): 12,
|
||||
("ListItem", None): 2,
|
||||
("Title", None): 5,
|
||||
("NarrativeText", None): 2,
|
||||
},
|
||||
|
@ -22,7 +22,7 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.file_utils.encoding import read_txt_file
|
||||
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
|
||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
|
||||
from unstructured.nlp.tokenize import sent_tokenize
|
||||
from unstructured.partition.common import (
|
||||
exactly_one,
|
||||
@ -186,7 +186,7 @@ def _partition_text(
|
||||
for ctext in file_content:
|
||||
ctext = ctext.strip()
|
||||
|
||||
if ctext:
|
||||
if ctext and not is_empty_bullet(ctext):
|
||||
element = element_from_text(ctext)
|
||||
element.metadata = copy.deepcopy(metadata)
|
||||
elements.append(element)
|
||||
@ -201,14 +201,20 @@ def _partition_text(
|
||||
return elements
|
||||
|
||||
|
||||
def is_empty_bullet(text: str) -> bool:
    """Checks if input text is an empty bullet.

    The text counts as an empty bullet when it is exactly one character long
    and ``UNICODE_BULLETS_RE`` matches at its start, i.e. the text is a lone
    bullet marker with nothing after it.

    Args:
        text: The candidate text. The visible caller in ``_partition_text``
            strips surrounding whitespace before calling; this function does
            not strip it itself.

    Returns:
        True if the text is a lone bullet character, False otherwise.
    """
    # Check the length first: it is cheap and skips the regex for normal text.
    # Comparing the match against None yields a real bool; the bare
    # ``match(...) and ...`` expression leaked None (no match) to callers
    # despite the ``-> bool`` annotation.
    return len(text) == 1 and UNICODE_BULLETS_RE.match(text) is not None
|
||||
|
||||
|
||||
def element_from_text(
|
||||
text: str,
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
) -> Element:
|
||||
if is_bulleted_text(text):
|
||||
clean_text = clean_bullets(text)
|
||||
return ListItem(
|
||||
text=clean_bullets(text),
|
||||
text=clean_text,
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user