mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 11:34:07 +00:00
feat: apply method to apply cleaning bricks to elements (#102)
* add apply method to apply cleaners to elements * bump version * add check for string output * documentations for the apply method * change interface to *cleaners
This commit is contained in:
parent
b1cce16c16
commit
1d68bb2482
@ -1,6 +1,7 @@
|
||||
## 0.3.2-dev0
|
||||
## 0.3.2
|
||||
|
||||
* Added `translate_text` brick for translating text between languages
|
||||
* Added an `apply` method to make it easier to apply cleaners to elements
|
||||
|
||||
## 0.3.1
|
||||
|
||||
|
||||
@ -11,3 +11,35 @@ elements.
|
||||
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
|
||||
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
|
||||
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
|
||||
|
||||
|
||||
#########################################
|
||||
Applying Cleaning Bricks to Text Elements
|
||||
#########################################
|
||||
|
||||
You can apply cleaning bricks to a text element by using the ``apply`` method. The
|
||||
apply method accepts any function that takes a string as input and produces a string
|
||||
as output. Use the ``partial`` function from ``functools`` if you need to set additional
|
||||
args or kwargs for your cleaning brick. The ``apply`` method accepts one or more
|
||||
cleaners as positional arguments; unpack a list of cleaners with ``*cleaners``.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from functools import partial
|
||||
|
||||
from unstructured.cleaners.core import clean_prefix
|
||||
from unstructured.cleaners.translate import translate_text
|
||||
from unstructured.documents.elements import ListItem
|
||||
|
||||
cleaners = [
|
||||
partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
|
||||
partial(translate_text, target_lang="ru"),
|
||||
]
|
||||
|
||||
item = ListItem(text="[1] A Textbook on Crocodile Habitats")
|
||||
item.apply(*cleaners)
|
||||
|
||||
# The output will be: Учебник по крокодильным средам обитания
|
||||
print(item)
|
||||
@ -1,3 +1,8 @@
|
||||
from functools import partial
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners.core import clean_prefix
|
||||
from unstructured.cleaners.translate import translate_text
|
||||
from unstructured.documents.elements import Element, NoID, Text
|
||||
|
||||
|
||||
@ -9,3 +14,26 @@ def test_text_id():
|
||||
def test_element_defaults_to_blank_id():
    """An element built without arguments carries a ``NoID`` placeholder id."""
    default_id = Element().id
    assert isinstance(default_id, NoID)
|
||||
|
||||
|
||||
def test_text_element_apply_cleaners():
    """A single cleaner passed to ``apply`` rewrites the element's text."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation)

    assert str(element) == "A Textbook on Crocodile Habitats"
|
||||
|
||||
|
||||
def test_text_element_apply_multiple_cleaners():
    """Several cleaners passed to ``apply`` run one after another, in order."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation, to_russian)

    assert str(element) == "Учебник по крокодильным средам обитания"
|
||||
|
||||
|
||||
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` rejects a cleaner whose return value is not a string."""

    def returns_int(_text):
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.2-dev0" # pragma: no cover
|
||||
__version__ = "0.3.2" # pragma: no cover
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
from abc import ABC
|
||||
import hashlib
|
||||
from typing import Union
|
||||
from typing import Callable, Union
|
||||
|
||||
|
||||
class NoID(ABC):
|
||||
@ -36,6 +36,18 @@ class Text(Element):
|
||||
def __eq__(self, other):
|
||||
return self.text == other.text
|
||||
|
||||
def apply(self, *cleaners: Callable):
    """Applies one or more cleaning bricks to the text element, in order.

    The first cleaner receives the element's current text and each later cleaner
    receives the output of the one before it. Every cleaner should take a string as
    input and produce a string as output. The element's text is only replaced after
    all cleaners have run successfully; if no cleaners are passed the text is left
    as it is.

    Raises:
        ValueError: if any cleaner produces a non-string output.
    """
    cleaned_text = self.text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)
        # Validate after every step so a misbehaving cleaner is reported right away
        # instead of having its non-string output fed into the next cleaner.
        if not isinstance(cleaned_text, str):
            raise ValueError("Cleaner produced a non-string output.")

    self.text = cleaned_text
|
||||
|
||||
|
||||
class NarrativeText(Text):
|
||||
"""NarrativeText is an element consisting of multiple, well-formulated sentences. This
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user