feat: apply method to apply cleaning bricks to elements (#102)

* add apply method to apply cleaners to elements

* bump version

* add check for string output

* documentations for the apply method

* change interface to *cleaners
This commit is contained in:
Matt Robinson 2022-12-15 17:19:02 -05:00 committed by GitHub
parent b1cce16c16
commit 1d68bb2482
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 76 additions and 3 deletions

View File

@ -1,6 +1,7 @@
## 0.3.2-dev0
## 0.3.2
* Added `translate_text` brick for translating text between languages
* Added an `apply` method to make it easier to apply cleaners to elements
## 0.3.1

View File

@ -11,3 +11,35 @@ elements.
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
#########################################
Applying Cleaning Bricks to Text Elements
#########################################
You can apply cleaning bricks to a text element by using the ``apply`` method. The
``apply`` method accepts any function that takes a string as input and produces a string
as output. Use the ``partial`` function from ``functools`` if you need to set additional
args or kwargs for your cleaning brick. The ``apply`` method accepts one or more
cleaners as positional arguments; unpack a list of cleaners with ``*cleaners``.
Examples:
.. code:: python
from functools import partial
from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.elements import ListItem
cleaners = [
partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
partial(translate_text, target_lang="ru"),
]
item = ListItem(text="[1] A Textbook on Crocodile Habitats")
item.apply(*cleaners)
# The output will be: Учебник по крокодильным средам обитания
print(item)

View File

@ -1,3 +1,8 @@
from functools import partial
import pytest
from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.elements import Element, NoID, Text
@ -9,3 +14,26 @@ def test_text_id():
def test_element_defaults_to_blank_id():
    """An ``Element`` created with no arguments has a ``NoID`` instance as its id."""
    blank = Element()
    element_id = blank.id
    assert isinstance(element_id, NoID)
def test_text_element_apply_cleaners():
    """Passing one cleaner to ``apply`` rewrites the element's text in place."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation)

    assert str(element) == "A Textbook on Crocodile Habitats"
def test_text_element_apply_multiple_cleaners():
    """Chain a prefix cleaner and a translation through a single ``apply`` call."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    assert str(element) == "Учебник по крокодильным средам обитания"
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` must raise ``ValueError`` when the cleaner returns a non-string."""

    def returns_int(_text):
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)

View File

@ -1 +1 @@
__version__ = "0.3.2-dev0" # pragma: no cover
__version__ = "0.3.2" # pragma: no cover

View File

@ -1,6 +1,6 @@
from abc import ABC
import hashlib
from typing import Union
from typing import Callable, Union
class NoID(ABC):
@ -36,6 +36,18 @@ class Text(Element):
def __eq__(self, other):
    """Compare two elements by their ``text`` attribute.

    Returns ``NotImplemented`` when ``other`` has no ``text`` attribute so that
    Python falls back to its default comparison (e.g. ``element == None`` is
    ``False``) instead of raising ``AttributeError``.
    """
    try:
        other_text = other.text
    except AttributeError:
        return NotImplemented
    return self.text == other_text
def apply(self, *cleaners: Callable) -> None:
    """Applies one or more cleaning bricks to the text element, in the order given.

    The first cleaner receives ``self.text``; each later cleaner receives the
    previous cleaner's output. ``self.text`` is only replaced once every cleaner
    has run, so a failure leaves the element unchanged.

    Args:
        *cleaners: Callables that take a string as input and produce a string as
            output. Use ``functools.partial`` to pre-bind extra arguments.

    Raises:
        ValueError: If any cleaner returns something that is not a string.
    """
    cleaned_text = self.text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)
        # Validate after every step so a misbehaving cleaner is reported here
        # instead of surfacing as an unrelated error inside the next cleaner.
        if not isinstance(cleaned_text, str):
            raise ValueError("Cleaner produced a non-string output.")

    self.text = cleaned_text
class NarrativeText(Text):
"""NarrativeText is an element consisting of multiple, well-formulated sentences. This