From 1d68bb248205c457664f1bd278f4bca75c5840e7 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 15 Dec 2022 17:19:02 -0500 Subject: [PATCH] feat: `apply` method to apply cleaning bricks to elements (#102) * add apply method to apply cleaners to elements * bump version * add check for string output * documentations for the apply method * change interface to *cleaners --- CHANGELOG.md | 3 +- docs/source/elements.rst | 32 ++++++++++++++++++++ test_unstructured/documents/test_elements.py | 28 +++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 14 ++++++++- 5 files changed, 76 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f080879e8..9f1b82b09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ -## 0.3.2-dev0 +## 0.3.2 * Added `translate_text` brick for translating text between languages +* Add an `apply` method to make it easier to apply cleaners to elements ## 0.3.1 diff --git a/docs/source/elements.rst b/docs/source/elements.rst index 67cb359c0..e68534f06 100644 --- a/docs/source/elements.rst +++ b/docs/source/elements.rst @@ -11,3 +11,35 @@ elements. * ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``. * ``Title`` - Headings and sub-headings wtihin a document. Sub-class of ``Text``. * ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``. + + +######################################### +Applying Cleaning Bricks to Text Elements +######################################### + +You can apply cleaning bricks to a text element by using the ``apply`` method. The +apply method accepts any function that takes a string as input and produces a string +as output. Use the ``partial`` function from ``functools`` if you need to set additional +args or kwargs for your cleaning brick. The ``apply`` method accepts one or more cleaners +as positional arguments; unpack a list of cleaners with ``*cleaners``. + +Examples: + +.. 
code:: python + + from functools import partial + + from unstructured.cleaners.core import clean_prefix + from unstructured.cleaners.translate import translate_text + from unstructured.documents.elements import ListItem + + cleaners = [ + partial(clean_prefix, pattern=r"\[\d{1,2}\]"), + partial(translate_text, target_lang="ru"), + ] + + item = ListItem(text="[1] A Textbook on Crocodile Habitats") + item.apply(*cleaners) + + # The output will be: Учебник по крокодильным средам обитания + print(item) \ No newline at end of file diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 82752c8d1..70136fa0d 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -1,3 +1,8 @@ +from functools import partial +import pytest + +from unstructured.cleaners.core import clean_prefix +from unstructured.cleaners.translate import translate_text from unstructured.documents.elements import Element, NoID, Text @@ -9,3 +14,26 @@ def test_text_id(): def test_element_defaults_to_blank_id(): element = Element() assert isinstance(element.id, NoID) + + +def test_text_element_apply_cleaners(): + text_element = Text(text="[1] A Textbook on Crocodile Habitats") + + text_element.apply(partial(clean_prefix, pattern=r"\[\d{1,2}\]")) + assert str(text_element) == "A Textbook on Crocodile Habitats" + + +def test_text_element_apply_multiple_cleaners(): + cleaners = [ + partial(clean_prefix, pattern=r"\[\d{1,2}\]"), + partial(translate_text, target_lang="ru"), + ] + text_element = Text(text="[1] A Textbook on Crocodile Habitats") + text_element.apply(*cleaners) + assert str(text_element) == "Учебник по крокодильным средам обитания" + + +def test_apply_raises_if_func_does_not_produce_string(): + text_element = Text(text="[1] A Textbook on Crocodile Habitats") + with pytest.raises(ValueError): + text_element.apply(lambda s: 1) diff --git a/unstructured/__version__.py 
b/unstructured/__version__.py index dc138860f..9ed9c2a0e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.3.2-dev0" # pragma: no cover +__version__ = "0.3.2" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 4afdf693f..88714fb17 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -1,6 +1,6 @@ from abc import ABC import hashlib -from typing import Union +from typing import Callable, Union class NoID(ABC): @@ -36,6 +36,18 @@ class Text(Element): def __eq__(self, other): return self.text == other.text + def apply(self, *cleaners: Callable): + """Applies a cleaning brick to the text element. The function that's passed in + should take a string as input and produce a string as output.""" + cleaned_text = self.text + for cleaner in cleaners: + cleaned_text = cleaner(cleaned_text) + + if not isinstance(cleaned_text, str): + raise ValueError("Cleaner produced a non-string output.") + + self.text = cleaned_text + class NarrativeText(Text): """NarrativeText is an element consisting of multiple, well-formulated sentences. This