mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 23:47:36 +00:00
feat: apply method to apply cleaning bricks to elements (#102)
* add apply method to apply cleaners to elements * bump version * add check for string output * documentations for the apply method * change interface to *cleaners
This commit is contained in:
parent
b1cce16c16
commit
1d68bb2482
@ -1,6 +1,7 @@
|
|||||||
## 0.3.2-dev0
|
## 0.3.2
|
||||||
|
|
||||||
* Added `translate_text` brick for translating text between languages
|
* Added `translate_text` brick for translating text between languages
|
||||||
|
* Added an `apply` method to make it easier to apply cleaners to elements
|
||||||
|
|
||||||
## 0.3.1
|
## 0.3.1
|
||||||
|
|
||||||
|
|||||||
@ -11,3 +11,35 @@ elements.
|
|||||||
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
|
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
|
||||||
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
|
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
|
||||||
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
|
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
|
||||||
|
|
||||||
|
|
||||||
|
#########################################
|
||||||
|
Applying Cleaning Bricks to Text Elements
|
||||||
|
#########################################
|
||||||
|
|
||||||
|
You can apply cleaning bricks to a text element by using the ``apply`` method. The
|
||||||
|
apply method accepts any function that takes a string as input and produces a string
|
||||||
|
as output. Use the ``partial`` function from ``functools`` if you need to set additional
|
||||||
|
args or kwargs for your cleaning brick. The ``apply`` method accepts one or more
|
||||||
|
cleaners as positional arguments; unpack a list of cleaners with ``*cleaners``.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_prefix
|
||||||
|
from unstructured.cleaners.translate import translate_text
|
||||||
|
from unstructured.documents.elements import ListItem
|
||||||
|
|
||||||
|
cleaners = [
|
||||||
|
partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
|
||||||
|
partial(translate_text, target_lang="ru"),
|
||||||
|
]
|
||||||
|
|
||||||
|
item = ListItem(text="[1] A Textbook on Crocodile Habitats")
|
||||||
|
item.apply(*cleaners)
|
||||||
|
|
||||||
|
# The output will be: Учебник по крокодильным средам обитания
|
||||||
|
print(item)
|
||||||
@ -1,3 +1,8 @@
|
|||||||
|
from functools import partial
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_prefix
|
||||||
|
from unstructured.cleaners.translate import translate_text
|
||||||
from unstructured.documents.elements import Element, NoID, Text
|
from unstructured.documents.elements import Element, NoID, Text
|
||||||
|
|
||||||
|
|
||||||
@ -9,3 +14,26 @@ def test_text_id():
|
|||||||
def test_element_defaults_to_blank_id():
    """An ``Element`` built with no arguments carries a ``NoID`` instance as its id."""
    default_id = Element().id
    assert isinstance(default_id, NoID)
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_element_apply_cleaners():
    """A single cleaner passed to ``apply`` rewrites the element's text in place."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation)

    assert str(element) == "A Textbook on Crocodile Habitats"
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_element_apply_multiple_cleaners():
    """Two cleaners passed to ``apply`` are both applied: prefix removal, then translation."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")

    element.apply(strip_citation, to_russian)

    assert str(element) == "Учебник по крокодильным средам обитания"
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` raises ``ValueError`` when a cleaner returns something other than a string."""
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    with pytest.raises(ValueError):
        element.apply(lambda _text: 1)
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.3.2-dev0" # pragma: no cover
|
__version__ = "0.3.2" # pragma: no cover
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
from abc import ABC
|
from abc import ABC
|
||||||
import hashlib
|
import hashlib
|
||||||
from typing import Union
|
from typing import Callable, Union
|
||||||
|
|
||||||
|
|
||||||
class NoID(ABC):
|
class NoID(ABC):
|
||||||
@ -36,6 +36,18 @@ class Text(Element):
|
|||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return self.text == other.text
|
return self.text == other.text
|
||||||
|
|
||||||
|
def apply(self, *cleaners: Callable):
    """Apply one or more cleaning bricks to the text element, in order.

    Each cleaner is called with the output of the previous one, starting from
    ``self.text``. Use ``functools.partial`` to bind any extra args or kwargs
    a cleaning brick needs. With no cleaners the text is left unchanged.

    Args:
        *cleaners: Callables that take a string as input and produce a string
            as output.

    Raises:
        ValueError: If any cleaner returns something other than a string.
    """
    cleaned_text = self.text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)
        # Validate after every step so a misbehaving cleaner is reported
        # directly instead of feeding a non-string into the next cleaner.
        if not isinstance(cleaned_text, str):
            raise ValueError("Cleaner produced a non-string output.")

    # Assign only after every cleaner has succeeded, so a failure leaves the
    # element's text untouched.
    self.text = cleaned_text
|
||||||
|
|
||||||
|
|
||||||
class NarrativeText(Text):
|
class NarrativeText(Text):
|
||||||
"""NarrativeText is an element consisting of multiple, well-formulated sentences. This
|
"""NarrativeText is an element consisting of multiple, well-formulated sentences. This
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user