mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	feat: apply method to apply cleaning bricks to elements (#102)
				
					
				
			* add apply method to apply cleaners to elements * bump version * add check for string output * documentations for the apply method * change interface to *cleaners
This commit is contained in:
		
							parent
							
								
									b1cce16c16
								
							
						
					
					
						commit
						1d68bb2482
					
				@ -1,6 +1,7 @@
 | 
			
		||||
## 0.3.2-dev0
 | 
			
		||||
## 0.3.2
 | 
			
		||||
 | 
			
		||||
* Added `translate_text` brick for translating text between languages
 | 
			
		||||
* Added an `apply` method to make it easier to apply cleaners to elements
 | 
			
		||||
 | 
			
		||||
## 0.3.1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -11,3 +11,35 @@ elements.
 | 
			
		||||
* ``NarrativeText`` - Sections of a document that include well-formed prose. Sub-class of ``Text``.
 | 
			
		||||
* ``Title`` - Headings and sub-headings within a document. Sub-class of ``Text``.
 | 
			
		||||
* ``ListItem`` - A text element that is part of an ordered or unordered list. Sub-class of ``Text``.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#########################################
 | 
			
		||||
Applying Cleaning Bricks to Text Elements
 | 
			
		||||
#########################################
 | 
			
		||||
 | 
			
		||||
You can apply cleaning bricks to a text element by using the ``apply`` method. The
 | 
			
		||||
apply method accepts any function that takes a string as input and produces a string
 | 
			
		||||
as output. Use the ``partial`` function from ``functools`` if you need to set additional
 | 
			
		||||
args or kwargs for your cleaning brick. The ``apply`` method will accept either a single
 | 
			
		||||
cleaner or several cleaners passed as separate positional arguments; unpack a list with ``*cleaners``.
 | 
			
		||||
 | 
			
		||||
Examples:
 | 
			
		||||
 | 
			
		||||
.. code:: python
 | 
			
		||||
 | 
			
		||||
  from functools import partial
 | 
			
		||||
 | 
			
		||||
  from unstructured.cleaners.core import clean_prefix
 | 
			
		||||
  from unstructured.cleaners.translate import translate_text
 | 
			
		||||
  from unstructured.documents.elements import ListItem
 | 
			
		||||
 | 
			
		||||
  cleaners = [
 | 
			
		||||
    partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
 | 
			
		||||
    partial(translate_text, target_lang="ru"),
 | 
			
		||||
  ]
 | 
			
		||||
 | 
			
		||||
  item = ListItem(text="[1] A Textbook on Crocodile Habitats")
 | 
			
		||||
  item.apply(*cleaners)
 | 
			
		||||
 | 
			
		||||
  # The output will be: Учебник по крокодильным средам обитания
 | 
			
		||||
  print(item)
 | 
			
		||||
@ -1,3 +1,8 @@
 | 
			
		||||
from functools import partial
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
from unstructured.cleaners.core import clean_prefix
 | 
			
		||||
from unstructured.cleaners.translate import translate_text
 | 
			
		||||
from unstructured.documents.elements import Element, NoID, Text
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -9,3 +14,26 @@ def test_text_id():
 | 
			
		||||
def test_element_defaults_to_blank_id():
    """An ``Element`` built with no arguments carries a ``NoID`` placeholder id."""
    default_id = Element().id
    assert isinstance(default_id, NoID)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_text_element_apply_cleaners():
    """A single cleaner passed to ``apply`` replaces the element's text."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation)

    assert str(element) == "A Textbook on Crocodile Habitats"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_text_element_apply_multiple_cleaners():
    """Several cleaners handed to ``apply`` are chained in the order given."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation, to_russian)

    assert str(element) == "Учебник по крокодильным средам обитания"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` raises ``ValueError`` when a cleaner's result is not a ``str``."""

    def returns_int(_text):
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.3.2-dev0"  # pragma: no cover
 | 
			
		||||
__version__ = "0.3.2"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,6 @@
 | 
			
		||||
from abc import ABC
 | 
			
		||||
import hashlib
 | 
			
		||||
from typing import Union
 | 
			
		||||
from typing import Callable, Union
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class NoID(ABC):
 | 
			
		||||
@ -36,6 +36,18 @@ class Text(Element):
 | 
			
		||||
    def __eq__(self, other):
        """Compare two elements by their ``text`` attribute.

        Returns ``NotImplemented`` when ``other`` has no ``text`` attribute, so
        that comparing against an unrelated object (for example ``None``)
        evaluates to ``False`` instead of raising ``AttributeError``.
        """
        try:
            other_text = other.text
        except AttributeError:
            # Let Python try the reflected comparison / identity fallback.
            return NotImplemented
        return self.text == other_text
 | 
			
		||||
 | 
			
		||||
    def apply(self, *cleaners: Callable):
        """Run one or more cleaning bricks over the element's text, in order.

        Each cleaner receives the output of the previous one (the first receives
        the current ``self.text``). ``self.text`` is only replaced once every
        cleaner has run successfully, so a failure leaves the element untouched.

        Args:
            *cleaners: Callables that take a string and return a string. Use
                ``functools.partial`` to pre-bind extra arguments.

        Raises:
            ValueError: If any cleaner returns something other than a ``str``.
        """
        cleaned_text = self.text
        for cleaner in cleaners:
            cleaned_text = cleaner(cleaned_text)
            # Validate after every step: otherwise a non-string would be fed to
            # the next cleaner and could slip through unnoticed.
            if not isinstance(cleaned_text, str):
                raise ValueError("Cleaner produced a non-string output.")

        # Also covers the no-cleaner case, where self.text itself is checked.
        if not isinstance(cleaned_text, str):
            raise ValueError("Cleaner produced a non-string output.")

        self.text = cleaned_text
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class NarrativeText(Text):
 | 
			
		||||
    """NarrativeText is an element consisting of multiple, well-formulated sentences. This
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user