diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14cb51c24..89023b915 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.2.3-dev0
+## 0.2.3
 
 * Add cleaning bricks for removing prefixes and postfixes
+* Add cleaning bricks for extracting text before and after a pattern
 
 ## 0.2.2
 
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index 628b58f58..5dd8aaaba 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -360,6 +360,52 @@ Examples:
   clean_postfix(text, r"(END|STOP)", ignore_case=True)
 
 
+``extract_text_before``
+-----------------------
+
+Extracts text that occurs before the specified pattern.
+
+Options:
+
+* If ``index`` is set, extract before the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
+* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.extract import extract_text_before
+
+  text = "Here I am! STOP Look at me! STOP I'm flying! STOP"
+
+  # Returns "Here I am!"
+  extract_text_before(text, r"STOP")
+
+
+``extract_text_after``
+----------------------
+
+Extracts text that occurs after the specified pattern.
+
+Options:
+
+* If ``index`` is set, extract after the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
+* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.extract import extract_text_after
+
+  text = "SPEAKER 1: Look at me, I'm flying!"
+
+  # Returns "Look at me, I'm flying!"
+  extract_text_after(text, r"SPEAKER \d{1}:")
+
+
 #######
 Staging
 #######
diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py
new file mode 100644
index 000000000..1a00ec542
--- /dev/null
+++ b/test_unstructured/cleaners/test_extract.py
@@ -0,0 +1,23 @@
+import pytest
+
+import unstructured.cleaners.extract as extract
+
+
+def test_get_indexed_match_raises_with_bad_index():
+    with pytest.raises(ValueError):
+        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", -1)
+
+
+def test_get_indexed_match_raises_with_index_too_high():
+    with pytest.raises(ValueError):
+        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", 4)
+
+
+def test_extract_text_before():
+    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
+    assert extract.extract_text_before(text, "BLAH", 1) == "Teacher: BLAH"
+
+
+def test_extract_text_after():
+    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
+    assert extract.extract_text_after(text, "BLAH;", 0) == "Student: BLAH BLAH BLAH!"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 575b4d957..1cc734ad3 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.3-dev0"  # pragma: no cover
+__version__ = "0.2.3"  # pragma: no cover
diff --git a/unstructured/cleaners/extract.py b/unstructured/cleaners/extract.py
new file mode 100644
index 000000000..db396886f
--- /dev/null
+++ b/unstructured/cleaners/extract.py
@@ -0,0 +1,55 @@
+import re
+
+
+def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
+    """Returns the match object for the (index + 1)th occurrence of the pattern in the text.
+
+    Raises
+    ------
+    ValueError: If index is not a non-negative integer, or if the pattern occurs fewer
+        than (index + 1) times in the text.
+    """
+    if not isinstance(index, int) or index < 0:
+        raise ValueError(f"The index is {index}. Index must be a non-negative integer.")
+
+    # Start at -1 so the error below is well defined even when the pattern never matches
+    # and the loop body does not run.
+    i = -1
+    for i, regex_match in enumerate(re.finditer(pattern, text)):
+        if i == index:
+            # Stop scanning as soon as the requested occurrence is found.
+            return regex_match
+
+    if i < 0:
+        raise ValueError(f"Result with index {index} was not found. The pattern did not match.")
+    raise ValueError(f"Result with index {index} was not found. The largest index was {i}.")
+
+
+def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
+    """Extracts text that occurs before the specified pattern. By default, it will use
+    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
+    index.
+
+    Input
+    -----
+    strip: If True, removes trailing whitespace from the extracted string
+    """
+    regex_match = _get_indexed_match(text, pattern, index)
+    start, _ = regex_match.span()
+    before_text = text[:start]
+    return before_text.rstrip() if strip else before_text
+
+
+def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
+    """Extracts text that occurs after the specified pattern. By default, it will use
+    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
+    index.
+
+    Input
+    -----
+    strip: If True, removes leading whitespace from the extracted string
+    """
+    regex_match = _get_indexed_match(text, pattern, index)
+    _, end = regex_match.span()
+    after_text = text[end:]
+    return after_text.lstrip() if strip else after_text