feat: Cleaning bricks to extract text before/after a pattern (#63)

* brick to extract text before * brick for extract text after * tests for extract before and after * updated docs * changelog and bump version * fix typo * fix another typo * positive -> non-negative
2025-12-03 18:49:53 +00:00 · 2022-11-10 16:35:37 -05:00 · 2022-11-10 16:35:37 -05:00 · 300c564c62
commit 300c564c62
parent f3756abc90
5 changed files with 118 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,7 @@
-## 0.2.3-dev0
+## 0.2.3

 * Add cleaning bricks for removing prefixes and postfixes
+* Add cleaning bricks for extracting text before and after a pattern

 ## 0.2.2

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -360,6 +360,52 @@ Examples:
  clean_postfix(text, r"(END|STOP)", ignore_case=True)


+``extract_text_before``
+-----------------------
+
+Extracts text that occurs before the specified pattern.
+
+Options:
+
+* If ``index`` is set, extract before the occurrence of the pattern at that zero-based index (``index=1`` selects the second occurrence). The default is ``0``.
+* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.extract import extract_text_before
+
+  text = "Here I am! STOP Look at me! STOP I'm flying! STOP"
+
+  # Returns "Here I am!"
+  extract_text_before(text, r"STOP")
+
+
+``extract_text_after``
+----------------------
+
+Extracts text that occurs after the specified pattern.
+
+Options:
+
+* If ``index`` is set, extract after the occurrence of the pattern at that zero-based index (``index=1`` selects the second occurrence). The default is ``0``.
+* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.cleaners.extract import extract_text_after
+
+  text = "SPEAKER 1: Look at me, I'm flying!"
+
+  # Returns "Look at me, I'm flying!"
+  extract_text_after(text, r"SPEAKER \d{1}:")
+
+
 #######
 Staging
 #######
--- a/test_unstructured/cleaners/test_extract.py
+++ b/test_unstructured/cleaners/test_extract.py
@ -0,0 +1,23 @@
+import pytest
+
+import unstructured.cleaners.extract as extract
+
+
+def test_get_indexed_match_raises_with_bad_index():
+    # A negative index must be rejected with a ValueError.
+    pytest.raises(ValueError, extract._get_indexed_match, "BLAH BLAH BLAH", "BLAH", -1)
+
+
+def test_get_indexed_match_raises_with_index_too_high():
+    # The text holds only three matches (indices 0-2), so index 4 must raise a ValueError.
+    pytest.raises(ValueError, extract._get_indexed_match, "BLAH BLAH BLAH", "BLAH", 4)
+
+
+def test_extract_text_before():
+    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"  # index 1 is the second "BLAH"
+    assert extract.extract_text_before(sample, "BLAH", index=1) == "Teacher: BLAH"
+
+
+def test_extract_text_after():
+    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"  # "BLAH;" occurs only once
+    assert extract.extract_text_after(sample, "BLAH;", index=0) == "Student: BLAH BLAH BLAH!"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.2.3-dev0"  # pragma: no cover
+__version__ = "0.2.3"  # pragma: no cover
--- a/unstructured/cleaners/extract.py
+++ b/unstructured/cleaners/extract.py
@ -0,0 +1,46 @@
+import re
+
+
+def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
+    """Returns the match for the occurrence of pattern in text at the given zero-based index.
+    Raises a ValueError if index is not a non-negative integer or no such occurrence exists."""
+    if not isinstance(index, int) or index < 0:
+        raise ValueError(f"The index is {index}. Index must be a non-negative integer.")
+
+    largest = None  # stays None when the pattern never matches, keeping the error message safe
+    for largest, result in enumerate(re.finditer(pattern, text)):
+        if largest == index:
+            return result  # requested occurrence found; no need to scan the rest of the text
+
+    found = "No matches were found." if largest is None else f"The largest index was {largest}."
+    raise ValueError(f"Result with index {index} was not found. {found}")
+
+
+def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
+    """Returns the portion of text that precedes a match of the pattern. The match used is
+    the occurrence at the given zero-based index, which defaults to the first one (index 0).
+
+    Input
+    -----
+    strip: If True, removes trailing whitespace from the extracted string
+    """
+    match_start = _get_indexed_match(text, pattern, index).start()
+    prefix = text[:match_start]
+    if strip:
+        return prefix.rstrip()
+    return prefix
+
+
+def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
+    """Extracts text that occurs after the specified pattern. By default, it will use
+    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
+    index.
+
+    Input
+    -----
+    strip: If True, removes leading whitespace from the extracted string
+    """
+    regex_match = _get_indexed_match(text, pattern, index)
+    _, end = regex_match.span()
+    after_text = text[end:]
+    return after_text.lstrip() if strip else after_text