mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-03 18:49:53 +00:00
feat: Cleaning bricks to extract text before/after a pattern (#63)
* brick to extract text before * brick for extract text after * tests for extract before and after * updated docs * changelog and bump version * fix typo * fix another typo * positive -> non-negative
This commit is contained in:
parent
f3756abc90
commit
300c564c62
@ -1,6 +1,7 @@
|
||||
## 0.2.3-dev0
|
||||
## 0.2.3
|
||||
|
||||
* Add cleaning bricks for removing prefixes and postfixes
|
||||
* Add cleaning bricks for extracting text before and after a pattern
|
||||
|
||||
## 0.2.2
|
||||
|
||||
|
||||
@ -360,6 +360,52 @@ Examples:
|
||||
clean_postfix(text, r"(END|STOP)", ignore_case=True)
|
||||
|
||||
|
||||
``extract_text_before``
|
||||
-----------------------
|
||||
|
||||
Extracts text that occurs before the specified pattern.
|
||||
|
||||
Options:
|
||||
|
||||
* If ``index`` is set, extract before the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
|
||||
* Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``.
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_text_before
|
||||
|
||||
text = "Here I am! STOP Look at me! STOP I'm flying! STOP"
|
||||
|
||||
# Returns "Here I am!"
|
||||
extract_text_before(text, r"STOP")
|
||||
|
||||
|
||||
``extract_text_after``
|
||||
----------------------
|
||||
|
||||
Extracts text that occurs after the specified pattern.
|
||||
|
||||
Options:
|
||||
|
||||
* If ``index`` is set, extract after the ``(index + 1)``\ th occurrence of the pattern. The default is ``0``.
|
||||
* Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``.
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_text_after
|
||||
|
||||
text = "SPEAKER 1: Look at me, I'm flying!"
|
||||
|
||||
# Returns "Look at me, I'm flying!"
|
||||
extract_text_after(text, r"SPEAKER \d{1}:")
|
||||
|
||||
|
||||
#######
|
||||
Staging
|
||||
#######
|
||||
|
||||
23
test_unstructured/cleaners/test_extract.py
Normal file
23
test_unstructured/cleaners/test_extract.py
Normal file
@ -0,0 +1,23 @@
|
||||
import pytest
|
||||
|
||||
import unstructured.cleaners.extract as extract
|
||||
|
||||
|
||||
def test_get_indexed_match_raises_with_bad_index():
    """A negative index is rejected with a ValueError."""
    negative_index = -1
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", negative_index)
|
||||
|
||||
|
||||
def test_get_indexed_match_raises_with_index_too_high():
    """Requesting index 4 from a text with only three matches raises ValueError."""
    text, pattern = "BLAH BLAH BLAH", "BLAH"
    with pytest.raises(ValueError):
        extract._get_indexed_match(text, pattern, 4)
|
||||
|
||||
|
||||
def test_extract_text_before():
    """Index 1 cuts at the second "BLAH", so the first one stays in the result."""
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    extracted = extract.extract_text_before(text, "BLAH", 1)
    assert extracted == "Teacher: BLAH"
|
||||
|
||||
|
||||
def test_extract_text_after():
    """Everything following the first "BLAH;" is returned with leading whitespace removed."""
    text = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    expected = "Student: BLAH BLAH BLAH!"
    assert extract.extract_text_after(text, "BLAH;", 0) == expected
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.2.3-dev0" # pragma: no cover
|
||||
__version__ = "0.2.3" # pragma: no cover
|
||||
|
||||
46
unstructured/cleaners/extract.py
Normal file
46
unstructured/cleaners/extract.py
Normal file
@ -0,0 +1,46 @@
|
||||
import re
|
||||
|
||||
|
||||
def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
|
||||
if not isinstance(index, int) or index < 0:
|
||||
raise ValueError(f"The index is {index}. Index must be a non-negative integer.")
|
||||
|
||||
regex_match = None
|
||||
for i, result in enumerate(re.finditer(pattern, text)):
|
||||
if i == index:
|
||||
regex_match = result
|
||||
|
||||
if regex_match is None:
|
||||
raise ValueError(f"Result with index {index} was not found. The largest index was {i}.")
|
||||
|
||||
return regex_match
|
||||
|
||||
|
||||
def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Returns the part of the text that precedes an occurrence of the pattern. The
    occurrence is selected with the index kwarg; the default (index 0) is the first one.

    Input
    -----
    text: The string to search
    pattern: The pattern that marks where the extracted text ends
    index: Zero-based position of the occurrence to cut at
    strip: If True, removes trailing whitespace from the extracted string
    """
    cut_at = _get_indexed_match(text, pattern, index).start()
    prefix = text[:cut_at]
    if strip:
        return prefix.rstrip()
    return prefix
|
||||
|
||||
|
||||
def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index.

    Input
    -----
    text: The string to search
    pattern: The pattern that marks where the extracted text begins
    index: Zero-based position of the occurrence to cut at
    strip: If True, removes leading whitespace from the extracted string
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Only the end offset matters: everything after the match is kept.
    _, end = regex_match.span()
    after_text = text[end:]
    return after_text.lstrip() if strip else after_text
|
||||
Loading…
x
Reference in New Issue
Block a user