mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 19:42:27 +00:00
feat: Cleaning bricks to extract text before/after a pattern (#63)
* brick to extract text before * brick for extract text after * tests for extract before and after * updated docs * changelog and bump version * fix typo * fix another typo * positive -> non-negative
This commit is contained in:
parent
f3756abc90
commit
300c564c62
@ -1,6 +1,7 @@
|
|||||||
## 0.2.3-dev0
|
## 0.2.3
|
||||||
|
|
||||||
* Add cleaning bricks for removing prefixes and postfixes
|
* Add cleaning bricks for removing prefixes and postfixes
|
||||||
|
* Add cleaning bricks for extracting text before and after a pattern
|
||||||
|
|
||||||
## 0.2.2
|
## 0.2.2
|
||||||
|
|
||||||
|
|||||||
@ -360,6 +360,52 @@ Examples:
|
|||||||
clean_postfix(text, r"(END|STOP)", ignore_case=True)
|
clean_postfix(text, r"(END|STOP)", ignore_case=True)
|
||||||
|
|
||||||
|
|
||||||
|
``extract_text_before``
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
Extracts text that occurs before the specified pattern.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
|
||||||
|
* If ``index`` is set, extract before the ``(index + 1)``th occurrence of the pattern. The default is ``0``.
|
||||||
|
* Strips trailing whitespace from the extracted text if ``strip`` is set to ``True``. The default is ``True``.
|
||||||
|
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_text_before
|
||||||
|
|
||||||
|
text = "Here I am! STOP Look at me! STOP I'm flying! STOP"
|
||||||
|
|
||||||
|
# Returns "Here I am!"
|
||||||
|
extract_text_before(text, r"STOP")
|
||||||
|
|
||||||
|
|
||||||
|
``extract_text_after``
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Extracts text that occurs after the specified pattern.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
|
||||||
|
* If ``index`` is set, extract after the ``(index + 1)``th occurrence of the pattern. The default is ``0``.
|
||||||
|
* Strips leading whitespace from the extracted text if ``strip`` is set to ``True``. The default is ``True``.
|
||||||
|
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_text_after
|
||||||
|
|
||||||
|
text = "SPEAKER 1: Look at me, I'm flying!"
|
||||||
|
|
||||||
|
# Returns "Look at me, I'm flying!"
|
||||||
|
extract_text_after(text, r"SPEAKER \d{1}:")
|
||||||
|
|
||||||
|
|
||||||
#######
|
#######
|
||||||
Staging
|
Staging
|
||||||
#######
|
#######
|
||||||
|
|||||||
23
test_unstructured/cleaners/test_extract.py
Normal file
23
test_unstructured/cleaners/test_extract.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
import unstructured.cleaners.extract as extract
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_indexed_match_raises_with_bad_index():
    """A negative index is rejected with a ValueError."""
    haystack, needle, negative_index = "BLAH BLAH BLAH", "BLAH", -1
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, negative_index)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_indexed_match_raises_with_index_too_high():
    """An index beyond the number of matches is rejected with a ValueError."""
    # The haystack holds only three matches (indices 0-2), so index 4 is out of range.
    haystack, needle, out_of_range_index = "BLAH BLAH BLAH", "BLAH", 4
    with pytest.raises(ValueError):
        extract._get_indexed_match(haystack, needle, out_of_range_index)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_text_before():
    """Text preceding the second match (index 1) is returned with trailing whitespace removed."""
    dialogue = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    expected = "Teacher: BLAH"
    assert extract.extract_text_before(dialogue, "BLAH", 1) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_text_after():
    """Text following the first match (index 0) is returned with leading whitespace removed."""
    dialogue = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    expected = "Student: BLAH BLAH BLAH!"
    assert extract.extract_text_after(dialogue, "BLAH;", 0) == expected
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.2.3-dev0" # pragma: no cover
|
__version__ = "0.2.3" # pragma: no cover
|
||||||
|
|||||||
46
unstructured/cleaners/extract.py
Normal file
46
unstructured/cleaners/extract.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
|
||||||
|
if not isinstance(index, int) or index < 0:
|
||||||
|
raise ValueError(f"The index is {index}. Index must be a non-negative integer.")
|
||||||
|
|
||||||
|
regex_match = None
|
||||||
|
for i, result in enumerate(re.finditer(pattern, text)):
|
||||||
|
if i == index:
|
||||||
|
regex_match = result
|
||||||
|
|
||||||
|
if regex_match is None:
|
||||||
|
raise ValueError(f"Result with index {index} was not found. The largest index was {i}.")
|
||||||
|
|
||||||
|
return regex_match
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts the text that occurs before the specified pattern. By default, the first
    occurrence of the pattern (index 0) is used. Use the index kwarg to choose a different
    occurrence.

    Input
    -----
    text: The string to extract from
    pattern: The regular expression marking where the extracted text ends
    index: Zero-based occurrence of the pattern to cut at
    strip: If True, removes trailing whitespace from the extracted string
    """
    match_start = _get_indexed_match(text, pattern, index).start()
    prefix = text[:match_start]
    if strip:
        return prefix.rstrip()
    return prefix
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts the text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    occurrence.

    Input
    -----
    text: The string to extract from
    pattern: The regular expression marking where the extracted text begins
    index: Zero-based occurrence of the pattern to cut at
    strip: If True, removes leading whitespace from the extracted string
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Everything from the end of the matched span onwards is the "after" text.
    _, end = regex_match.span()
    after_text = text[end:]
    return after_text.lstrip() if strip else after_text
|
||||||
Loading…
x
Reference in New Issue
Block a user