# unstructured/test_unstructured/partition/test_text.py

import os
import pathlib

import pytest

from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import partition_text

# Resolved directory containing this test module; example-document paths are built from it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Elements the tests in this module expect from partitioning the ``fake-text*`` example
# documents. Results are compared with ``==``, so both content and order matter.
EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]


@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename(filename, encoding):
    """Partition an example document by path, forwarding the parametrized ``encoding``."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioned = partition_text(filename=example_path, encoding=encoding)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_filename_default_encoding(filename):
    """Partition UTF-16/UTF-32 example documents by path without passing an encoding."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    partitioned = partition_text(filename=example_path)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("fake-text.txt", "utf-16", UnicodeDecodeError),
        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
    ],
)
def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):
    """Partitioning each fixture with the parametrized ``encoding`` must raise ``error``.

    The path is built before entering ``pytest.raises`` so that only the
    ``partition_text`` call itself can satisfy the expectation.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with pytest.raises(error):
        partition_text(filename=filename, encoding=encoding)


def test_partition_text_from_file():
    """Partition ``fake-text.txt`` from a file object opened in text mode."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(example_path) as handle:
        partitioned = partition_text(file=handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_file_default_encoding(filename):
    """Partition UTF-16/UTF-32 example documents from a text-mode file object."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    # NOTE(review): the handle is opened with the platform default encoding, not the
    # fixture's own encoding -- presumably intentional for this test; confirm.
    with open(example_path) as handle:
        partitioned = partition_text(file=handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


def test_partition_text_from_bytes_file():
    """Partition ``fake-text.txt`` from a file object opened in binary mode."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(example_path, "rb") as raw_handle:
        partitioned = partition_text(file=raw_handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


@pytest.mark.parametrize(
    "filename",
    [
        "fake-text-utf-16.txt",
        "fake-text-utf-16-le.txt",
        "fake-text-utf-32.txt",
    ],
)
def test_partition_text_from_bytes_file_default_encoding(filename):
    """Partition UTF-16/UTF-32 example documents from a binary-mode file object."""
    example_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(example_path, "rb") as raw_handle:
        partitioned = partition_text(file=raw_handle)
    assert len(partitioned) > 0
    assert partitioned == EXPECTED_OUTPUT


def test_partition_text_from_text():
    """Partition the contents of ``fake-text.txt`` passed directly as a string.

    The fixture is read as UTF-8 explicitly rather than with the platform's
    default encoding, so the string handed to ``partition_text`` does not
    depend on the locale of the machine running the tests.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, encoding="utf-8") as f:
        text = f.read()
    elements = partition_text(text=text)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT


def test_partition_text_from_text_works_with_empty_string():
    """An empty ``text`` argument partitions to an empty list."""
    partitioned = partition_text(text="")
    assert partitioned == []


def test_partition_text_raises_with_none_specified():
    """Calling ``partition_text`` with no arguments must raise ``ValueError``."""
    with pytest.raises(ValueError):
        partition_text()


def test_partition_text_raises_with_too_many_specified():
    """Passing both ``filename`` and ``text`` must raise ``ValueError``.

    The fixture is read as UTF-8 explicitly so that a locale-dependent decode
    failure during setup cannot mask the check under test.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
    with open(filename, encoding="utf-8") as f:
        text = f.read()

    with pytest.raises(ValueError):
        partition_text(filename=filename, text=text)


def test_partition_text_captures_everything_even_with_linebreaks():
    """Both lines of an indented, newline-wrapped string come back as separate elements."""
    memo = """
    VERY IMPORTANT MEMO
    DOYLESTOWN, PA 18901
    """
    partitioned = partition_text(text=memo)
    expected = [
        Title(text="VERY IMPORTANT MEMO"),
        Address(text="DOYLESTOWN, PA 18901"),
    ]
    assert partitioned == expected


def test_partition_text_groups_broken_paragraphs():
    """With ``group_broken_paragraphs`` as grouper, hard-wrapped lines merge per paragraph."""
    wrapped = """The big brown fox
was walking down the lane.

At the end of the lane,
the fox met a bear."""

    partitioned = partition_text(text=wrapped, paragraph_grouper=group_broken_paragraphs)
    expected = [
        NarrativeText(text="The big brown fox was walking down the lane."),
        NarrativeText(text="At the end of the lane, the fox met a bear."),
    ]
    assert partitioned == expected


def test_partition_text_extract_regex_metadata():
    """A ``regex_metadata`` pattern's match is recorded on the first element's metadata."""
    utterance = "SPEAKER 1: It is my turn to speak now!"
    patterns = {"speaker": r"SPEAKER \d{1,3}"}

    partitioned = partition_text(text=utterance, regex_metadata=patterns)
    first_metadata = partitioned[0].metadata
    assert first_metadata.regex_metadata == {
        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
    }
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`import os`
			`import pathlib`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`import pytest`

feat: enable grouping broken paragraphs in `partition_text` (#456) * cleaning brick to group broken paragraphs * docs for group_broken_paragraphs * add docs for partition_text with grouper * partition_text and auto with paragraph_grouper * version and changelog * typo in the docs * linting, linting, linting * switch to using regular expressions 2023-04-06 14:35:22 -04:00			`from unstructured.cleaners.core import group_broken_paragraphs`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`from unstructured.documents.elements import Address, ListItem, NarrativeText, Title`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`from unstructured.partition.text import partition_text`

			`DIRECTORY = pathlib.Path(__file__).parent.resolve()`

			`EXPECTED_OUTPUT = [`
			`NarrativeText(text="This is a test document to use for unit tests."),`
fix: cleanup from live `.docx` tests (#177) * add env var for cap threshold; raise default threshold * update docs and tests * added check for ending in a comma * update docs * no caps check for all upper text * capture Text in html and text * check category in Text equality check * lower case all caps before checking for verbs * added check for us city/state/zip * added address type * add address to html * add address to text * fix for text tests; escape for large text segments * refactor regex for readability * update comment * additional test for text with linebreaks * update docs * update changelog * update elements docs * remove old comment * case -> cast * type fix 2023-01-26 10:52:25 -05:00			`Address(text="Doylestown, PA 18901"),`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`Title(text="Important points:"),`
			`ListItem(text="Hamburgers are delicious"),`
			`ListItem(text="Dogs are the best"),`
			`ListItem(text="I love fuzzy blankets"),`
			`]`


Adding optional encoding arg, and text_partition tests (#339) 2023-03-06 15:07:33 -08:00			`@pytest.mark.parametrize(`
			`("filename", "encoding"),`
			`[("fake-text.txt", "utf-8"), ("fake-text.txt", None), ("fake-text-utf-16-be.txt", "utf-16-be")],`
			`)`
			`def test_partition_text_from_filename(filename, encoding):`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)`
			`elements = partition_text(filename=filename, encoding=encoding)`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


Issue/unicode error (#608) This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding. 2023-05-23 15:35:38 -05:00			`@pytest.mark.parametrize(`
			`"filename",`
			`["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],`
			`)`
			`def test_partition_text_from_filename_default_encoding(filename):`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)`
			`elements = partition_text(filename=filename)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


Adding optional encoding arg, and text_partition tests (#339) 2023-03-06 15:07:33 -08:00			`@pytest.mark.parametrize(`
			`("filename", "encoding", "error"),`
			`[`
			`("fake-text.txt", "utf-16", UnicodeDecodeError),`
			`("fake-text-utf-16-be.txt", "utf-16", UnicodeError),`
			`],`
			`)`
			`def test_partition_text_from_filename_raises_econding_error(filename, encoding, error):`
			`with pytest.raises(error):`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)`
			`partition_text(filename=filename, encoding=encoding)`


feat: add support for `.txt` files in `partition` (#150) * added partition_text for auto * rename partition_text tests * bump version and update docs 2023-01-13 16:39:53 -05:00			`def test_partition_text_from_file():`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`with open(filename) as f:`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`elements = partition_text(file=f)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


Issue/unicode error (#608) This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding. 2023-05-23 15:35:38 -05:00			`@pytest.mark.parametrize(`
			`"filename",`
			`["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],`
			`)`
			`def test_partition_text_from_file_default_encoding(filename):`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)`
			`with open(filename) as f:`
			`elements = partition_text(file=f)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


Fixing test for unstructured-api (#425) Ran into an error in tests for unstructured-api (see below for output). Somewhere along the lines we were reading a txt file into bytes and then the PARAGRAPH_PATTERN (a string) was not able to be compared to the bytes file. 2023-04-03 11:12:12 -07:00			`def test_partition_text_from_bytes_file():`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")`
			`with open(filename, "rb") as f:`
			`elements = partition_text(file=f)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


Issue/unicode error (#608) This PR adds functionality to try other common encodings if an error related to the encoding is raised and the user has not specified an encoding. 2023-05-23 15:35:38 -05:00			`@pytest.mark.parametrize(`
			`"filename",`
			`["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],`
			`)`
			`def test_partition_text_from_bytes_file_default_encoding(filename):`
			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)`
			`with open(filename, "rb") as f:`
			`elements = partition_text(file=f)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


feat: add support for `.txt` files in `partition` (#150) * added partition_text for auto * rename partition_text tests * bump version and update docs 2023-01-13 16:39:53 -05:00			`def test_partition_text_from_text():`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`with open(filename) as f:`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`text = f.read()`
			`elements = partition_text(text=text)`
			`assert len(elements) > 0`
			`assert elements == EXPECTED_OUTPUT`


fix: text kwargs no longer fail with empty string (#413) * fix: text kwargs no longer fail with empty string * linting 2023-03-28 17:03:51 -04:00			`def test_partition_text_from_text_works_with_empty_string():`
			`assert partition_text(text="") == []`


feat: add support for `.txt` files in `partition` (#150) * added partition_text for auto * rename partition_text tests * bump version and update docs 2023-01-13 16:39:53 -05:00			`def test_partition_text_raises_with_none_specified():`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`with pytest.raises(ValueError):`
			`partition_text()`


feat: add support for `.txt` files in `partition` (#150) * added partition_text for auto * rename partition_text tests * bump version and update docs 2023-01-13 16:39:53 -05:00			`def test_partition_text_raises_with_too_many_specified():`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")`
Resolve various style issues to improve overall code quality (#282) * Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR. 2023-02-27 17:30:54 +01:00			`with open(filename) as f:`
feat: Add new functionality to parse text and header of emails (#111) * partition_text function 2023-01-09 11:08:08 -06:00			`text = f.read()`

			`with pytest.raises(ValueError):`
			`partition_text(filename=filename, text=text)`
fix: cleanup from live `.docx` tests (#177) * add env var for cap threshold; raise default threshold * update docs and tests * added check for ending in a comma * update docs * no caps check for all upper text * capture Text in html and text * check category in Text equality check * lower case all caps before checking for verbs * added check for us city/state/zip * added address type * add address to html * add address to text * fix for text tests; escape for large text segments * refactor regex for readability * update comment * additional test for text with linebreaks * update docs * update changelog * update elements docs * remove old comment * case -> cast * type fix 2023-01-26 10:52:25 -05:00

			`def test_partition_text_captures_everything_even_with_linebreaks():`
			`text = """`
			`VERY IMPORTANT MEMO`
			`DOYLESTOWN, PA 18901`
			`"""`
			`elements = partition_text(text=text)`
			`assert elements == [`
			`Title(text="VERY IMPORTANT MEMO"),`
			`Address(text="DOYLESTOWN, PA 18901"),`
			`]`
feat: enable grouping broken paragraphs in `partition_text` (#456) * cleaning brick to group broken paragraphs * docs for group_broken_paragraphs * add docs for partition_text with grouper * partition_text and auto with paragraph_grouper * version and changelog * typo in the docs * linting, linting, linting * switch to using regular expressions 2023-04-06 14:35:22 -04:00

			`def test_partition_text_groups_broken_paragraphs():`
			`text = """The big brown fox`
			`was walking down the lane.`

			`At the end of the lane,`
			`the fox met a bear."""`

			`elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)`
			`assert elements == [`
			`NarrativeText(text="The big brown fox was walking down the lane."),`
			`NarrativeText(text="At the end of the lane, the fox met a bear."),`
			`]`
feat: add ability to extract extra metadata with regex (#763) * first pass on regex metadata * fix typing for regex metadata * add dataclass back in * add decorators * fix tests * update docs * add tests for regex metadata * add process metadata to tsv * changelog and version * docs typos * consolidate to using a single kwarg * fix test 2023-06-16 10:10:56 -04:00

			`def test_partition_text_extract_regex_metadata():`
			`text = "SPEAKER 1: It is my turn to speak now!"`

			`elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})`
			`assert elements[0].metadata.regex_metadata == {`
			`"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],`
			`}`