mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

**Summary**

In preparation for pluggable auto-partitioners, simplify metadata processing as discussed.

**Additional Context**

- Pluggable auto-partitioners require partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs a call signature that is known and consistent, essentially `partition_x(filename, *, file, **kwargs)`.
- The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each.
- This is driven by the existence of "delegating" partitioners: those that convert their file type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata post-processing decorators, and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()`, and both `partition_epub()` and `partition_html()` are decorated.
- Double-decorating has been avoided in the past by not sending the proxy partitioner the arguments the metadata decorators are sensitive to. This is obscure, complex to reason about, and error-prone; it is not a viable strategy. The better solution is to leave delegating partitioners undecorated and let the proxy partitioner handle all the metadata (see the sketch below).
- The first step in that direction is to simplify metadata processing by removing unused or unwanted legacy parameters:
  - `date_from_file_object` is a misnomer, because a file object never contains last-modified data.
  - It can never produce useful results in the API, where last-modified information must be provided by `metadata_last_modified`.
  - It is an undocumented parameter, so it is not in use.
  - Using it can produce incorrect metadata.
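The sketch below illustrates the double-decorating problem and the proposed fix. It is a minimal illustration only: `apply_metadata` and the string "elements" are hypothetical stand-ins, not the library's actual decorator or element types; only the call-signature shape and the partitioner names come from the description above.

```python
# Minimal sketch of double-decorating (hypothetical stand-in names throughout).
from __future__ import annotations

from typing import Any, Callable


def apply_metadata(func: Callable[..., list[str]]) -> Callable[..., list[str]]:
    """Stand-in for a non-idempotent metadata post-processing decorator."""

    def wrapper(*args: Any, **kwargs: Any) -> list[str]:
        elements = func(*args, **kwargs)
        # Appending a marker makes the non-idempotence visible.
        return [e + " +metadata" for e in elements]

    return wrapper


@apply_metadata
def partition_html(filename: str | None = None, *, file: Any = None, **kwargs: Any) -> list[str]:
    return ["element from html"]


# Problem: decorating the delegating partitioner too means metadata runs twice.
@apply_metadata
def partition_epub_decorated(
    filename: str | None = None, *, file: Any = None, **kwargs: Any
) -> list[str]:
    html_filename = filename  # stand-in for the EPUB -> HTML conversion
    return partition_html(html_filename, **kwargs)


# Fix: leave the delegating partitioner undecorated; the proxy partitioner
# (partition_html) applies all metadata exactly once.
def partition_epub(filename: str | None = None, *, file: Any = None, **kwargs: Any) -> list[str]:
    html_filename = filename
    return partition_html(html_filename, **kwargs)


assert partition_epub_decorated("x.epub") == ["element from html +metadata +metadata"]
assert partition_epub("x.epub") == ["element from html +metadata"]
```

With the delegating partitioner undecorated, `auto.partition()` no longer has to filter which arguments it forwards just to dodge a second pass of the decorators.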
527 lines
18 KiB
Python
# pyright: reportPrivateUsage=false

from __future__ import annotations

import json
import os
import pathlib
import uuid
from typing import Optional, Type, cast

import pytest
from pytest_mock import MockerFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import (
    _combine_paragraphs_less_than_min,
    _split_content_to_fit_max,
    partition_text,
)
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]

MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
 because it is just being used as an example. Hi. Hello. Howdy. Hola.
 The example is simple and repetitive and long and somewhat boring,
 but it serves a purpose. End.""".replace(
    "\n",
    "",
)

SHORT_PARAGRAPHS = """This is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
"""


@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename(filename: str, encoding: Optional[str]):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(filename=filename_path, encoding=encoding)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == filename
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"text"}


def test_partition_text_from_filename_with_metadata_filename():
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    elements = partition_text(
        filename=filename_path,
        encoding="utf-8",
        metadata_filename="test",
    )
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == "test"


@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_filename_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(filename=filename_path)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == filename


@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("fake-text.txt", "utf-16", UnicodeDecodeError),
        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
    ],
)
def test_partition_text_from_filename_raises_encoding_error(
    filename: str,
    encoding: Optional[str],
    error: Type[BaseException],
):
    with pytest.raises(error):
        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
        partition_text(filename=filename, encoding=encoding)


def test_partition_text_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_from_file_with_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f, metadata_filename="test")
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == "test"


@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_file_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename_path, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_from_bytes_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_bytes_file_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename_path, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_text_partition_element_metadata_user_provided_languages():
    filename = "example-docs/book-war-and-peace-1p.txt"
    elements = partition_text(filename=filename, strategy="fast", languages=["en"])
    assert elements[0].metadata.languages == ["eng"]


def test_partition_text_from_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        text = f.read()
    elements = partition_text(text=text)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_from_text_works_with_empty_string():
    assert partition_text(text="") == []


def test_partition_text_raises_with_none_specified():
    with pytest.raises(ValueError):
        partition_text()


def test_partition_text_raises_with_too_many_specified():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        text = f.read()

    with pytest.raises(ValueError):
        partition_text(filename=filename, text=text)


def test_partition_text_captures_everything_even_with_linebreaks():
    text = """
    VERY IMPORTANT MEMO
    DOYLESTOWN, PA 18901
    """
    elements = partition_text(text=text)
    assert elements == [
        Title(text="VERY IMPORTANT MEMO"),
        Address(text="DOYLESTOWN, PA 18901"),
    ]
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_groups_broken_paragraphs():
    text = """The big brown fox
was walking down the lane.

At the end of the lane,
the fox met a bear."""

    elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
    assert elements == [
        NarrativeText(text="The big brown fox was walking down the lane."),
        NarrativeText(text="At the end of the lane, the fox met a bear."),
    ]
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_extract_regex_metadata():
    text = "SPEAKER 1: It is my turn to speak now!"

    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
    assert elements[0].metadata.regex_metadata == {
        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
    }
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_splits_long_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    assert len(elements) > 0
    assert elements[0].text.startswith("Iwan Roberts")
    assert elements[-1].text.endswith("External links")


def test_partition_text_splits_long_text_max_partition():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    elements_max_part = partition_text(filename=filename, max_partition=500)
    # NOTE(klaijan) - I edited the operation here from < to <=
    # Please revert back if this does not make sense
    assert len(elements) <= len(elements_max_part)
    for element in elements_max_part:
        assert len(element.text) <= 500

    # Make sure combined text is all the same
    assert " ".join([el.text for el in elements]) == " ".join([el.text for el in elements_max_part])


def test_partition_text_splits_max_min_partition():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500)
    for i, element in enumerate(elements_max_part):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(elements_max_part) - 1:
            assert len(element.text) <= 1500
            assert len(element.text) >= 1000

    import re

    from unstructured.nlp.patterns import BULLETS_PATTERN

    # NOTE(klaijan) - clean the asterisk out of both texts.
    # The `elements` was partitioned by new line, which causes line 56 (shown below)
    # "*Club domestic league appearances and goals"
    # to be considered a bullet point by the function is_bulleted_text,
    # so the asterisk was removed from that paragraph,
    # whereas `elements_max_part` was partitioned differently and thus none of its lines
    # starts with any of the BULLETS_PATTERN.

    # TODO(klaijan) - when editing the function partition_text to support non-bullet paragraphs
    # that start with bullet-like BULLETS_PATTERN, remove the re.sub part from the assert below.

    # Make sure combined text is all the same
    assert re.sub(BULLETS_PATTERN, "", " ".join([el.text for el in elements])) == re.sub(
        BULLETS_PATTERN,
        "",
        " ".join([el.text for el in elements_max_part]),
    )


def test_partition_text_min_max():
    segments = partition_text(text=SHORT_PARAGRAPHS, min_partition=6)
    for i, segment in enumerate(segments):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(segments) - 1:
            assert len(segment.text) >= 6

    segments = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7)
    for i, segment in enumerate(segments):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(segments) - 1:
            assert len(segment.text) >= 7
        assert len(segment.text) <= 20


def test_split_content_to_fit_max():
    segments = _split_content_to_fit_max(
        content=MIN_MAX_TEXT,
        max_partition=75,
    )
    assert segments == [
        "This is a story.",
        "This is a story that doesn't matter because",
        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
        "The example is simple and repetitive and long",
        "and somewhat boring, but it serves a purpose. End.",
    ]


def test_combine_paragraphs_less_than_min():
    segments = _combine_paragraphs_less_than_min(
        cast(list[str], SHORT_PARAGRAPHS.split("\n\n")),
        max_partition=1500,
        min_partition=7,
    )
    assert len(segments) < len(SHORT_PARAGRAPHS)


def test_partition_text_doesnt_get_page_breaks():
    text = "--------------------"
    elements = partition_text(text=text)
    assert len(elements) == 1
    assert elements[0].text == text
    assert not isinstance(elements[0], ListItem)


@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename_exclude_metadata(filename: str, encoding: Optional[str]):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(
        filename=filename,
        encoding=encoding,
        include_metadata=False,
    )
    for i in range(len(elements)):
        assert elements[i].metadata.to_dict() == {}


def test_partition_text_from_file_exclude_metadata():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f, include_metadata=False)
    for i in range(len(elements)):
        assert elements[i].metadata.to_dict() == {}


# -- .metadata.last_modified ---------------------------------------------------------------------


def test_partition_text_from_file_path_gets_last_modified_from_filesystem(mocker: MockerFixture):
    filesystem_last_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_text(example_doc_path("fake-text.txt"))

    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)


def test_partition_text_from_file_gets_last_modified_None():
    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition_text(file=f)

    assert all(e.metadata.last_modified is None for e in elements)


def test_partition_text_from_text_gets_last_modified_None():
    with open(example_doc_path("fake-text.txt")) as f:
        text = f.read()

    elements = partition_text(text=text)

    assert all(e.metadata.last_modified is None for e in elements)


def test_partition_text_from_file_path_prefers_metadata_last_modified(mocker: MockerFixture):
    filesystem_last_modified = "2029-07-05T09:24:28"
    metadata_last_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_text(
        example_doc_path("fake-text.txt"), metadata_last_modified=metadata_last_modified
    )

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


def test_partition_text_from_file_prefers_metadata_last_modified():
    metadata_last_modified = "2020-07-05T09:24:28"
    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition_text(file=f, metadata_last_modified=metadata_last_modified)

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


def test_partition_text_from_text_prefers_metadata_last_modified():
    metadata_last_modified = "2020-07-05T09:24:28"
    with open(example_doc_path("fake-text.txt")) as f:
        text = f.read()

    elements = partition_text(text=text, metadata_last_modified=metadata_last_modified)

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


# ------------------------------------------------------------------------------------------------


def test_Text_element_assigns_id_hashes_that_are_unique_and_deterministic():
    ids = [element.id for element in partition_text(text="hello\nhello\nhello")]
    assert ids == [
        "8657c0ec31a4cfc822f6cd4a5684cafd",
        "72aefb4a12be063ad160931fdb380163",
        "ba8c1a216ca585aecdd365a72e6124f1",
    ]


def test_Text_element_assigns_UUID_when_unique_element_ids_is_True():
    elements = partition_text(text="hello\nhello\nhello", unique_element_ids=True)

    for element in elements:
        assert uuid.UUID(element.id, version=4)

        # Test that the element is JSON serializable. This should run without an error
        json.dumps(element.to_dict())


@pytest.mark.parametrize(
    ("file_name", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_with_json(file_name: str, encoding: str | None):
    elements = partition_text(example_doc_path(file_name), encoding=encoding)
    assert_round_trips_through_JSON(elements)


def test_add_chunking_strategy_on_partition_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    chunk_elements = partition_text(filename, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_partition_text_element_metadata_has_languages():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_text_respects_detect_language_per_element():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "eng_spa_mult.txt")
    elements = partition_text(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_partition_text_respects_languages_arg():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename, languages=["deu"])
    assert elements[0].metadata.languages == ["deu"]


def test_partition_text_element_metadata_raises_TypeError():
    with pytest.raises(TypeError):
        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
        partition_text(filename=filename, languages="eng")  # type: ignore


def test_partition_text_detects_more_than_3_languages():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "UDHR_first_article_all.txt")
    elements = partition_text(filename=filename, detect_language_per_element=True)
    langs = list(
        {element.metadata.languages[0] for element in elements if element.metadata.languages},
    )
    assert len(langs) > 10