# pyright: reportPrivateUsage=false
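"""Unit tests for unstructured.partition.text.partition_text and its private helper functions."""
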
from __future__ import annotations

import json
import os
import pathlib
import uuid
from typing import Optional, Type, cast

import pytest
from pytest_mock import MockerFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import group_broken_paragraphs
from unstructured.documents.elements import Address, ListItem, NarrativeText, Title
from unstructured.partition.text import (
    _combine_paragraphs_less_than_min,
    _split_content_to_fit_max,
    partition_text,
)
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")

EXPECTED_OUTPUT = [
    NarrativeText(text="This is a test document to use for unit tests."),
    Address(text="Doylestown, PA 18901"),
    Title(text="Important points:"),
    ListItem(text="Hamburgers are delicious"),
    ListItem(text="Dogs are the best"),
    ListItem(text="I love fuzzy blankets"),
]

MIN_MAX_TEXT = """This is a story. This is a story that doesn't matter
|
|
|
|
because it is just being used as an example. Hi. Hello. Howdy. Hola.
|
|
|
|
The example is simple and repetitive and long and somewhat boring,
|
|
|
|
but it serves a purpose. End.""".replace(
|
|
|
|
"\n",
|
|
|
|
"",
|
|
|
|
)
|
|
|
|
|
|
|
|
SHORT_PARAGRAPHS = """This is a story.
|
|
|
|
|
|
|
|
This is a story that doesn't matter because it is just being used as an example.
|
|
|
|
|
|
|
|
Hi.
|
|
|
|
|
|
|
|
Hello.
|
|
|
|
|
|
|
|
Howdy.
|
|
|
|
|
|
|
|
Hola.
|
|
|
|
|
|
|
|
The example is simple and repetitive and long and somewhat boring, but it serves a purpose.
|
|
|
|
|
|
|
|
End.
|
|
|
|
"""
|
|
|
|
|
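# NOTE: MIN_MAX_TEXT is collapsed to a single long paragraph by the .replace("\n", "") above,
# while SHORT_PARAGRAPHS keeps its blank-line paragraph breaks; the min/max partition tests
# below rely on that difference.
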
@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename(filename: str, encoding: Optional[str]):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(filename=filename_path, encoding=encoding)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == filename
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"text"}


def test_partition_text_from_filename_with_metadata_filename():
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    elements = partition_text(
        filename=filename_path,
        encoding="utf-8",
        metadata_filename="test",
    )
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == "test"

@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_filename_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(filename=filename_path)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == filename


@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("fake-text.txt", "utf-16", UnicodeDecodeError),
        ("fake-text-utf-16-be.txt", "utf-16", UnicodeError),
    ],
)
def test_partition_text_from_filename_raises_encoding_error(
    filename: str,
    encoding: Optional[str],
    error: Type[BaseException],
):
    with pytest.raises(error):
        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
        partition_text(filename=filename, encoding=encoding)

def test_partition_text_from_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_from_file_with_metadata_filename():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f, metadata_filename="test")
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename == "test"


@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_file_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename_path, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None

def test_partition_text_from_bytes_file():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


@pytest.mark.parametrize(
    "filename",
    ["fake-text-utf-16.txt", "fake-text-utf-16-le.txt", "fake-text-utf-32.txt"],
)
def test_partition_text_from_bytes_file_default_encoding(filename: str):
    filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    with open(filename_path, "rb") as f:
        elements = partition_text(file=f)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_text_partition_element_metadata_user_provided_languages():
    filename = "example-docs/book-war-and-peace-1p.txt"
    elements = partition_text(filename=filename, strategy="fast", languages=["en"])
    assert elements[0].metadata.languages == ["eng"]

def test_partition_text_from_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        text = f.read()
    elements = partition_text(text=text)
    assert len(elements) > 0
    assert elements == EXPECTED_OUTPUT
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_from_text_works_with_empty_string():
    assert partition_text(text="") == []


def test_partition_text_raises_with_none_specified():
    with pytest.raises(ValueError):
        partition_text()


def test_partition_text_raises_with_too_many_specified():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename) as f:
        text = f.read()

    with pytest.raises(ValueError):
        partition_text(filename=filename, text=text)

def test_partition_text_captures_everything_even_with_linebreaks():
    text = """
    VERY IMPORTANT MEMO
    DOYLESTOWN, PA 18901
    """
    elements = partition_text(text=text)
    assert elements == [
        Title(text="VERY IMPORTANT MEMO"),
        Address(text="DOYLESTOWN, PA 18901"),
    ]
    for element in elements:
        assert element.metadata.filename is None


def test_partition_text_groups_broken_paragraphs():
    text = """The big brown fox
was walking down the lane.

At the end of the lane,
the fox met a bear."""

    elements = partition_text(text=text, paragraph_grouper=group_broken_paragraphs)
    assert elements == [
        NarrativeText(text="The big brown fox was walking down the lane."),
        NarrativeText(text="At the end of the lane, the fox met a bear."),
    ]
    for element in elements:
        assert element.metadata.filename is None

def test_partition_text_extract_regex_metadata():
    text = "SPEAKER 1: It is my turn to speak now!"

    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
    assert elements[0].metadata.regex_metadata == {
        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
    }
    for element in elements:
        assert element.metadata.filename is None

def test_partition_text_splits_long_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    assert len(elements) > 0
    assert elements[0].text.startswith("Iwan Roberts")
    assert elements[-1].text.endswith("External links")


def test_partition_text_splits_long_text_max_partition():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    elements_max_part = partition_text(filename=filename, max_partition=500)
    # NOTE(klaijan) - I edited the operation here from < to <=
    # Please revert back if this does not make sense
    assert len(elements) <= len(elements_max_part)
    for element in elements_max_part:
        assert len(element.text) <= 500

    # Make sure combined text is all the same
    assert " ".join([el.text for el in elements]) == " ".join([el.text for el in elements_max_part])

def test_partition_text_splits_max_min_partition():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    elements_max_part = partition_text(filename=filename, min_partition=1000, max_partition=1500)
    for i, element in enumerate(elements_max_part):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(elements_max_part) - 1:
            assert len(element.text) <= 1500
            assert len(element.text) >= 1000

    import re

    from unstructured.nlp.patterns import BULLETS_PATTERN

    # NOTE(klaijan) - clean the asterisk out of both texts.
    # `elements` was partitioned by newline, which makes line 56 (shown below),
    # "*Club domestic league appearances and goals",
    # be considered a bullet point by the function is_bulleted_text,
    # so the asterisk was removed from that paragraph,
    # whereas `elements_max_part` was partitioned differently and none of its lines
    # starts with any of the BULLETS_PATTERN.

    # TODO(klaijan) - when partition_text is edited to support non-bullet paragraphs
    # that start with bullet-like BULLETS_PATTERN, remove the re.sub part from the assert below.

    # Make sure combined text is all the same
    assert re.sub(BULLETS_PATTERN, "", " ".join([el.text for el in elements])) == re.sub(
        BULLETS_PATTERN,
        "",
        " ".join([el.text for el in elements_max_part]),
    )

def test_partition_text_min_max():
    segments = partition_text(text=SHORT_PARAGRAPHS, min_partition=6)
    for i, segment in enumerate(segments):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(segments) - 1:
            assert len(segment.text) >= 6

    segments = partition_text(text=SHORT_PARAGRAPHS, max_partition=20, min_partition=7)
    for i, segment in enumerate(segments):
        # NOTE(robinson) - the last element does not have a next element to merge with,
        # so it can be short
        if i < len(segments) - 1:
            assert len(segment.text) >= 7
            assert len(segment.text) <= 20

def test_split_content_to_fit_max():
    segments = _split_content_to_fit_max(
        content=MIN_MAX_TEXT,
        max_partition=75,
    )
    assert segments == [
        "This is a story.",
        "This is a story that doesn't matter because",
        "it is just being used as an example. Hi. Hello. Howdy. Hola.",
        "The example is simple and repetitive and long",
        "and somewhat boring, but it serves a purpose. End.",
    ]


def test_combine_paragraphs_less_than_min():
    segments = _combine_paragraphs_less_than_min(
        cast(list[str], SHORT_PARAGRAPHS.split("\n\n")),
        max_partition=1500,
        min_partition=7,
    )
    assert len(segments) < len(SHORT_PARAGRAPHS)

def test_partition_text_doesnt_get_page_breaks():
    text = "--------------------"
    elements = partition_text(text=text)
    assert len(elements) == 1
    assert elements[0].text == text
    assert not isinstance(elements[0], ListItem)


@pytest.mark.parametrize(
    ("filename", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_from_filename_exclude_metadata(filename: str, encoding: Optional[str]):
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
    elements = partition_text(
        filename=filename,
        encoding=encoding,
        include_metadata=False,
    )
    for i in range(len(elements)):
        assert elements[i].metadata.to_dict() == {}


def test_partition_text_from_file_exclude_metadata():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        elements = partition_text(file=f, include_metadata=False)
    for i in range(len(elements)):
        assert elements[i].metadata.to_dict() == {}

# -- .metadata.last_modified ---------------------------------------------------------------------

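# NOTE: tests in this section pin the filesystem date by patching get_last_modified_date and
# verify that an explicit metadata_last_modified argument takes precedence over it.
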
def test_partition_text_from_file_path_gets_last_modified_from_filesystem(mocker: MockerFixture):
    filesystem_last_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_text(example_doc_path("fake-text.txt"))

    assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)


def test_partition_text_from_file_gets_last_modified_None():
    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition_text(file=f)

    assert all(e.metadata.last_modified is None for e in elements)


def test_partition_text_from_text_gets_last_modified_None():
    with open(example_doc_path("fake-text.txt")) as f:
        text = f.read()

    elements = partition_text(text=text)

    assert all(e.metadata.last_modified is None for e in elements)

def test_partition_text_from_file_path_prefers_metadata_last_modified(mocker: MockerFixture):
    filesystem_last_modified = "2029-07-05T09:24:28"
    metadata_last_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.text.get_last_modified_date", return_value=filesystem_last_modified
    )

    elements = partition_text(
        example_doc_path("fake-text.txt"), metadata_last_modified=metadata_last_modified
    )

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


def test_partition_text_from_file_prefers_metadata_last_modified():
    metadata_last_modified = "2020-07-05T09:24:28"
    with open(example_doc_path("fake-text.txt"), "rb") as f:
        elements = partition_text(file=f, metadata_last_modified=metadata_last_modified)

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


def test_partition_text_from_text_prefers_metadata_last_modified():
    metadata_last_modified = "2020-07-05T09:24:28"
    with open(example_doc_path("fake-text.txt")) as f:
        text = f.read()

    elements = partition_text(text=text, metadata_last_modified=metadata_last_modified)

    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)


# ------------------------------------------------------------------------------------------------

def test_Text_element_assigns_id_hashes_that_are_unique_and_deterministic():
    ids = [element.id for element in partition_text(text="hello\nhello\nhello")]
    assert ids == [
        "8657c0ec31a4cfc822f6cd4a5684cafd",
        "72aefb4a12be063ad160931fdb380163",
        "ba8c1a216ca585aecdd365a72e6124f1",
    ]


def test_Text_element_assigns_UUID_when_unique_element_ids_is_True():
    elements = partition_text(text="hello\nhello\nhello", unique_element_ids=True)

    for element in elements:
        assert uuid.UUID(element.id, version=4)

        # Test that the element is JSON serializable. This should run without an error
        json.dumps(element.to_dict())

@pytest.mark.parametrize(
    ("file_name", "encoding"),
    [
        ("fake-text.txt", "utf-8"),
        ("fake-text.txt", None),
        ("fake-text-utf-16-be.txt", "utf-16-be"),
    ],
)
def test_partition_text_with_json(file_name: str, encoding: str | None):
    elements = partition_text(example_doc_path(file_name), encoding=encoding)
    assert_round_trips_through_JSON(elements)

def test_add_chunking_strategy_on_partition_text():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    chunk_elements = partition_text(filename, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_partition_text_element_metadata_has_languages():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename)
    assert elements[0].metadata.languages == ["eng"]


def test_partition_text_respects_detect_language_per_element():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "eng_spa_mult.txt")
    elements = partition_text(filename=filename, detect_language_per_element=True)
    langs = [element.metadata.languages for element in elements]
    assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]


def test_partition_text_respects_languages_arg():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
    elements = partition_text(filename=filename, languages=["deu"])
    assert elements[0].metadata.languages == ["deu"]


def test_partition_text_element_metadata_raises_TypeError():
    with pytest.raises(TypeError):
        filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
        partition_text(filename=filename, languages="eng")  # type: ignore


def test_partition_text_detects_more_than_3_languages():
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "language-docs", "UDHR_first_article_all.txt")
    elements = partition_text(filename=filename, detect_language_per_element=True)
    langs = list(
        {element.metadata.languages[0] for element in elements if element.metadata.languages},
    )
    assert len(langs) > 10