mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-09 10:05:56 +00:00
154 lines
5.6 KiB
Python
154 lines
5.6 KiB
Python
![]() |
from __future__ import annotations
|
||
|
|
||
|
import json
|
||
|
import logging
|
||
|
import os
|
||
|
|
||
|
import pytest
|
||
|
from _pytest.logging import LogCaptureFixture
|
||
|
|
||
|
from test_unstructured.unit_utils import (
|
||
|
FixtureRequest,
|
||
|
Mock,
|
||
|
example_doc_path,
|
||
|
function_mock,
|
||
|
method_mock,
|
||
|
)
|
||
|
from unstructured.documents.elements import CompositeElement
|
||
|
from unstructured.ingest.interfaces import ChunkingConfig, PartitionConfig
|
||
|
from unstructured.ingest.pipeline.interfaces import PipelineContext
|
||
|
from unstructured.ingest.pipeline.reformat.chunking import Chunker
|
||
|
|
||
|
# -- Path of the example elements-JSON document that the tests in this module pass to
# -- `Chunker.run()` and `Chunker.chunk()`.
ELEMENTS_JSON_FILE = example_doc_path(
    "test_evaluate_files/unstructured_output/Bank Good Credit Loan.pptx.json"
)
|
||
|
|
||
|
|
||
|
class DescribeChunker:
    """Unit tests for ingest.pipeline.reformat.chunking.Chunker"""

    # -- Chunker.run() -----------------------------------------------------------------------------

    # -- integration test --
    def it_creates_JSON_elements(self, _ingest_docs_map_: Mock, tmpdir: str):
        """`.run()` returns the path of a JSON file of chunks written under a "chunked" dir."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_title"),
            pipeline_context=PipelineContext(work_dir=tmpdir),
            partition_config=PartitionConfig(),
        )
        # -- `Chunker.chunk()` defaults to writing to "{work_dir}/chunked", which is located in
        # -- "/.cache" of a user's profile.
        # -- `work_dir` is set to `tmpdir` above; create the "chunked" subdirectory inside it:
        os.makedirs(os.path.join(tmpdir, "chunked"), exist_ok=True)

        filename = chunker.run(ELEMENTS_JSON_FILE)

        # -- fail with a clear message here rather than on a path check against an empty string --
        assert filename, "Chunker.run() returned no output path"
        head, tail = os.path.split(filename)
        # -- Check that a json file was created in `/chunked` --
        assert head.endswith("chunked")
        assert tail.endswith(".json")
        # -- Check contents of file --
        with open(filename, encoding="utf-8") as json_f:
            json_data = json.load(json_f)
        assert all(d.get("type") == "CompositeElement" for d in json_data)
        assert len(json_data) == 5

    def it_returns_None_and_logs_message_without_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` returns None and logs a "skipping" message when no strategy is configured."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )
        # -- lower the capture threshold to INFO so INFO-level records appear in `caplog.text` --
        caplog.set_level(logging.INFO)

        assert chunker.run(ELEMENTS_JSON_FILE) is None
        assert "chunking_strategy is None, skipping chunking for" in caplog.text

    def it_logs_error_on_invalid_remote_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` logs the list of accepted strategies when given an unknown one via the API."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_invalid"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "Input should be 'basic', 'by_page', 'by_similarity'" in caplog.text

    def it_warns_with_nonlocal_chunking_strategy_and_partition_by_api_False(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` logs a message when the strategy is not available locally and API is off."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=False),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "There is no locally available chunking_strategy:" in caplog.text

    # -- Chunker.chunk() ---------------------------------------------------------------------------

    def it_skips_chunking_if_strategy_is_None(self):
        """`.chunk()` returns None when `chunking_strategy` is explicitly None."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=None),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        assert chunker.chunk(ELEMENTS_JSON_FILE) is None

    # -- integration test --
    @pytest.mark.parametrize("strategy", ["by_title", "basic"])
    def it_chunks_locally(self, strategy: str, _ingest_docs_map_: Mock):
        """`.chunk()` produces only `CompositeElement` chunks for each parametrized strategy."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=strategy),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        chunked_elements = chunker.chunk(ELEMENTS_JSON_FILE)

        # -- guard against a vacuous pass: `all()` is True for an empty sequence, and a `None`
        # -- result would otherwise only surface as a TypeError in the generator below.
        assert chunked_elements, "expected at least one chunk"
        assert all(isinstance(elem, CompositeElement) for elem in chunked_elements)

    def it_chunks_remotely(self, _ingest_docs_map_: Mock, _partition_via_api_: Mock):
        """`.chunk()` delegates to `partition_via_api()` when `partition_by_api` is True."""
        api_key = "aaaaaaaaaaaaaaaaaaaaa"
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True, api_key=api_key),
        )

        chunker.chunk(ELEMENTS_JSON_FILE)

        _partition_via_api_.assert_called_once_with(
            filename=ELEMENTS_JSON_FILE,
            api_key=api_key,
            api_url="https://api.unstructured.io/general/v0/general",
            chunking_strategy="by_similarity",
            combine_under_n_chars=None,
            max_characters=None,
            multipage_sections=None,
            new_after_n_chars=None,
            # -- NOTE(review): the two arguments below were already commented out of this
            # -- assertion; confirm whether they should be asserted or the lines removed.
            # overlap=None,
            # overlap_all=None,
        )

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _ingest_docs_map_(self, request: FixtureRequest):
        """Mock for `PipelineContext.ingest_docs_map`, built with `method_mock()`."""
        return method_mock(request, PipelineContext, "ingest_docs_map")

    @pytest.fixture()
    def _partition_via_api_(self, request: FixtureRequest):
        """Mock for `partition_via_api` as named in the chunking module, via `function_mock()`."""
        return function_mock(
            request, "unstructured.ingest.pipeline.reformat.chunking.partition_via_api"
        )
|