mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-08 09:33:43 +00:00

Update: The CLI shell script works when sending documents to the free API, but the paid API is down, so testing against it is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for sending elements to the correct method is moved from `ChunkingConfig` to `Chunker` so that `ChunkingConfig` acts as a config object instead of containing implementation logic. `Chunker.chunk()` also now takes a JSON file instead of a list of elements. This is done to avoid redundant serialization if the file is to be sent to the API for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
154 lines
5.6 KiB
Python
154 lines
5.6 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
|
|
import pytest
|
|
from _pytest.logging import LogCaptureFixture
|
|
|
|
from test_unstructured.unit_utils import (
|
|
FixtureRequest,
|
|
Mock,
|
|
example_doc_path,
|
|
function_mock,
|
|
method_mock,
|
|
)
|
|
from unstructured.documents.elements import CompositeElement
|
|
from unstructured.ingest.interfaces import ChunkingConfig, PartitionConfig
|
|
from unstructured.ingest.pipeline.interfaces import PipelineContext
|
|
from unstructured.ingest.pipeline.reformat.chunking import Chunker
|
|
|
|
# -- Serialized-elements JSON document used as chunker input by the tests in this module.
# -- NOTE(review): `example_doc_path()` presumably resolves this relative to the example-docs
# -- directory -- confirm against `test_unstructured.unit_utils`.
_elements_json_relpath = (
    "test_evaluate_files/unstructured_output/Bank Good Credit Loan.pptx.json"
)
ELEMENTS_JSON_FILE = example_doc_path(_elements_json_relpath)
|
|
|
|
|
|
class DescribeChunker:
    """Unit tests for ingest.pipeline.reformat.chunking.Chunker"""

    # -- Chunker.run() -----------------------------------------------------------------------------

    # -- integration test --
    def it_creates_JSON_elements(self, _ingest_docs_map_: Mock, tmpdir: str):
        """`.run()` returns the path of a JSON file of chunks located in a "chunked" directory."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_title"),
            pipeline_context=PipelineContext(work_dir=tmpdir),
            partition_config=PartitionConfig(),
        )
        # -- The chunker defaults to writing to "{work_dir}/chunked", which is located in
        # -- "/.cache" of a user's profile. `work_dir` is redirected to a temp directory above;
        # -- create the "chunked" subdirectory inside it so the output file can be written:
        os.makedirs(os.path.join(tmpdir, "chunked"), exist_ok=True)

        filename = chunker.run(ELEMENTS_JSON_FILE)

        # -- fail with a clear message when nothing was written, rather than on an empty path --
        assert filename, "Chunker.run() did not return an output filename"
        head, tail = os.path.split(filename)
        # -- Check that a json file was created in `/chunked` --
        assert head.endswith("chunked")
        assert tail.endswith(".json")
        # -- Check contents of file --
        with open(filename) as json_f:
            json_data = json.load(json_f)
        assert all(d.get("type") == "CompositeElement" for d in json_data)
        assert len(json_data) == 5

    def it_returns_None_and_logs_message_without_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` returns None and logs a "skipping" message when no strategy is configured."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )
        # -- lower the capture threshold so INFO-level records appear in `caplog.text` --
        caplog.set_level(logging.INFO)

        assert chunker.run(ELEMENTS_JSON_FILE) is None
        assert "chunking_strategy is None, skipping chunking for" in caplog.text

    def it_logs_error_on_invalid_remote_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """An unrecognized strategy with `partition_by_api=True` is logged, not raised."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_invalid"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "Input should be 'basic', 'by_page', 'by_similarity'" in caplog.text

    def it_warns_with_nonlocal_chunking_strategy_and_partition_by_api_False(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """"by_similarity" with `partition_by_api=False` logs that no local strategy exists."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=False),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "There is no locally available chunking_strategy:" in caplog.text

    # -- Chunker.chunk() ---------------------------------------------------------------------------

    def it_skips_chunking_if_strategy_is_None(self):
        """`.chunk()` returns None when `chunking_strategy` is None."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=None),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        assert chunker.chunk(ELEMENTS_JSON_FILE) is None

    # -- integration test --
    @pytest.mark.parametrize("strategy", ["by_title", "basic"])
    def it_chunks_locally(self, strategy: str, _ingest_docs_map_: Mock):
        """`.chunk()` produces only `CompositeElement` instances for each parametrized strategy."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=strategy),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        chunked_elements = chunker.chunk(ELEMENTS_JSON_FILE)

        # -- guard against a vacuous pass: `all()` is True for an empty iterable, and a `None`
        # -- result should fail as an assertion rather than as a TypeError on iteration --
        assert chunked_elements, "Chunker.chunk() produced no chunks"
        assert all(isinstance(elem, CompositeElement) for elem in chunked_elements)

    def it_chunks_remotely(self, _ingest_docs_map_: Mock, _partition_via_api_: Mock):
        """`.chunk()` delegates to `partition_via_api()` when `partition_by_api=True`."""
        # -- single source of truth for the key used in both the arrange and assert steps --
        api_key = "aaaaaaaaaaaaaaaaaaaaa"
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True, api_key=api_key),
        )

        chunker.chunk(ELEMENTS_JSON_FILE)

        # -- NOTE(review): `overlap` and `overlap_all` are intentionally absent from the expected
        # -- call; presumably they are not yet forwarded to the API -- confirm and add them
        # -- here once they are.
        _partition_via_api_.assert_called_once_with(
            filename=ELEMENTS_JSON_FILE,
            api_key=api_key,
            api_url="https://api.unstructured.io/general/v0/general",
            chunking_strategy="by_similarity",
            combine_under_n_chars=None,
            max_characters=None,
            multipage_sections=None,
            new_after_n_chars=None,
        )

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _ingest_docs_map_(self, request: FixtureRequest):
        """Mock standing in for `PipelineContext.ingest_docs_map`."""
        return method_mock(request, PipelineContext, "ingest_docs_map")

    @pytest.fixture()
    def _partition_via_api_(self, request: FixtureRequest):
        """Mock standing in for `partition_via_api` as referenced by the chunking module."""
        return function_mock(
            request, "unstructured.ingest.pipeline.reformat.chunking.partition_via_api"
        )
|