John 3843af666e
feat: Enable remote chunking via unstructured-ingest (#2905)
Update: The CLI shell script works when sending documents to the free
API, but the paid API is down, so testing against it is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches
the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for sending elements to the correct
method is moved from `ChunkingConfig` to `Chunker` so that
`ChunkingConfig` acts as a config object instead of containing
implementation logic. `Chunker.chunk()` also now takes a json file
instead of a list of elements. This is done to avoid redundant
serialization if the file is to be sent to the api for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
2024-04-25 00:24:58 +00:00

154 lines
5.6 KiB
Python

from __future__ import annotations
import json
import logging
import os
import pytest
from _pytest.logging import LogCaptureFixture
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
example_doc_path,
function_mock,
method_mock,
)
from unstructured.documents.elements import CompositeElement
from unstructured.ingest.interfaces import ChunkingConfig, PartitionConfig
from unstructured.ingest.pipeline.interfaces import PipelineContext
from unstructured.ingest.pipeline.reformat.chunking import Chunker
# -- Path to an example-docs JSON file; the tests in this module pass it as the input to
# -- `Chunker.run()` and `Chunker.chunk()`.
ELEMENTS_JSON_FILE = example_doc_path(
"test_evaluate_files/unstructured_output/Bank Good Credit Loan.pptx.json"
)
class DescribeChunker:
    """Unit tests for ingest.pipeline.reformat.chunking.Chunker"""

    # -- Chunker.run() -----------------------------------------------------------------------------

    # -- integration test --
    def it_creates_JSON_elements(self, _ingest_docs_map_: Mock, tmpdir: str):
        """`.run()` returns the path of a JSON file in "chunked" holding composite elements."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_title"),
            pipeline_context=PipelineContext(work_dir=tmpdir),
            partition_config=PartitionConfig(),
        )
        # -- `Chunker.chunk()` defaults to writing to "{work_dir}/chunked", which is located in
        # -- "/.cache" of a user's profile.
        # -- Define `work_dir` and add the "/chunked" subdirectory to it:
        os.makedirs(os.path.join(tmpdir, "chunked"), exist_ok=True)

        filename = chunker.run(ELEMENTS_JSON_FILE)

        # -- Fail here with a clear message instead of later on an empty path --
        assert filename, "Chunker.run() did not return an output file path"
        head, tail = os.path.split(filename)
        # -- Check that a json file was created in `/chunked` --
        assert head.endswith("chunked")
        assert tail.endswith(".json")
        # -- Check contents of file; read as UTF-8 so the result is locale-independent --
        with open(filename, encoding="utf-8") as json_f:
            json_data = json.load(json_f)
        assert all(d.get("type") == "CompositeElement" for d in json_data)
        assert len(json_data) == 5

    def it_returns_None_and_logs_message_without_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` returns None and logs that chunking is skipped when no strategy is set."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )
        caplog.set_level(logging.INFO)

        assert chunker.run(ELEMENTS_JSON_FILE) is None
        assert "chunking_strategy is None, skipping chunking for" in caplog.text

    def it_logs_error_on_invalid_remote_chunking_strategy(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` logs the accepted strategy values when given an unknown remote strategy."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_invalid"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "Input should be 'basic', 'by_page', 'by_similarity'" in caplog.text

    def it_warns_with_nonlocal_chunking_strategy_and_partition_by_api_False(
        self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture
    ):
        """`.run()` logs a message when the strategy is not available locally and API is off."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=False),
        )

        chunker.run(ELEMENTS_JSON_FILE)

        assert "There is no locally available chunking_strategy:" in caplog.text

    # -- Chunker.chunk() ---------------------------------------------------------------------------

    def it_skips_chunking_if_strategy_is_None(self):
        """`.chunk()` returns None when `chunking_strategy` is None."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=None),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        assert chunker.chunk(ELEMENTS_JSON_FILE) is None

    # -- integration test --
    @pytest.mark.parametrize("strategy", ["by_title", "basic"])
    def it_chunks_locally(self, strategy: str, _ingest_docs_map_: Mock):
        """`.chunk()` produces `CompositeElement` instances for each parametrized strategy."""
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy=strategy),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(),
        )

        chunked_elements = chunker.chunk(ELEMENTS_JSON_FILE)

        # -- `all()` is vacuously True for an empty iterable, so require at least one chunk --
        assert chunked_elements
        assert all(isinstance(elem, CompositeElement) for elem in chunked_elements)

    def it_chunks_remotely(self, _ingest_docs_map_: Mock, _partition_via_api_: Mock):
        """`.chunk()` delegates to `partition_via_api()` when `partition_by_api` is True."""
        api_key = "aaaaaaaaaaaaaaaaaaaaa"
        chunker = Chunker(
            chunking_config=ChunkingConfig(chunking_strategy="by_similarity"),
            pipeline_context=PipelineContext(),
            partition_config=PartitionConfig(partition_by_api=True, api_key=api_key),
        )

        chunker.chunk(ELEMENTS_JSON_FILE)

        _partition_via_api_.assert_called_once_with(
            filename=ELEMENTS_JSON_FILE,
            api_key=api_key,
            api_url="https://api.unstructured.io/general/v0/general",
            chunking_strategy="by_similarity",
            combine_under_n_chars=None,
            max_characters=None,
            multipage_sections=None,
            new_after_n_chars=None,
            # overlap=None,
            # overlap_all=None,
        )

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def _ingest_docs_map_(self, request: FixtureRequest):
        """Mock of `PipelineContext.ingest_docs_map` built with `method_mock()`."""
        return method_mock(request, PipelineContext, "ingest_docs_map")

    @pytest.fixture()
    def _partition_via_api_(self, request: FixtureRequest):
        """Mock of `partition_via_api` as referenced from the chunking module."""
        return function_mock(
            request, "unstructured.ingest.pipeline.reformat.chunking.partition_via_api"
        )