John 3843af666e
feat: Enable remote chunking via unstructured-ingest (#2905)
Update: The CLI shell script works when sending documents to the free
API, but the paid API is currently down, so testing against it is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches
the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for sending elements to the correct
method is moved from `ChunkingConfig` to `Chunker` so that
`ChunkingConfig` acts as a config object instead of containing
implementation logic. `Chunker.chunk()` also now takes a JSON file
instead of a list of elements. This avoids redundant serialization
when the file is sent to the API for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
2024-04-25 00:24:58 +00:00

61 lines
1.8 KiB
Python

import json
from dataclasses import Field, dataclass, fields
import pytest
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
from unstructured.ingest.enhanced_dataclass.dataclasses import EnhancedField
@dataclass
class AuthData(EnhancedDataClassJsonMixin):
    """Sample credentials record used as the subject of the tests in this module.

    It mixes one ordinary dataclass field with two fields declared through
    ``enhanced_field`` so that both kinds can be inspected and serialized.
    """

    # Ordinary dataclass field, declared without ``enhanced_field``.
    username: str
    # Declared with ``sensitive=True``; the serialization tests expect it to be redacted.
    password: str = enhanced_field(sensitive=True)
    # Declared with the alternate name "time", which the serialization tests expect
    # as the output key when ``apply_name_overload`` is true.
    date: int = enhanced_field(overload_name="time")
# Module-level instance shared by the serialization tests below.
auth = AuthData(
    username="my name",
    password="top secret",
    date=3,
)
def test_enhanced_field():
    """Check the field objects that ``AuthData`` exposes through ``dataclasses.fields``.

    ``username`` must be a ``Field`` without a ``sensitive`` attribute. Every
    other field must be an ``EnhancedField``, and only ``password`` may have a
    truthy ``sensitive`` flag.
    """
    for field_def in fields(AuthData):
        field_name = field_def.name

        # The plain field is handled first so the enhanced checks stay flat.
        if field_name == "username":
            assert isinstance(field_def, Field)
            assert hasattr(field_def, "sensitive") is False
            continue

        assert isinstance(field_def, EnhancedField)
        if field_name != "password":
            assert not field_def.sensitive
        else:
            assert field_def.sensitive is True
@pytest.mark.parametrize(
    ("apply_name_overload", "expected_dict"),
    [
        (True, {"username": "my name", "password": "THIS IS REDACTED", "time": 3}),
        (False, {"username": "my name", "password": "THIS IS REDACTED", "date": 3}),
    ],
)
def test_to_json(apply_name_overload: bool, expected_dict: dict):
    """Check ``to_json`` output with redaction on and a custom redaction text.

    Args:
        apply_name_overload: Passed through to ``to_json``; the parametrized
            cases expect the key "time" when true and "date" when false.
        expected_dict: Mapping whose ``json.dumps`` rendering must equal the
            string returned by ``to_json``.
    """
    options = {
        "redact_sensitive": True,
        "redacted_text": "THIS IS REDACTED",
        "apply_name_overload": apply_name_overload,
    }
    serialized = auth.to_json(**options)
    # Exact string comparison: both key order and formatting must match json.dumps.
    assert serialized == json.dumps(expected_dict)
@pytest.mark.parametrize(
    ("apply_name_overload", "expected_dict"),
    [
        (True, {"username": "my name", "password": "***REDACTED***", "time": 3}),
        (False, {"username": "my name", "password": "***REDACTED***", "date": 3}),
    ],
)
def test_to_dict(apply_name_overload: bool, expected_dict: dict):
    """Check ``to_dict`` output with redaction on and no custom redaction text.

    Args:
        apply_name_overload: Passed through to ``to_dict``; the parametrized
            cases expect the key "time" when true and "date" when false.
        expected_dict: Mapping the returned dict must equal. The password is
            expected to appear as "***REDACTED***" since no ``redacted_text``
            is supplied.
    """
    result = auth.to_dict(
        redact_sensitive=True,
        apply_name_overload=apply_name_overload,
    )
    assert result == expected_dict