John 3843af666e
feat: Enable remote chunking via unstructured-ingest (#2905)
Update: The CLI shell script works when sending documents to the free
API, but the paid API is down, so testing against the paid API is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches
the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for sending elements to the correct
method is moved from `ChunkingConfig` to `Chunker` so that
`ChunkingConfig` acts as a config object instead of containing
implementation logic. `Chunker.chunk()` also now takes a json file
instead of a list of elements. This is done to avoid redundant
serialization if the file is to be sent to the api for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
2024-04-25 00:24:58 +00:00

224 lines
7.9 KiB
Python

from dataclasses import dataclass
from pathlib import Path
import pytest
from unstructured.ingest.connector.fsspec.dropbox import (
DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec.fsspec import (
FsspecIngestDoc,
)
from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig
from unstructured.ingest.interfaces import (
FsspecConfig,
)
@dataclass
class FakeConfigDropboxRoot:
    """Stand-in config describing a Dropbox *root* location.

    The tests in this module pass the class object itself as the connector,
    read and processor config, so every value must stay readable as a class
    attribute. The attributes are annotated so that ``@dataclass`` registers
    them as fields; without annotations the decorator generates no fields.
    """

    # Directory under which the expected ``*.json`` output paths are rooted.
    output_dir: str = "/fakeuser/fake_output"
    # A single space, as used by this module's tests for the Dropbox root.
    dir_path: str = " "
    # Directory under which the expected download paths are rooted.
    download_dir: str = "/fakeuser/fake_download"
    # Mirrors ``dir_path`` for the root case.
    path_without_protocol: str = " "
@dataclass
class FakeConfigFolder:
    """Stand-in config describing a named remote folder (``fake_folder``).

    The tests in this module pass the class object itself as the connector,
    read and processor config, so every value must stay readable as a class
    attribute. The attributes are annotated so that ``@dataclass`` registers
    them as fields; without annotations the decorator generates no fields.
    """

    # Directory under which the expected ``*.json`` output paths are rooted.
    output_dir: str = "/fakeuser/fake_output"
    # Name of the remote folder the fake documents are said to live in.
    dir_path: str = "fake_folder"
    # Directory under which the expected download paths are rooted.
    download_dir: str = "/fakeuser/fake_download"
    # Mirrors ``dir_path`` for the folder case.
    path_without_protocol: str = "fake_folder"
def test_dropbox_root_succeeds():
    """Dropbox root config with a leading slash in ``remote_file_path``.

    Both the output path and the temporary download path are expected to sit
    directly under the fake output and download directories.
    """
    fake_config = FakeConfigDropboxRoot
    ingest_doc = DropboxIngestDoc(
        remote_file_path="/fake_file.txt",
        processor_config=fake_config,
        read_config=fake_config,
        connector_config=fake_config,
    )

    actual_output = ingest_doc._output_filename
    actual_download = ingest_doc._tmp_download_file()

    assert actual_output == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert actual_download == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_root_succeeds2():
    """Dropbox root config with *no* leading slash in ``remote_file_path``.

    The expected output and download paths are the same as in the
    leading-slash variant of this test.
    """
    doc_kwargs = {
        "connector_config": FakeConfigDropboxRoot,
        "read_config": FakeConfigDropboxRoot,
        "processor_config": FakeConfigDropboxRoot,
        "remote_file_path": "fake_file.txt",
    }
    ingest_doc = DropboxIngestDoc(**doc_kwargs)

    json_path = ingest_doc._output_filename
    local_path = ingest_doc._tmp_download_file()

    assert json_path == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert local_path == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_folder_succeeds():
    """Dropbox folder config with no leading slash in ``remote_file_path``.

    Uses the ``fake_folder`` config and expects both derived paths to sit
    under the fake output and download directories.
    """
    fake_config = FakeConfigFolder
    ingest_doc = DropboxIngestDoc(
        remote_file_path="fake_file2.txt",
        processor_config=fake_config,
        read_config=fake_config,
        connector_config=fake_config,
    )

    actual_output = ingest_doc._output_filename
    actual_download = ingest_doc._tmp_download_file()

    assert actual_output == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert actual_download == Path("/fakeuser/fake_download/fake_file2.txt")
def test_dropbox_folder_fails():
    """Dropbox folder config with a leading slash in ``remote_file_path``.

    Pins the (undesired) result of the path joining in this case: the derived
    paths are MISSING the output/download folders and start at the root.
    """
    doc_kwargs = {
        "connector_config": FakeConfigFolder,
        "read_config": FakeConfigFolder,
        "processor_config": FakeConfigFolder,
        "remote_file_path": "/fake_file2.txt",
    }
    ingest_doc = DropboxIngestDoc(**doc_kwargs)

    json_path = ingest_doc._output_filename
    local_path = ingest_doc._tmp_download_file()

    assert json_path == Path("/fake_file2.txt.json")
    assert local_path == Path("/fake_file2.txt")
def test_fsspec_folder_succeeds():
    """Generic fsspec doc, folder config, no leading slash in ``remote_file_path``.

    Expects both derived paths to sit under the fake output and download
    directories.
    """
    fake_config = FakeConfigFolder
    fsspec_doc = FsspecIngestDoc(
        remote_file_path="fake_file2.txt",
        processor_config=fake_config,
        read_config=fake_config,
        connector_config=fake_config,
    )

    actual_output = fsspec_doc._output_filename
    actual_download = fsspec_doc._tmp_download_file()

    assert actual_output == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert actual_download == Path("/fakeuser/fake_download/fake_file2.txt")
def test_fsspec_folder_fails():
    """Generic fsspec doc, folder config, leading slash in ``remote_file_path``.

    Pins the (undesired) result of the path joining in this case: the derived
    paths are MISSING the output/download folders and start at the root.
    """
    doc_kwargs = {
        "connector_config": FakeConfigFolder,
        "read_config": FakeConfigFolder,
        "processor_config": FakeConfigFolder,
        "remote_file_path": "/fake_file2.txt",
    }
    fsspec_doc = FsspecIngestDoc(**doc_kwargs)

    json_path = fsspec_doc._output_filename
    local_path = fsspec_doc._tmp_download_file()

    assert json_path == Path("/fake_file2.txt.json")
    assert local_path == Path("/fake_file2.txt")
def test_post_init_invalid_protocol():
    """Constructing ``FsspecConfig`` from an ``ftp://`` URL must raise ``ValueError``."""
    unsupported_url = "ftp://example.com/path/to/file.txt"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=unsupported_url)
def test_fsspec_path_extraction_dropbox_root():
    """Check the fields ``FsspecConfig`` derives from the Dropbox root URL ``dropbox:// /``."""
    cfg = FsspecConfig(remote_url="dropbox:// /")
    expected_attrs = {
        "protocol": "dropbox",
        "path_without_protocol": " /",
        "dir_path": " ",
        "file_path": "",
    }
    # Checked one attribute at a time, in the listed order.
    for attr_name, expected_value in expected_attrs.items():
        assert getattr(cfg, attr_name) == expected_value
def test_fsspec_path_extraction_dropbox_subfolder():
    """Check the fields ``FsspecConfig`` derives from the Dropbox URL ``dropbox://path``."""
    cfg = FsspecConfig(remote_url="dropbox://path")
    expected_attrs = {
        "protocol": "dropbox",
        "path_without_protocol": "path",
        "dir_path": "path",
        "file_path": "",
    }
    # Checked one attribute at a time, in the listed order.
    for attr_name, expected_value in expected_attrs.items():
        assert getattr(cfg, attr_name) == expected_value
def test_fsspec_path_extraction_s3_bucket_only():
    """Check the fields ``FsspecConfig`` derives from an S3 URL that names only a bucket."""
    cfg = FsspecConfig(remote_url="s3://bucket-name")
    expected_attrs = {
        "protocol": "s3",
        "path_without_protocol": "bucket-name",
        "dir_path": "bucket-name",
        "file_path": "",
    }
    # Checked one attribute at a time, in the listed order.
    for attr_name, expected_value in expected_attrs.items():
        assert getattr(cfg, attr_name) == expected_value
def test_fsspec_path_extraction_s3_valid_path():
    """Check the fields ``FsspecConfig`` derives from an S3 URL with a bucket and a file key."""
    cfg = FsspecConfig(remote_url="s3://bucket-name/path/to/file.txt")
    expected_attrs = {
        "protocol": "s3",
        "path_without_protocol": "bucket-name/path/to/file.txt",
        "dir_path": "bucket-name",
        "file_path": "path/to/file.txt",
    }
    # Checked one attribute at a time, in the listed order.
    for attr_name, expected_value in expected_attrs.items():
        assert getattr(cfg, attr_name) == expected_value
def test_fsspec_path_extraction_s3_invalid_path():
    """An S3 URL with a triple slash (mimicking the Dropbox form) must raise ``ValueError``."""
    triple_slash_url = "s3:///bucket-name/path/to"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=triple_slash_url)
def test_sftp_path_extraction_post_init_with_extension():
    """Check SFTP path extraction when the URL ends in a file name with an extension.

    The access config is created with an empty host and port 22; the test
    expects the host to read ``example.com`` and the port to still be 22 once
    the ``SimpleSftpConfig`` has been constructed.
    """
    credentials = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/file.txt",
        access_config=credentials,
    )

    # Path components.
    assert sftp_config.file_path == "file.txt"
    assert sftp_config.dir_path == "path/to"
    assert sftp_config.path_without_protocol == "path/to"

    # Connection details as seen through the config's access_config.
    access = sftp_config.access_config
    assert access.host == "example.com"
    assert access.port == 22
def test_sftp_path_extraction_without_extension():
    """Check SFTP path extraction when the URL's last segment has no extension.

    The whole path is expected to be reported as the directory, with an empty
    ``file_path``. The access config is created with an empty host and port 22;
    the test expects the host to read ``example.com`` and the port to still be
    22 once the ``SimpleSftpConfig`` has been constructed.
    """
    credentials = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/directory",
        access_config=credentials,
    )

    # Path components.
    assert sftp_config.file_path == ""
    assert sftp_config.dir_path == "path/to/directory"
    assert sftp_config.path_without_protocol == "path/to/directory"

    # Connection details as seen through the config's access_config.
    access = sftp_config.access_config
    assert access.host == "example.com"
    assert access.port == 22
def test_sftp_path_extraction_with_port():
    """Check SFTP path extraction when the URL carries a non-default port.

    The access config is created with an empty host and port 22; the test
    expects the host to read ``example.com`` and the port to read 47474 (the
    value in the URL) once the ``SimpleSftpConfig`` has been constructed.
    """
    credentials = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com:47474/path/to/file.txt",
        access_config=credentials,
    )

    # Path components.
    assert sftp_config.file_path == "file.txt"
    assert sftp_config.dir_path == "path/to"
    assert sftp_config.path_without_protocol == "path/to"

    # Connection details as seen through the config's access_config.
    access = sftp_config.access_config
    assert access.host == "example.com"
    assert access.port == 47474