# unstructured/test_unstructured_ingest/unit/test_paths.py

from dataclasses import dataclass
from pathlib import Path

import pytest

from unstructured.ingest.connector.fsspec.dropbox import (
    DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec.fsspec import (
    FsspecIngestDoc,
)
from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig
from unstructured.ingest.interfaces import (
    FsspecConfig,
)


@dataclass
class FakeConfigDropboxRoot:
    """Stand-in config object for tests that target the Dropbox root folder.

    The tests in this module pass the class itself as connector, read and
    processor config, so every value must stay readable as a class attribute.
    The annotations make each attribute a real dataclass field (with the same
    default), so instances can also be created with individual overrides.
    """

    # Directory the expected output (``*.json``) paths are rooted at.
    output_dir: str = "/fakeuser/fake_output"
    # Single space: the same dir_path that the FsspecConfig test in this file
    # expects for the URL ``dropbox:// /``.
    dir_path: str = " "
    # Directory the expected temporary download paths are rooted at.
    download_dir: str = "/fakeuser/fake_download"
    # Single space as well, mirroring ``dir_path`` above.
    path_without_protocol: str = " "


@dataclass
class FakeConfigFolder:
    """Stand-in config object for tests that target a named remote folder.

    The tests in this module pass the class itself as connector, read and
    processor config, so every value must stay readable as a class attribute.
    The annotations make each attribute a real dataclass field (with the same
    default), so instances can also be created with individual overrides.
    """

    # Directory the expected output (``*.json``) paths are rooted at.
    output_dir: str = "/fakeuser/fake_output"
    # Name of the remote folder being processed.
    dir_path: str = "fake_folder"
    # Directory the expected temporary download paths are rooted at.
    download_dir: str = "/fakeuser/fake_download"
    # Same folder name as ``dir_path``.
    path_without_protocol: str = "fake_folder"


def test_dropbox_root_succeeds():
    """
    Dropbox root folder, remote file path written WITH a leading slash.

    The output and download paths are both expected to sit directly inside
    the configured output / download directories.
    """
    root_cfg = FakeConfigDropboxRoot
    doc = DropboxIngestDoc(
        remote_file_path="/fake_file.txt",
        connector_config=root_cfg,
        read_config=root_cfg,
        processor_config=root_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_dropbox_root_succeeds2():
    """
    Dropbox root folder, remote file path written WITHOUT a leading slash.

    The expected output and download paths are the same as in the
    leading-slash variant of this test.
    """
    root_cfg = FakeConfigDropboxRoot
    doc = DropboxIngestDoc(
        remote_file_path="fake_file.txt",
        connector_config=root_cfg,
        read_config=root_cfg,
        processor_config=root_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_dropbox_folder_succeeds():
    """
    Dropbox subfolder config, remote file path WITHOUT a leading slash.

    With a relative remote path, the output and download paths are expected
    to be rooted at the configured output / download directories.
    """
    folder_cfg = FakeConfigFolder
    doc = DropboxIngestDoc(
        remote_file_path="fake_file2.txt",
        connector_config=folder_cfg,
        read_config=folder_cfg,
        processor_config=folder_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_dropbox_folder_fails():
    """Pin down the WRONG result produced for a leading-slash remote path.

    With the folder config and a remote path that starts with a slash, both
    resulting paths are MISSING the configured output / download directories
    (presumably because the absolute component resets the path join). The
    assertions document that sensitivity rather than endorse it.
    """
    folder_cfg = FakeConfigFolder
    doc = DropboxIngestDoc(
        remote_file_path="/fake_file2.txt",
        connector_config=folder_cfg,
        read_config=folder_cfg,
        processor_config=folder_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fake_file2.txt.json")
    expected_download = Path("/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_fsspec_folder_succeeds():
    """
    Generic fsspec doc with the folder config, remote path WITHOUT a leading slash.

    With a relative remote path, the output and download paths are expected
    to be rooted at the configured output / download directories.
    """
    folder_cfg = FakeConfigFolder
    doc = FsspecIngestDoc(
        remote_file_path="fake_file2.txt",
        connector_config=folder_cfg,
        read_config=folder_cfg,
        processor_config=folder_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_fsspec_folder_fails():
    """Pin down the WRONG result produced for a leading-slash remote path.

    With the folder config and a remote path that starts with a slash, both
    resulting paths are MISSING the configured output / download directories
    (presumably because the absolute component resets the path join). The
    assertions document that sensitivity rather than endorse it.
    """
    folder_cfg = FakeConfigFolder
    doc = FsspecIngestDoc(
        remote_file_path="/fake_file2.txt",
        connector_config=folder_cfg,
        read_config=folder_cfg,
        processor_config=folder_cfg,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fake_file2.txt.json")
    expected_download = Path("/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download


def test_post_init_invalid_protocol():
    """Building an FsspecConfig from an ``ftp://`` URL must raise ValueError."""
    bad_url = "ftp://example.com/path/to/file.txt"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=bad_url)


def test_fsspec_path_extraction_dropbox_root():
    """Check the fields FsspecConfig derives from the Dropbox root URL ``dropbox:// /``."""
    root_url = "dropbox:// /"
    parsed = FsspecConfig(remote_url=root_url)

    assert parsed.protocol == "dropbox"
    assert parsed.path_without_protocol == " /"
    assert parsed.dir_path == " "
    assert parsed.file_path == ""


def test_fsspec_path_extraction_dropbox_subfolder():
    """Check the fields FsspecConfig derives from a Dropbox subfolder URL."""
    subfolder_url = "dropbox://path"
    parsed = FsspecConfig(remote_url=subfolder_url)

    assert parsed.protocol == "dropbox"
    assert parsed.path_without_protocol == "path"
    assert parsed.dir_path == "path"
    assert parsed.file_path == ""


def test_fsspec_path_extraction_s3_bucket_only():
    """Check the fields FsspecConfig derives from an S3 URL that names only a bucket."""
    bucket_url = "s3://bucket-name"
    parsed = FsspecConfig(remote_url=bucket_url)

    assert parsed.protocol == "s3"
    assert parsed.path_without_protocol == "bucket-name"
    assert parsed.dir_path == "bucket-name"
    assert parsed.file_path == ""


def test_fsspec_path_extraction_s3_valid_path():
    """Check the fields FsspecConfig derives from an S3 URL with bucket and file path.

    The bucket is expected in ``dir_path`` and the rest of the key in ``file_path``.
    """
    object_url = "s3://bucket-name/path/to/file.txt"
    parsed = FsspecConfig(remote_url=object_url)

    assert parsed.protocol == "s3"
    assert parsed.path_without_protocol == "bucket-name/path/to/file.txt"
    assert parsed.dir_path == "bucket-name"
    assert parsed.file_path == "path/to/file.txt"


def test_fsspec_path_extraction_s3_invalid_path():
    """An S3 URL with a triple slash (mimicking the Dropbox form) must raise ValueError."""
    triple_slash_url = "s3:///bucket-name/path/to"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=triple_slash_url)


def test_sftp_path_extraction_post_init_with_extension():
    """Check SFTP path extraction for a URL whose last segment has a file extension.

    The access config starts with an empty host; the test expects the host to
    come back as the one named in the URL and the port to stay at 22.
    """
    credentials = SftpAccessConfig(
        username="username",
        password="password",
        host="",
        port=22,
    )
    sftp_cfg = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/file.txt",
        access_config=credentials,
    )

    assert sftp_cfg.file_path == "file.txt"
    assert sftp_cfg.dir_path == "path/to"
    assert sftp_cfg.path_without_protocol == "path/to"
    assert sftp_cfg.access_config.host == "example.com"
    assert sftp_cfg.access_config.port == 22


def test_sftp_path_extraction_without_extension():
    """Check SFTP path extraction for a URL whose last segment has no extension.

    The whole path is expected in ``dir_path`` with an empty ``file_path``;
    the host is expected to come back as the one named in the URL.
    """
    credentials = SftpAccessConfig(
        username="username",
        password="password",
        host="",
        port=22,
    )
    sftp_cfg = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/directory",
        access_config=credentials,
    )

    assert sftp_cfg.file_path == ""
    assert sftp_cfg.dir_path == "path/to/directory"
    assert sftp_cfg.path_without_protocol == "path/to/directory"
    assert sftp_cfg.access_config.host == "example.com"
    assert sftp_cfg.access_config.port == 22


def test_sftp_path_extraction_with_port():
    """Check SFTP path extraction for a URL that carries a non-default port.

    The access config is created with port 22; the test expects the port
    given in the URL (47474) to be the one reported afterwards.
    """
    credentials = SftpAccessConfig(
        username="username",
        password="password",
        host="",
        port=22,
    )
    sftp_cfg = SimpleSftpConfig(
        remote_url="sftp://example.com:47474/path/to/file.txt",
        access_config=credentials,
    )

    assert sftp_cfg.file_path == "file.txt"
    assert sftp_cfg.dir_path == "path/to"
    assert sftp_cfg.path_without_protocol == "path/to"
    assert sftp_cfg.access_config.host == "example.com"
    assert sftp_cfg.access_config.port == 47474