mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-23 09:00:40 +00:00

### Description When passing in a remote path for fsspec-based source connectors, the base directory was always being included in the output path itself. This was updated to exclude the base directory and only include any child directories relative to the base one. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
130 lines
4.3 KiB
Python
130 lines
4.3 KiB
Python
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from unstructured.ingest.connector.dropbox import (
|
|
DropboxIngestDoc,
|
|
)
|
|
from unstructured.ingest.connector.fsspec import (
|
|
FsspecIngestDoc,
|
|
)
|
|
|
|
|
|
@dataclass
class FakeConfigDropboxRoot:
    """Fake config used by the Dropbox root-folder tests in this module.

    The tests pass the class object itself as ``connector_config``,
    ``read_config`` and ``processor_config``, so every value must remain
    readable as a class attribute. The attributes are annotated so that
    ``@dataclass`` registers them as real fields (unannotated attributes are
    ignored by the decorator); the defaults stay available on the class.
    """

    # Directory under which the expected ``*.json`` output paths are rooted.
    output_dir: str = "/fakeuser/fake_output"
    # A single space stands in for the Dropbox root folder
    # (presumably how the connector denotes "no folder" -- confirm).
    dir_path: str = " "
    # Directory under which the expected download paths are rooted.
    download_dir: str = "/fakeuser/fake_download"
    # Same single-space value as ``dir_path``.
    path_without_protocol: str = " "
|
|
|
|
|
|
@dataclass
class FakeConfigFolder:
    """Fake config used by the sub-folder tests in this module.

    The tests pass the class object itself as ``connector_config``,
    ``read_config`` and ``processor_config``, so every value must remain
    readable as a class attribute. The attributes are annotated so that
    ``@dataclass`` registers them as real fields (unannotated attributes are
    ignored by the decorator); the defaults stay available on the class.
    """

    # Directory under which the expected ``*.json`` output paths are rooted.
    output_dir: str = "/fakeuser/fake_output"
    # Remote base folder the fake documents are configured with.
    dir_path: str = "fake_folder"
    # Directory under which the expected download paths are rooted.
    download_dir: str = "/fakeuser/fake_download"
    # Same folder value as ``dir_path``.
    path_without_protocol: str = "fake_folder"
|
|
|
|
|
|
def test_dropbox_root_succeeds():
    """
    Dropbox root config with a remote_file_path that has a leading slash.

    The output and download paths are expected to land directly under the
    configured output and download directories.
    """
    config = FakeConfigDropboxRoot
    doc = DropboxIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="/fake_file.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|
|
|
|
|
|
def test_dropbox_root_succeeds2():
    """
    Dropbox root config with a remote_file_path that has no leading slash.

    The expected paths are the same as in the leading-slash variant: directly
    under the configured output and download directories.
    """
    config = FakeConfigDropboxRoot
    doc = DropboxIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="fake_file.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|
|
|
|
|
|
def test_dropbox_folder_succeeds():
    """
    Dropbox folder config with a remote_file_path that has no leading slash.

    The expected paths sit directly under the configured output and download
    directories; the configured folder name does not appear in them.
    """
    config = FakeConfigFolder
    doc = DropboxIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="fake_file2.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|
|
|
|
|
|
def test_dropbox_folder_fails():
    """Pin the WRONG paths produced when remote_file_path has a leading slash.

    With the folder config, the resulting paths are MISSING the output and
    download directories: only the bare file name under "/" remains. Path
    joining is sensitive to that leading slash."""
    config = FakeConfigFolder
    doc = DropboxIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="/fake_file2.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fake_file2.txt.json")
    expected_download = Path("/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|
|
|
|
|
|
def test_fsspec_folder_succeeds():
    """
    Fsspec folder config with a remote_file_path that has no leading slash.

    The expected paths sit directly under the configured output and download
    directories; the configured folder name does not appear in them.
    """
    config = FakeConfigFolder
    doc = FsspecIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="fake_file2.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|
|
|
|
|
|
def test_fsspec_folder_fails():
    """Pin the WRONG paths produced when remote_file_path has a leading slash.

    With the folder config, the resulting paths are MISSING the output and
    download directories: only the bare file name under "/" remains. Path
    joining is sensitive to that leading slash."""
    config = FakeConfigFolder
    doc = FsspecIngestDoc(
        connector_config=config,
        read_config=config,
        processor_config=config,
        remote_file_path="/fake_file2.txt",
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fake_file2.txt.json")
    expected_download = Path("/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
|