rvztz 950e5d68f9
feat: adds postgresql/sqlite destination connector (#2005)
- Adds a destination connector to upload processed output into a
PostgreSQL/Sqlite database instance.
- Users are responsible for providing their own instances. This PR includes a
couple of configuration examples.
- Defines the scripts required to set up a PostgreSQL instance with the
unstructured elements schema.
- Validates postgres/pgvector embedding storage and retrieval

---------

Co-authored-by: potter-potter <david.potter@gmail.com>
2024-01-04 19:33:16 +00:00

224 lines
7.9 KiB
Python

from dataclasses import dataclass
from pathlib import Path
import pytest
from unstructured.ingest.connector.fsspec.dropbox import (
DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec.fsspec import (
FsspecIngestDoc,
)
from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig
from unstructured.ingest.interfaces import (
FsspecConfig,
)
@dataclass
class FakeConfigDropboxRoot:
    """Stand-in config whose remote directory is the single-space Dropbox root.

    The attributes carry no annotations, so they are plain class attributes
    rather than dataclass fields; the tests in this module pass the class
    itself as every config argument.
    """

    # Remote location: a single space for both values.
    dir_path = " "
    path_without_protocol = " "

    # Local directories used for downloads and for processed output.
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
@dataclass
class FakeConfigFolder:
    """Stand-in config whose remote directory is a named folder.

    The attributes carry no annotations, so they are plain class attributes
    rather than dataclass fields; the tests in this module pass the class
    itself as every config argument.
    """

    # Remote location: the same folder name for both values.
    dir_path = "fake_folder"
    path_without_protocol = "fake_folder"

    # Local directories used for downloads and for processed output.
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
def test_dropbox_root_succeeds():
    """
    Dropbox root config with a leading slash on remote_file_path:
    the output and download paths are expected to sit inside the
    configured output and download directories.
    """
    fake_config = FakeConfigDropboxRoot
    doc = DropboxIngestDoc(
        connector_config=fake_config,
        read_config=fake_config,
        processor_config=fake_config,
        remote_file_path="/fake_file.txt",
    )
    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")

    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    assert actual_output == expected_output
    assert actual_download == expected_download
def test_dropbox_root_succeeds2():
    """
    Dropbox root config without a leading slash on remote_file_path:
    the resulting paths are expected to match the leading-slash case.
    """
    doc_kwargs = {
        "connector_config": FakeConfigDropboxRoot,
        "read_config": FakeConfigDropboxRoot,
        "processor_config": FakeConfigDropboxRoot,
        "remote_file_path": "fake_file.txt",
    }
    doc = DropboxIngestDoc(**doc_kwargs)

    json_path = doc._output_filename
    local_path = doc._tmp_download_file()

    assert json_path == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert local_path == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_folder_succeeds():
    """
    Dropbox folder config without a leading slash on remote_file_path:
    the output and download paths are expected to sit inside the
    configured output and download directories.
    """
    fake_config = FakeConfigFolder
    doc = DropboxIngestDoc(
        connector_config=fake_config,
        read_config=fake_config,
        processor_config=fake_config,
        remote_file_path="fake_file2.txt",
    )

    json_path = doc._output_filename
    local_path = doc._tmp_download_file()

    assert json_path == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert local_path == Path("/fakeuser/fake_download/fake_file2.txt")
def test_dropbox_folder_fails():
    """Pin the WRONG paths produced when remote_file_path starts with a slash.

    With the folder config, the expected values below contain neither the
    output directory nor the download directory: the path joining is
    sensitive to the leading slash.
    """
    doc_kwargs = {
        "connector_config": FakeConfigFolder,
        "read_config": FakeConfigFolder,
        "processor_config": FakeConfigFolder,
        "remote_file_path": "/fake_file2.txt",
    }
    doc = DropboxIngestDoc(**doc_kwargs)

    json_path = doc._output_filename
    local_path = doc._tmp_download_file()

    assert json_path == Path("/fake_file2.txt.json")
    assert local_path == Path("/fake_file2.txt")
def test_fsspec_folder_succeeds():
    """
    Generic fsspec doc with the folder config and no leading slash on
    remote_file_path: the output and download paths are expected to sit
    inside the configured output and download directories.
    """
    fake_config = FakeConfigFolder
    fsspec_doc = FsspecIngestDoc(
        connector_config=fake_config,
        read_config=fake_config,
        processor_config=fake_config,
        remote_file_path="fake_file2.txt",
    )
    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")

    actual_output = fsspec_doc._output_filename
    actual_download = fsspec_doc._tmp_download_file()

    assert actual_output == expected_output
    assert actual_download == expected_download
def test_fsspec_folder_fails():
    """Pin the WRONG paths produced when remote_file_path starts with a slash.

    With the folder config, the expected values below contain neither the
    output directory nor the download directory: the path joining is
    sensitive to the leading slash.
    """
    fake_config = FakeConfigFolder
    fsspec_doc = FsspecIngestDoc(
        connector_config=fake_config,
        read_config=fake_config,
        processor_config=fake_config,
        remote_file_path="/fake_file2.txt",
    )

    json_path = fsspec_doc._output_filename
    local_path = fsspec_doc._tmp_download_file()

    assert json_path == Path("/fake_file2.txt.json")
    assert local_path == Path("/fake_file2.txt")
def test_post_init_invalid_protocol():
    """Building an FsspecConfig from an ftp:// URL is expected to raise ValueError."""
    invalid_url = "ftp://example.com/path/to/file.txt"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=invalid_url)
def test_fsspec_path_extraction_dropbox_root():
    """Check the fields extracted from the Dropbox root URL ("dropbox:// /")."""
    root_url = "dropbox:// /"
    parsed = FsspecConfig(remote_url=root_url)

    assert parsed.protocol == "dropbox"
    assert parsed.path_without_protocol == " /"
    # The directory is a single space and no file component is expected.
    assert parsed.dir_path == " "
    assert parsed.file_path == ""
def test_fsspec_path_extraction_dropbox_subfolder():
    """Check the fields extracted from a Dropbox subfolder URL."""
    subfolder_url = "dropbox://path"
    parsed = FsspecConfig(remote_url=subfolder_url)

    assert parsed.protocol == "dropbox"
    assert parsed.path_without_protocol == "path"
    # The whole path is the directory; no file component is expected.
    assert parsed.dir_path == "path"
    assert parsed.file_path == ""
def test_fsspec_path_extraction_s3_bucket_only():
    """Check the fields extracted from an s3 URL that names only a bucket."""
    bucket_url = "s3://bucket-name"
    parsed = FsspecConfig(remote_url=bucket_url)

    assert parsed.protocol == "s3"
    assert parsed.path_without_protocol == "bucket-name"
    # The bucket is the directory; no file component is expected.
    assert parsed.dir_path == "bucket-name"
    assert parsed.file_path == ""
def test_fsspec_path_extraction_s3_valid_path():
    """Check the fields extracted from an s3 URL with a bucket and a file path."""
    object_url = "s3://bucket-name/path/to/file.txt"
    parsed = FsspecConfig(remote_url=object_url)

    assert parsed.protocol == "s3"
    assert parsed.path_without_protocol == "bucket-name/path/to/file.txt"
    # The bucket is the directory; everything after it is the file path.
    assert parsed.dir_path == "bucket-name"
    assert parsed.file_path == "path/to/file.txt"
def test_fsspec_path_extraction_s3_invalid_path():
    """An s3 URL with a triple slash (mimicking the Dropbox form) is expected
    to raise ValueError."""
    triple_slash_url = "s3:///bucket-name/path/to"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=triple_slash_url)
def test_sftp_path_extraction_post_init_with_extension():
    """Check sftp path extraction when the URL ends in a file with an extension.

    The access config is created with an empty host; the test expects the
    host to be taken from the URL and the port to stay at 22.
    """
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/file.txt",
        access_config=access,
    )

    assert sftp_config.file_path == "file.txt"
    assert sftp_config.dir_path == "path/to"
    assert sftp_config.path_without_protocol == "path/to"
    assert sftp_config.access_config.host == "example.com"
    assert sftp_config.access_config.port == 22
def test_sftp_path_extraction_without_extension():
    """Check sftp path extraction when the last URL segment has no extension.

    The test expects the whole path to be treated as a directory, with an
    empty file path, the host taken from the URL and the port left at 22.
    """
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/directory",
        access_config=access,
    )

    assert sftp_config.file_path == ""
    assert sftp_config.dir_path == "path/to/directory"
    assert sftp_config.path_without_protocol == "path/to/directory"
    assert sftp_config.access_config.host == "example.com"
    assert sftp_config.access_config.port == 22
def test_sftp_path_extraction_with_port():
    """Check sftp path extraction when the URL carries a non-default port.

    The access config is created with port 22; the test expects the port
    from the URL (47474) to end up on the access config instead.
    """
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    sftp_config = SimpleSftpConfig(
        remote_url="sftp://example.com:47474/path/to/file.txt",
        access_config=access,
    )

    assert sftp_config.file_path == "file.txt"
    assert sftp_config.dir_path == "path/to"
    assert sftp_config.path_without_protocol == "path/to"
    assert sftp_config.access_config.host == "example.com"
    assert sftp_config.access_config.port == 47474