Roman Isecke cc05e948ff
chore: sensitive info connector audit (#2227)
### Description
All other connectors that were not included in
https://github.com/Unstructured-IO/unstructured/pull/2194 are now
updated to follow the new pattern and mark any variables as sensitive
where it makes sense.
Core changes:
* All connectors now support an `AccessConfig` to mark data that's
needed for auth (i.e. username, password) and those that are sensitive
are designated appropriately using the new enhanced field.
* All cli configs on the cli definition now inherit from the base config
in the connector file to reuse the variables set on that dataclass
* The base writer class was updated to better generalize the new
approach given better use of dataclasses
* The base cli classes were refactored to also take into account the
need for a connector and write config when creating the respective
runner/writer classes.
* Any mismatch between the cli field name and the dataclass field name
were updated on the dataclass side to not impact the user but maintain
consistency
* Add custom redaction logic for mongodb URIs since the password is
expected to be a part of it. Now this:
`"mongodb+srv://ingest-test-user:r4hK3BD07b@ingest-test.hgaig.mongodb.net/"`
->
`"mongodb+srv://ingest-test-user:***REDACTED***@ingest-test.hgaig.mongodb.net/"`
in the logs
* Bundle all fsspec based files into their own packages. 
* Refactor custom `_decode_dataclass` used for enhanced json mixin by
using a monkey-patch approach. The original approach was breaking on
optional nested dataclasses when serializing since the other methods in
`dataclasses_json_core` weren't using the new method. By monkey-patching
the original method with a new one, all other methods in that library
would use the new one.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
2023-12-11 17:37:49 +00:00

130 lines
4.3 KiB
Python

from dataclasses import dataclass
from pathlib import Path
from unstructured.ingest.connector.fsspec.dropbox import (
DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec.fsspec import (
FsspecIngestDoc,
)
@dataclass
class FakeConfigDropboxRoot:
output_dir = "/fakeuser/fake_output"
dir_path = " "
download_dir = "/fakeuser/fake_download"
path_without_protocol = " "
@dataclass
class FakeConfigFolder:
output_dir = "/fakeuser/fake_output"
dir_path = "fake_folder"
download_dir = "/fakeuser/fake_download"
path_without_protocol = "fake_folder"
def test_dropbox_root_succeeds():
"""
Test that path joining method works for Dropbox root folder.
Note slash in front of remote_file_path.
"""
dbox = DropboxIngestDoc(
connector_config=FakeConfigDropboxRoot,
read_config=FakeConfigDropboxRoot,
processor_config=FakeConfigDropboxRoot,
remote_file_path="/fake_file.txt",
)
output_filename = dbox._output_filename
download_filename = dbox._tmp_download_file()
assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
assert download_filename == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_root_succeeds2():
"""
Test that path joining method works for Dropbox root folder.
Note lack of slash in front of remote_file_path. This still works.
"""
dbox = DropboxIngestDoc(
connector_config=FakeConfigDropboxRoot,
read_config=FakeConfigDropboxRoot,
processor_config=FakeConfigDropboxRoot,
remote_file_path="fake_file.txt",
)
output_filename = dbox._output_filename
download_filename = dbox._tmp_download_file()
assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
assert download_filename == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_folder_succeeds():
"""
Test that path joining method works for Dropbox root folder.
Note no slash in front of remote_file_path.
"""
dbox = DropboxIngestDoc(
connector_config=FakeConfigFolder,
read_config=FakeConfigFolder,
processor_config=FakeConfigFolder,
remote_file_path="fake_file2.txt",
)
output_filename = dbox._output_filename
download_filename = dbox._tmp_download_file()
assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt")
def test_dropbox_folder_fails():
"""Test that path joining method gives WRONG path. Note slash in front of remote_file_path.
Path joining is sensitive. Note that the path is MISSING the folders."""
dbox = DropboxIngestDoc(
connector_config=FakeConfigFolder,
read_config=FakeConfigFolder,
processor_config=FakeConfigFolder,
remote_file_path="/fake_file2.txt",
)
output_filename = dbox._output_filename
download_filename = dbox._tmp_download_file()
assert output_filename == Path("/fake_file2.txt.json")
assert download_filename == Path("/fake_file2.txt")
def test_fsspec_folder_succeeds():
"""
Test that path joining method works for root folder.
Note no slash in front of remote_file_path.
"""
dbox = FsspecIngestDoc(
connector_config=FakeConfigFolder,
read_config=FakeConfigFolder,
processor_config=FakeConfigFolder,
remote_file_path="fake_file2.txt",
)
output_filename = dbox._output_filename
download_filename = dbox._tmp_download_file()
assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt")
def test_fsspec_folder_fails():
"""Test that path joining method gives WRONG path. Note slash in front of remote_file_path.
Path joining is sensitive. Note that the path is MISSING the folders."""
fstest = FsspecIngestDoc(
connector_config=FakeConfigFolder,
read_config=FakeConfigFolder,
processor_config=FakeConfigFolder,
remote_file_path="/fake_file2.txt",
)
output_filename = fstest._output_filename
download_filename = fstest._tmp_download_file()
assert output_filename == Path("/fake_file2.txt.json")
assert download_filename == Path("/fake_file2.txt")