
from dataclasses import dataclass
from pathlib import Path
import pytest
from unstructured.ingest.connector.fsspec.dropbox import (
DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec.fsspec import (
FsspecIngestDoc,
)
from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig
from unstructured.ingest.interfaces import (
FsspecConfig,
)
@dataclass
class FakeConfigDropboxRoot:
    """Stub config simulating a Dropbox root folder (dir path is a single space).

    Attributes are plain class-level values (unannotated, so not dataclass
    fields), letting the class object itself stand in for a config instance.
    """

    output_dir = "/fakeuser/fake_output"
    download_dir = "/fakeuser/fake_download"
    dir_path = " "
    path_without_protocol = " "
@dataclass
class FakeConfigFolder:
    """Stub config simulating a named remote folder ("fake_folder").

    Attributes are plain class-level values (unannotated, so not dataclass
    fields), letting the class object itself stand in for a config instance.
    """

    output_dir = "/fakeuser/fake_output"
    download_dir = "/fakeuser/fake_download"
    dir_path = "fake_folder"
    path_without_protocol = "fake_folder"
def test_dropbox_root_succeeds():
    """
    Test that path joining method works for Dropbox root folder.
    Note slash in front of remote_file_path.
    """
    dbox = DropboxIngestDoc(
        connector_config=FakeConfigDropboxRoot,
        read_config=FakeConfigDropboxRoot,
        processor_config=FakeConfigDropboxRoot,
        remote_file_path="/fake_file.txt",
    )
    output_filename = dbox._output_filename
    download_filename = dbox._tmp_download_file()
    assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert download_filename == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_root_succeeds2():
    """
    Test that path joining method works for Dropbox root folder.
    Note lack of slash in front of remote_file_path. This still works.
    """
    dbox = DropboxIngestDoc(
        connector_config=FakeConfigDropboxRoot,
        read_config=FakeConfigDropboxRoot,
        processor_config=FakeConfigDropboxRoot,
        remote_file_path="fake_file.txt",
    )
    output_filename = dbox._output_filename
    download_filename = dbox._tmp_download_file()
    assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json")
    assert download_filename == Path("/fakeuser/fake_download/fake_file.txt")
def test_dropbox_folder_succeeds():
    """
    Test that path joining method works for Dropbox root folder.
    Note no slash in front of remote_file_path.
    """
    dbox = DropboxIngestDoc(
        connector_config=FakeConfigFolder,
        read_config=FakeConfigFolder,
        processor_config=FakeConfigFolder,
        remote_file_path="fake_file2.txt",
    )
    output_filename = dbox._output_filename
    download_filename = dbox._tmp_download_file()
    assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt")
def test_dropbox_folder_fails():
    """Test that path joining method gives WRONG path. Note slash in front of remote_file_path.
    Path joining is sensitive. Note that the path is MISSING the folders."""
    dbox = DropboxIngestDoc(
        connector_config=FakeConfigFolder,
        read_config=FakeConfigFolder,
        processor_config=FakeConfigFolder,
        remote_file_path="/fake_file2.txt",
    )
    output_filename = dbox._output_filename
    download_filename = dbox._tmp_download_file()
    # The leading slash swallows the configured folder, yielding root-level paths.
    assert output_filename == Path("/fake_file2.txt.json")
    assert download_filename == Path("/fake_file2.txt")
def test_fsspec_folder_succeeds():
    """
    Test that path joining method works for root folder.
    Note no slash in front of remote_file_path.
    """
    # Renamed local from `dbox` to `fstest`: this is an FsspecIngestDoc, matching
    # the naming used in test_fsspec_folder_fails below.
    fstest = FsspecIngestDoc(
        connector_config=FakeConfigFolder,
        read_config=FakeConfigFolder,
        processor_config=FakeConfigFolder,
        remote_file_path="fake_file2.txt",
    )
    output_filename = fstest._output_filename
    download_filename = fstest._tmp_download_file()
    assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json")
    assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt")
def test_fsspec_folder_fails():
    """Test that path joining method gives WRONG path. Note slash in front of remote_file_path.
    Path joining is sensitive. Note that the path is MISSING the folders."""
    fstest = FsspecIngestDoc(
        connector_config=FakeConfigFolder,
        read_config=FakeConfigFolder,
        processor_config=FakeConfigFolder,
        remote_file_path="/fake_file2.txt",
    )
    output_filename = fstest._output_filename
    download_filename = fstest._tmp_download_file()
    # The leading slash swallows the configured folder, yielding root-level paths.
    assert output_filename == Path("/fake_file2.txt.json")
    assert download_filename == Path("/fake_file2.txt")
def test_post_init_invalid_protocol():
    """An unsupported protocol in remote_url must raise ValueError at construction."""
    bad_url = "ftp://example.com/path/to/file.txt"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=bad_url)
def test_fsspec_path_extraction_dropbox_root():
    """Path extraction for the dropbox root URL ("dropbox:// /")."""
    config = FsspecConfig(remote_url="dropbox:// /")
    extracted = (config.protocol, config.path_without_protocol, config.dir_path, config.file_path)
    assert extracted == ("dropbox", " /", " ", "")
def test_fsspec_path_extraction_dropbox_subfolder():
    """Path extraction for a dropbox subfolder URL."""
    config = FsspecConfig(remote_url="dropbox://path")
    extracted = (config.protocol, config.path_without_protocol, config.dir_path, config.file_path)
    assert extracted == ("dropbox", "path", "path", "")
def test_fsspec_path_extraction_s3_bucket_only():
    """Path extraction for an s3 URL naming only a bucket (no file)."""
    config = FsspecConfig(remote_url="s3://bucket-name")
    extracted = (config.protocol, config.path_without_protocol, config.dir_path, config.file_path)
    assert extracted == ("s3", "bucket-name", "bucket-name", "")
def test_fsspec_path_extraction_s3_valid_path():
    """Path extraction for an s3 URL with bucket plus file path."""
    config = FsspecConfig(remote_url="s3://bucket-name/path/to/file.txt")
    extracted = (config.protocol, config.path_without_protocol, config.dir_path, config.file_path)
    assert extracted == ("s3", "bucket-name/path/to/file.txt", "bucket-name", "path/to/file.txt")
def test_fsspec_path_extraction_s3_invalid_path():
    """An s3 URL with a triple slash (dropbox-style) must raise ValueError."""
    bad_url = "s3:///bucket-name/path/to"
    with pytest.raises(ValueError):
        FsspecConfig(remote_url=bad_url)
def test_sftp_path_extraction_post_init_with_extension():
    """Path extraction for an sftp URL ending in a file with an extension."""
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/file.txt",
        access_config=access,
    )
    assert (config.file_path, config.dir_path) == ("file.txt", "path/to")
    assert config.path_without_protocol == "path/to"
    # host/port are back-filled onto the access config from the URL
    assert (config.access_config.host, config.access_config.port) == ("example.com", 22)
def test_sftp_path_extraction_without_extension():
    """Path extraction for an sftp URL ending in a directory (no extension)."""
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/directory",
        access_config=access,
    )
    assert (config.file_path, config.dir_path) == ("", "path/to/directory")
    assert config.path_without_protocol == "path/to/directory"
    # host/port are back-filled onto the access config from the URL
    assert (config.access_config.host, config.access_config.port) == ("example.com", 22)
def test_sftp_path_extraction_with_port():
    """Path extraction for an sftp URL carrying a non-default port."""
    access = SftpAccessConfig(username="username", password="password", host="", port=22)
    config = SimpleSftpConfig(
        remote_url="sftp://example.com:47474/path/to/file.txt",
        access_config=access,
    )
    assert (config.file_path, config.dir_path) == ("file.txt", "path/to")
    assert config.path_without_protocol == "path/to"
    # the URL's explicit port overrides the one passed in the access config
    assert (config.access_config.host, config.access_config.port) == ("example.com", 47474)