mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-19 15:06:21 +00:00

MongoDB connector: Issue: The [MongoDB documentation](https://www.mongodb.com/docs/manual/reference/connection-string/) states that the characters `$ : / ? # [ ] @` must be percent-encoded. A URI whose password contains such a special character will not be redacted. Fix: This fix removes the usage of `unquote_plus` on the password, which allows the detected password to match the one inside the URI so that it is successfully replaced. Git connector: Added very basic unit tests for the repository filtering methods. Their impact is rather minimal, but they showcase the current limitation in the `is_file_type_supported` method.
62 lines
2.2 KiB
Python
62 lines
2.2 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from unstructured.ingest.connector.git import GitAccessConfig, GitSourceConnector, SimpleGitConfig
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("given_file_path", "then_is_supported"),
    [
        # Paths with a document-like extension are expected to be accepted.
        (Path("src/submodule/document.md"), True),
        (Path("src/submodule/document.txt"), True),
        (Path("src/submodule/document.pdf"), True),
        (Path("src/submodule/document.doc"), True),
        (Path("src/submodule/document.docx"), True),
        (Path("src/submodule/document.eml"), True),
        (Path("src/submodule/document.html"), True),
        (Path("src/submodule/document.png"), True),
        (Path("src/submodule/document.jpg"), True),
        (Path("src/submodule/document.ppt"), True),
        (Path("src/submodule/document.pptx"), True),
        (Path("src/submodule/document.xml"), True),
        # Source code and extension-less files are expected to be rejected.
        (Path("src/submodule/code.py"), False),
        (Path("src/submodule/Dockerfile"), False),
        (Path("src/submodule/Makefile"), False),
        (Path("src/submodule/LICENSE"), False),
    ],
)
def test_connector_supports_file(given_file_path, then_is_supported):
    """Check `GitSourceConnector.is_file_type_supported` against the expected flag.

    Each case passes the path as a plain string and compares the returned
    value with the expectation from the parameter table.
    """
    path_as_text = str(given_file_path)
    actual_is_supported = GitSourceConnector.is_file_type_supported(path_as_text)

    assert actual_is_supported == then_is_supported
|
|
|
|
|
|
class FakeGitSourceConnectorImpl(GitSourceConnector):
    """Test double for `GitSourceConnector` with a stubbed `get_ingest_docs`.

    NOTE(review): presumably the override exists so the connector can be
    instantiated in tests -- confirm against the base class definition.
    """

    def get_ingest_docs(self):
        """Do nothing and return None; ingest docs are not needed by these tests."""
        return None
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("given_file_path", "given_file_glob", "then_matches_glob"),
    [
        # No glob configured: the path is expected to match.
        (Path("LICENSE"), None, True),
        # Exact file name used as the glob.
        (Path("Makefile"), ["Makefile"], True),
        # Recursive pattern: `.py` is expected to match, `.pyc` is not.
        (Path("src/my/super/module/main.py"), ["**/*.py"], True),
        (Path("src/my/super/module/main.pyc"), ["**/*.py"], False),
    ],
)
def test_connector_does_path_match_glob(given_file_path, given_file_glob, then_matches_glob):
    """Check `does_path_match_glob` on a connector configured with `given_file_glob`.

    The connector is built from placeholder URL/token values and the glob
    under test; the path is passed to the method as a plain string.
    """
    fake_access = GitAccessConfig(access_token="some_fake_token")
    fake_config = SimpleGitConfig(
        url="some_fake_url",
        access_config=fake_access,
        file_glob=given_file_glob,
    )
    fake_connector = FakeGitSourceConnectorImpl(
        processor_config=None,
        read_config=None,
        connector_config=fake_config,
    )

    actual_matches_glob = fake_connector.does_path_match_glob(str(given_file_path))

    assert actual_matches_glob == then_matches_glob
|