From 411aa98bbf6879f5a72f2557c6f3588bdd98d95a Mon Sep 17 00:00:00 2001 From: jakub-sandomierz-deepsense-ai <116811567+jakub-sandomierz-deepsense-ai@users.noreply.github.com> Date: Thu, 11 Jan 2024 12:15:24 +0100 Subject: [PATCH] feat: Salesforce connector accepts key path or value (#2321) (#2327) Solution to issue https://github.com/Unstructured-IO/unstructured/issues/2321. simple_salesforce API allows for passing private key path or value. This PR introduces this support for the Ingest connector. Salesforce parameter "private-key-path" has been renamed to "private-key". It can contain one of the following: - path to PEM encoded key file (as string) - key contents (PEM encoded string) If the provided value cannot be parsed as a PEM-encoded private key, then the file existence is checked. This way private key contents are not exposed to unnecessary underlying function calls. --- CHANGELOG.md | 1 + .../source_connectors/code/bash/salesforce.sh | 2 +- .../code/bash/salesforce_api.sh | 2 +- .../code/python/salesforce_api.py | 2 +- examples/ingest/salesforce/ingest.sh | 4 +- test_unstructured_ingest/src/salesforce.sh | 2 +- .../unit/test_salesforce_connector.py | 57 +++++++++++++++++++ unstructured/ingest/cli/cmds/salesforce.py | 8 +-- unstructured/ingest/connector/salesforce.py | 23 +++++++- 9 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 test_unstructured_ingest/unit/test_salesforce_connector.py diff --git a/CHANGELOG.md b/CHANGELOG.md index bcf637a2d..01872aa4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries. * **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. 
Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks. +* **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-path` has been renamed to `private-key`. The private key can be provided as a path to a file or as the file contents. ### Features diff --git a/docs/source/ingest/source_connectors/code/bash/salesforce.sh b/docs/source/ingest/source_connectors/code/bash/salesforce.sh index 9ddae6c89..0996aa5f6 100644 --- a/docs/source/ingest/source_connectors/code/bash/salesforce.sh +++ b/docs/source/ingest/source_connectors/code/bash/salesforce.sh @@ -4,7 +4,7 @@ unstructured-ingest \ salesforce \ --username "$SALESFORCE_USERNAME" \ --consumer-key "$SALESFORCE_CONSUMER_KEY" \ - --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \ + --private-key "$SALESFORCE_PRIVATE_KEY_PATH" \ --categories "EmailMessage,Account,Lead,Case,Campaign" \ --output-dir salesforce-output \ --num-processes 2 \ diff --git a/docs/source/ingest/source_connectors/code/bash/salesforce_api.sh b/docs/source/ingest/source_connectors/code/bash/salesforce_api.sh index 6336d73de..e2ae76d9f 100644 --- a/docs/source/ingest/source_connectors/code/bash/salesforce_api.sh +++ b/docs/source/ingest/source_connectors/code/bash/salesforce_api.sh @@ -4,7 +4,7 @@ unstructured-ingest \ salesforce \ --username "$SALESFORCE_USERNAME" \ --consumer-key "$SALESFORCE_CONSUMER_KEY" \ - --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \ + --private-key "$SALESFORCE_PRIVATE_KEY_PATH" \ --categories "EmailMessage,Account,Lead,Case,Campaign" \ --output-dir salesforce-output \ --num-processes 2 \ diff --git a/docs/source/ingest/source_connectors/code/python/salesforce_api.py b/docs/source/ingest/source_connectors/code/python/salesforce_api.py index 04631d08d..2c6a095b7 100644 --- a/docs/source/ingest/source_connectors/code/python/salesforce_api.py +++ b/docs/source/ingest/source_connectors/code/python/salesforce_api.py @@ -21,7 +21,7 @@ if __name__ == 
"__main__": consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"), ), username=os.getenv("SALESFORCE_USERNAME"), - private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"), + private_key=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"), categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"], recursive=True, ), diff --git a/examples/ingest/salesforce/ingest.sh b/examples/ingest/salesforce/ingest.sh index d40b7bebd..884deea0d 100755 --- a/examples/ingest/salesforce/ingest.sh +++ b/examples/ingest/salesforce/ingest.sh @@ -11,7 +11,7 @@ # https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm # https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm -# private-key-path is the path to the key file +# private-key is the path to the key file or key contents SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) cd "$SCRIPT_DIR"/../../.. || exit 1 @@ -20,7 +20,7 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ salesforce \ --username "$SALESFORCE_USERNAME" \ --consumer-key "$SALESFORCE_CONSUMER_KEY" \ - --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \ + --private-key "$SALESFORCE_PRIVATE_KEY_PATH" \ --categories "EmailMessage,Account,Lead,Case,Campaign" \ --output-dir salesforce-output \ --preserve-downloads \ diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 623c19fb1..7e3f55f72 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -50,7 +50,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --download-dir "$DOWNLOAD_DIR" \ --username "$SALESFORCE_USERNAME" \ --consumer-key "$SALESFORCE_CONSUMER_KEY" \ - --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \ + --private-key "$SALESFORCE_PRIVATE_KEY_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --preserve-downloads \ diff --git a/test_unstructured_ingest/unit/test_salesforce_connector.py b/test_unstructured_ingest/unit/test_salesforce_connector.py new file mode 100644 index 000000000..4d0b5166d --- /dev/null +++ b/test_unstructured_ingest/unit/test_salesforce_connector.py @@ -0,0 +1,57 @@ +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from cryptography.hazmat.primitives import serialization +from cryptography.hazmat.primitives.asymmetric import dsa, ec, rsa + +from unstructured.ingest.connector.salesforce import SalesforceAccessConfig + + +def pkey_to_str(key) -> str: + return key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ).decode("utf-8") + + +def rsa_private_key() -> str: + return pkey_to_str(rsa.generate_private_key(0x10001, 512)) + + +def 
brainpoolp512r1_private_key() -> str: + return pkey_to_str(ec.generate_private_key(ec.BrainpoolP512R1)) + + +def dsa_private_key() -> str: + return pkey_to_str(dsa.generate_private_key(1024)) + + +@pytest.mark.parametrize( + ("private_key", "private_key_type"), + [ + (rsa_private_key(), str), + (brainpoolp512r1_private_key(), str), + (dsa_private_key(), str), + ("some_path/priv.key", Path), + ], +) +def test_private_key_type(mocker, private_key, private_key_type): + mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") + mocked_isfile.return_value = True + + config = SalesforceAccessConfig(consumer_key="asdf", private_key=private_key) + actual_pkey_value, actual_pkey_type = config.get_private_key_value_and_type() + assert actual_pkey_type == private_key_type + assert actual_pkey_value == private_key + + +def test_private_key_type_fail(mocker): + mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") + mocked_isfile.return_value = False + + given_nonexistent_path = "some_path/priv.key" + with pytest.raises(expected_exception=ValueError): + config = SalesforceAccessConfig(consumer_key="asdf", private_key=given_nonexistent_path) + config.get_private_key_value_and_type() diff --git a/unstructured/ingest/cli/cmds/salesforce.py b/unstructured/ingest/cli/cmds/salesforce.py index 7a9afaeb3..a6d7119a1 100644 --- a/unstructured/ingest/cli/cmds/salesforce.py +++ b/unstructured/ingest/cli/cmds/salesforce.py @@ -31,11 +31,11 @@ class SalesforceCliConfig(SimpleSalesforceConfig, CliConfig): help="For the Salesforce JWT auth. Found in Consumer Details.", ), click.Option( - ["--private-key-path"], + ["--private-key"], required=True, - type=click.Path(file_okay=True, exists=True, dir_okay=False), - help="Path to the private key for the Salesforce JWT auth. " - "Usually named server.key.", + type=str, + help="Path to the private key or its contents for the Salesforce JWT auth. 
" + "Key file is usually named server.key.", ), click.Option( ["--categories"], diff --git a/unstructured/ingest/connector/salesforce.py b/unstructured/ingest/connector/salesforce.py index 29c331f41..af8c89952 100644 --- a/unstructured/ingest/connector/salesforce.py +++ b/unstructured/ingest/connector/salesforce.py @@ -63,6 +63,23 @@ $htmlbody @dataclass class SalesforceAccessConfig(AccessConfig): consumer_key: str = enhanced_field(sensitive=True) + private_key: str = enhanced_field(sensitive=True) + + @requires_dependencies(["cryptography"]) + def get_private_key_value_and_type(self) -> t.Tuple[str, t.Type]: + from cryptography.hazmat.primitives import serialization + + try: + serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None) + except ValueError: + pass + else: + return self.private_key, str + + if Path(self.private_key).is_file(): + return self.private_key, Path + + raise ValueError("private_key does not contain PEM private key or path") @dataclass @@ -72,17 +89,19 @@ class SimpleSalesforceConfig(BaseConnectorConfig): access_config: SalesforceAccessConfig categories: t.List[str] username: str - private_key_path: str recursive: bool = False @requires_dependencies(["simple_salesforce"], extras="salesforce") def get_client(self): from simple_salesforce import Salesforce + pkey_value, pkey_type = self.access_config.get_private_key_value_and_type() + return Salesforce( username=self.username, consumer_key=self.access_config.consumer_key, - privatekey_file=self.private_key_path, + privatekey_file=pkey_value if pkey_type is Path else None, + privatekey=pkey_value if pkey_type is str else None, version=SALESFORCE_API_VERSION, )