feat: Salesforce connector accepts key path or value (#2321) (#2327)

Solution to issue
https://github.com/Unstructured-IO/unstructured/issues/2321.

The simple_salesforce API accepts either a private key file path or the key
value itself. This PR adds the same support to the Ingest connector.

Salesforce parameter "private-key-file" has been renamed to
"private-key".
It can contain one of the following:
- path to PEM encoded key file (as string)
- key contents (PEM encoded string)

The provided value is first parsed as a PEM encoded private key. Only if
parsing fails is it treated as a path and checked for file existence. This way
private key contents are not exposed to unnecessary underlying function calls.
This commit is contained in:
jakub-sandomierz-deepsense-ai 2024-01-11 12:15:24 +01:00 committed by GitHub
parent 5581e6a4c4
commit 411aa98bbf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 89 additions and 12 deletions

View File

@ -4,6 +4,7 @@
* **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries.
* **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks.
* **Salesforce connector accepts private key path or value.** The Salesforce parameter `private-key-file` has been renamed to `private-key`. The private key can be provided either as a path to the key file or as the file contents.
### Features

View File

@ -4,7 +4,7 @@ unstructured-ingest \
salesforce \
--username "$SALESFORCE_USERNAME" \
--consumer-key "$SALESFORCE_CONSUMER_KEY" \
--private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
--private-key "$SALESFORCE_PRIVATE_KEY_PATH" \
--categories "EmailMessage,Account,Lead,Case,Campaign" \
--output-dir salesforce-output \
--num-processes 2 \

View File

@ -4,7 +4,7 @@ unstructured-ingest \
salesforce \
--username "$SALESFORCE_USERNAME" \
--consumer-key "$SALESFORCE_CONSUMER_KEY" \
--private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
--private-key "$SALESFORCE_PRIVATE_KEY_PATH" \
--categories "EmailMessage,Account,Lead,Case,Campaign" \
--output-dir salesforce-output \
--num-processes 2 \

View File

@ -21,7 +21,7 @@ if __name__ == "__main__":
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
),
username=os.getenv("SALESFORCE_USERNAME"),
private_key_path=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
private_key=os.getenv("SALESFORCE_PRIVATE_KEY_PATH"),
categories=["EmailMessage", "Account", "Lead", "Case", "Campaign"],
recursive=True,
),

View File

@ -11,7 +11,7 @@
# https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
# https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
# private-key-path is the path to the key file
# private-key is the path to the key file or key contents
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1
@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
salesforce \
--username "$SALESFORCE_USERNAME" \
--consumer-key "$SALESFORCE_CONSUMER_KEY" \
--private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
--private-key "$SALESFORCE_PRIVATE_KEY_PATH" \
--categories "EmailMessage,Account,Lead,Case,Campaign" \
--output-dir salesforce-output \
--preserve-downloads \

View File

@ -50,7 +50,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--download-dir "$DOWNLOAD_DIR" \
--username "$SALESFORCE_USERNAME" \
--consumer-key "$SALESFORCE_CONSUMER_KEY" \
--private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
--private-key "$SALESFORCE_PRIVATE_KEY_PATH" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
--preserve-downloads \

View File

@ -0,0 +1,57 @@
from pathlib import Path
from unittest.mock import MagicMock
import pytest
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import dsa, ec, rsa
from unstructured.ingest.connector.salesforce import SalesforceAccessConfig
def pkey_to_str(key) -> str:
    """Serialize a private key object to an unencrypted PKCS8 PEM string."""
    pem_bytes = key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption(),
    )
    return pem_bytes.decode("utf-8")
def rsa_private_key() -> str:
    """Return a freshly generated RSA private key as a PEM string.

    A 2048-bit modulus is used: newer ``cryptography`` releases refuse to
    generate RSA keys smaller than 1024 bits, so the former 512-bit key makes
    this helper (and test collection, since it runs inside ``parametrize``)
    fail on those versions.
    """
    return pkey_to_str(rsa.generate_private_key(public_exponent=65537, key_size=2048))
def brainpoolp512r1_private_key() -> str:
    """Return a freshly generated brainpoolP512r1 EC private key as a PEM string."""
    # The API expects a curve *instance*; passing the bare class is deprecated
    # and slated to become an error in ``cryptography``.
    return pkey_to_str(ec.generate_private_key(ec.BrainpoolP512R1()))
def dsa_private_key() -> str:
    """Return a freshly generated 1024-bit DSA private key as a PEM string."""
    generated_key = dsa.generate_private_key(key_size=1024)
    return pkey_to_str(generated_key)
@pytest.mark.parametrize(
    ("private_key", "private_key_type"),
    [
        (rsa_private_key(), str),
        (brainpoolp512r1_private_key(), str),
        (dsa_private_key(), str),
        ("some_path/priv.key", Path),
    ],
)
def test_private_key_type(mocker, private_key, private_key_type):
    """PEM contents are reported as ``str``; a non-PEM value that is a file as ``Path``."""
    # Every path is reported as an existing file, so the non-PEM case is
    # classified as a key file without touching the filesystem.
    mocker.patch("pathlib.Path.is_file", return_value=True)

    access_config = SalesforceAccessConfig(consumer_key="asdf", private_key=private_key)
    resolved_value, resolved_type = access_config.get_private_key_value_and_type()

    assert resolved_type == private_key_type
    assert resolved_value == private_key
def test_private_key_type_fail(mocker):
    """A value that is neither PEM contents nor an existing file is rejected."""
    # Report the path as missing so the file-existence fallback fails too.
    mocker.patch("pathlib.Path.is_file", return_value=False)
    given_nonexistent_path = "some_path/priv.key"

    # Build the config outside the ``raises`` block: only the resolution call
    # is expected to fail, and an error from construction must not make the
    # test pass by accident.
    config = SalesforceAccessConfig(consumer_key="asdf", private_key=given_nonexistent_path)

    with pytest.raises(ValueError, match="private_key does not contain"):
        config.get_private_key_value_and_type()

View File

@ -31,11 +31,11 @@ class SalesforceCliConfig(SimpleSalesforceConfig, CliConfig):
help="For the Salesforce JWT auth. Found in Consumer Details.",
),
click.Option(
["--private-key-path"],
["--private-key"],
required=True,
type=click.Path(file_okay=True, exists=True, dir_okay=False),
help="Path to the private key for the Salesforce JWT auth. "
"Usually named server.key.",
type=str,
help="Path to the private key or its contents for the Salesforce JWT auth. "
"Key file is usually named server.key.",
),
click.Option(
["--categories"],

View File

@ -63,6 +63,23 @@ $htmlbody
@dataclass
class SalesforceAccessConfig(AccessConfig):
    """Sensitive credentials used for Salesforce JWT authentication.

    ``private_key`` holds either the PEM encoded private key contents or the
    path to a PEM encoded key file.
    """

    consumer_key: str = enhanced_field(sensitive=True)
    private_key: str = enhanced_field(sensitive=True)

    @requires_dependencies(["cryptography"])
    def get_private_key_value_and_type(self) -> t.Tuple[str, t.Type]:
        """Classify ``private_key`` as key contents or as a key file path.

        Returns:
            ``(private_key, str)`` when the value parses as a PEM private key,
            or ``(private_key, Path)`` when it names an existing file.

        Raises:
            ValueError: if the value is neither a parseable PEM private key
                nor the path of an existing file.
        """
        from cryptography.hazmat.primitives import serialization

        # Try PEM parsing first so that real key contents are never handed to
        # filesystem calls.
        try:
            serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
        except ValueError:
            pass
        else:
            return self.private_key, str

        # A malformed key (or any over-long / invalid string) can make the
        # filesystem check itself raise, e.g. "File name too long". That error
        # would embed the supposed filename -- possibly key material -- in its
        # message, so treat it as "not a file" instead of letting it escape.
        try:
            is_key_file = Path(self.private_key).is_file()
        except (OSError, ValueError):
            is_key_file = False

        if is_key_file:
            return self.private_key, Path

        raise ValueError("private_key does not contain PEM private key or path")
@dataclass
@ -72,17 +89,19 @@ class SimpleSalesforceConfig(BaseConnectorConfig):
access_config: SalesforceAccessConfig
categories: t.List[str]
username: str
private_key_path: str
recursive: bool = False
@requires_dependencies(["simple_salesforce"], extras="salesforce")
def get_client(self):
from simple_salesforce import Salesforce
pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
return Salesforce(
username=self.username,
consumer_key=self.access_config.consumer_key,
privatekey_file=self.private_key_path,
privatekey_file=pkey_value if pkey_type is Path else None,
privatekey=pkey_value if pkey_type is str else None,
version=SALESFORCE_API_VERSION,
)