rfctr [P6M-398]: salesforce connector v2 (#3344)

Updates salesforce source connector to v2.
This commit is contained in:
David Potter 2024-07-09 09:46:58 -07:00 committed by GitHub
parent 176875bf26
commit db1e6993a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 2876 additions and 2003 deletions

View File

@ -1,4 +1,4 @@
## 0.14.11-dev2
## 0.14.11-dev3
### Enhancements

View File

@ -1,98 +1,110 @@
[
{
"type": "NarrativeText",
"element_id": "0773da7069b4dacf1ec43e7a1d39f26e",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:34:16",
"date_modified": "2023-08-21T14:34:16",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1"
},
"text": "Jane. This is a test of sending you an email from Salesforce!",
"type": "NarrativeText"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErPIAU"
},
"date_created": "1692542056.0",
"date_modified": "1692628456.0"
}
}
},
{
"type": "UncategorizedText",
"element_id": "ad17b8af68c2c668b2219926d2fecde9",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:34:16",
"date_modified": "2023-08-21T14:34:16",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1"
},
"text": "_____________________________________________________________________",
"type": "UncategorizedText"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErPIAU"
},
"date_created": "1692542056.0",
"date_modified": "1692628456.0"
}
}
},
{
"type": "Title",
"element_id": "02b3c747f7c5e0e553feb0f99c8a561f",
"text": "Powered by Salesforce",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:34:16",
"date_modified": "2023-08-21T14:34:16",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1"
},
"text": "Powered by Salesforce",
"type": "Title"
"subject": "Test of email 1",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErPIAU"
},
"date_created": "1692542056.0",
"date_modified": "1692628456.0"
}
}
},
{
"type": "Title",
"element_id": "07a4a605bad4cb7206377108a949ce59",
"text": "http://www.salesforce.com/",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:34:16",
"date_modified": "2023-08-21T14:34:16",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1"
},
"text": "http://www.salesforce.com/",
"type": "Title"
"subject": "Test of email 1",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErPIAU"
},
"date_created": "1692542056.0",
"date_modified": "1692628456.0"
}
}
}
]

View File

@ -1,170 +1,191 @@
[
{
"type": "Title",
"element_id": "bec7e6ad2ef7e1e8f16ef452637b26b5",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "Hey Sean.",
"type": "Title"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "NarrativeText",
"element_id": "a3e4b2201edb4fecdc7ef055e871310e",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "Testing email parsing here.",
"type": "NarrativeText"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "Title",
"element_id": "ff15ab0b7f162b02420efbb16866a2fe",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "Type: email",
"type": "Title"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "NarrativeText",
"element_id": "7b314c7653fd2890729cd4910778ad04",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "Just testing the email system",
"type": "NarrativeText"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "UncategorizedText",
"element_id": "c8805f36ed5571165d1f36ce32514b9a",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "_____________________________________________________________________",
"type": "UncategorizedText"
"metadata": {
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "Title",
"element_id": "d600397e8498fd001b8fc9efac09a97a",
"text": "Powered by Salesforce",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "Powered by Salesforce",
"type": "Title"
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
},
{
"type": "Title",
"element_id": "10c6ea4dc1b92ab41aefb37d7ca73013",
"text": "http://www.salesforce.com/",
"metadata": {
"data_source": {
"date_created": "2023-08-20T14:35:49",
"date_modified": "2023-08-20T14:35:55",
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "2023-09-14T11:40:03.000+0000"
},
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2"
},
"text": "http://www.salesforce.com/",
"type": "Title"
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
}
]

View File

@ -54,7 +54,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
--preserve-downloads \
--recursive \
--reprocess \
--output-dir "$OUTPUT_DIR" \
--verbose \

View File

@ -1 +1 @@
__version__ = "0.14.11-dev2" # pragma: no cover
__version__ = "0.14.11-dev3" # pragma: no cover

View File

@ -19,6 +19,7 @@ from .mongodb import mongodb_dest_cmd
from .onedrive import onedrive_drive_src_cmd
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
from .pinecone import pinecone_dest_cmd
from .salesforce import salesforce_src_cmd
from .sharepoint import sharepoint_drive_src_cmd
from .singlestore import singlestore_dest_cmd
from .sql import sql_dest_cmd
@ -35,6 +36,7 @@ src_cmds = [
onedrive_drive_src_cmd,
opensearch_src_cmd,
s3_src_cmd,
salesforce_src_cmd,
sharepoint_drive_src_cmd,
sftp_src_cmd,
]

View File

@ -0,0 +1,79 @@
from dataclasses import dataclass
import click
from unstructured.ingest.v2.cli.base import SrcCmd
from unstructured.ingest.v2.cli.interfaces import CliConfig
from unstructured.ingest.v2.cli.utils import DelimitedString
from unstructured.ingest.v2.processes.connectors.salesforce import (
ACCEPTED_CATEGORIES,
CONNECTOR_TYPE,
)
@dataclass
class SalesforceCliConnectionConfig(CliConfig):
@staticmethod
def get_cli_options() -> list[click.Option]:
options = [
click.Option(
["--username"],
required=True,
type=str,
help="Salesforce username usually looks like an email.",
),
click.Option(
["--consumer-key"],
required=True,
type=str,
help="For the Salesforce JWT auth. Found in Consumer Details.",
),
click.Option(
["--private-key"],
required=True,
type=str,
help="Path to the private key or its contents for the Salesforce JWT auth. "
"Key file is usually named server.key.",
),
]
return options
@dataclass
class SalesforceCliIndexerConfig(CliConfig):
@staticmethod
def get_cli_options() -> list[click.Option]:
possible_categories = ACCEPTED_CATEGORIES
options = [
click.Option(
["--categories"],
default=None,
required=True,
type=DelimitedString(choices=possible_categories),
help="Comma-delimited salesforce categories to download. "
"Currently only {}.".format(", ".join(possible_categories)),
),
]
return options
@dataclass
class SalesforceCliDownloadConfig(CliConfig):
@staticmethod
def get_cli_options() -> list[click.Option]:
options = [
click.Option(
["--download-dir"],
help="Where files are downloaded to, defaults to a location at"
"`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
),
]
return options
salesforce_src_cmd = SrcCmd(
cmd_name=CONNECTOR_TYPE,
connection_config=SalesforceCliConnectionConfig,
indexer_config=SalesforceCliIndexerConfig,
downloader_config=SalesforceCliDownloadConfig,
)

View File

@ -0,0 +1,43 @@
import os
from pathlib import Path
from unstructured.ingest.v2.interfaces import ProcessorConfig
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
from unstructured.ingest.v2.processes.connectors.local import (
LocalUploaderConfig,
)
from unstructured.ingest.v2.processes.connectors.salesforce import (
SalesforceAccessConfig,
SalesforceConnectionConfig,
SalesforceDownloaderConfig,
SalesforceIndexerConfig,
)
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
base_path = Path(__file__).parent.parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest"
output_path = work_dir / "output"
download_path = work_dir / "download"
if __name__ == "__main__":
logger.info(f"Writing all content in: {work_dir.resolve()}")
Pipeline.from_configs(
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]),
downloader_config=SalesforceDownloaderConfig(download_dir=download_path),
source_connection_config=SalesforceConnectionConfig(
SalesforceAccessConfig(
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
private_key=os.getenv("SALESFORCE_PRIVATE_KEY"),
),
username=os.getenv("SALESFORCE_USERNAME"),
),
partitioner_config=PartitionerConfig(strategy="fast"),
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
).run()

View File

@ -22,6 +22,8 @@ from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
from .onedrive import onedrive_source_entry
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
from .opensearch import opensearch_destination_entry, opensearch_source_entry
from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
from .salesforce import salesforce_source_entry
from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
from .sql import sql_destination_entry
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
@ -48,6 +50,8 @@ add_destination_entry(
destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
)
add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
add_destination_entry(

View File

@ -0,0 +1,293 @@
"""
Salesforce Connector
Able to download Account, Case, Campaign, EmailMessage, Lead
Salesforce returns everything as a list of json.
This saves each entry as a separate file to be partitioned.
Using JWT authorization
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
"""
import json
from collections import OrderedDict
from dataclasses import dataclass, field
from email.utils import formatdate
from pathlib import Path
from string import Template
from textwrap import dedent
from typing import TYPE_CHECKING, Any, Generator, Type
from dateutil import parser
from unstructured.documents.elements import DataSourceMetadata
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionNetworkError
from unstructured.ingest.v2.interfaces import (
AccessConfig,
ConnectionConfig,
Downloader,
DownloaderConfig,
DownloadResponse,
FileData,
Indexer,
IndexerConfig,
SourceIdentifiers,
)
from unstructured.ingest.v2.logger import logger
from unstructured.ingest.v2.processes.connector_registry import (
SourceRegistryEntry,
)
from unstructured.utils import requires_dependencies
class MissingCategoryError(Exception):
"""There are no categories with that name."""
CONNECTOR_TYPE = "salesforce"
if TYPE_CHECKING:
from simple_salesforce import Salesforce
SALESFORCE_API_VERSION = "57.0"
# TODO: Add more categories as needed
ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
# Generic minimal email template used only
# to process EmailMessage records as .eml files
EMAIL_TEMPLATE = Template(
"""MIME-Version: 1.0
Date: $date
Message-ID: $message_identifier
Subject: $subject
From: $from_email
To: $to_email
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
$textbody
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
$htmlbody
--00000000000095c9b205eff92630--
""",
)
@dataclass
class SalesforceAccessConfig(AccessConfig):
consumer_key: str
private_key: str
@requires_dependencies(["cryptography"])
def get_private_key_value_and_type(self) -> tuple[str, Type]:
from cryptography.hazmat.primitives import serialization
try:
serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
except ValueError:
pass
else:
return self.private_key, str
if Path(self.private_key).is_file():
return self.private_key, Path
raise ValueError("private_key does not contain PEM private key or path")
@dataclass
class SalesforceConnectionConfig(ConnectionConfig):
username: str
access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)
@requires_dependencies(["simple_salesforce"], extras="salesforce")
def get_client(self) -> "Salesforce":
from simple_salesforce import Salesforce
pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
return Salesforce(
username=self.username,
consumer_key=self.access_config.consumer_key,
privatekey_file=pkey_value if pkey_type is Path else None,
privatekey=pkey_value if pkey_type is str else None,
version=SALESFORCE_API_VERSION,
)
@dataclass
class SalesforceIndexerConfig(IndexerConfig):
categories: list[str]
@dataclass
class SalesforceIndexer(Indexer):
connection_config: SalesforceConnectionConfig
index_config: SalesforceIndexerConfig
def __post_init__(self):
for record_type in self.index_config.categories:
if record_type not in ACCEPTED_CATEGORIES:
raise ValueError(f"{record_type} not currently an accepted Salesforce category")
def get_file_extension(self, record_type) -> str:
if record_type == "EmailMessage":
extension = ".eml"
elif record_type in ["Account", "Lead", "Case", "Campaign"]:
extension = ".xml"
else:
raise MissingCategoryError(
f"There are no categories with the name: {record_type}",
)
return extension
@requires_dependencies(["simple_salesforce"], extras="salesforce")
def list_files(self) -> list[FileData]:
"""Get Salesforce Ids for the records.
Send them to next phase where each doc gets downloaded into the
appropriate format for partitioning.
"""
from simple_salesforce.exceptions import SalesforceMalformedRequest
client = self.connection_config.get_client()
files_list = []
for record_type in self.index_config.categories:
try:
# Get ids from Salesforce
records = client.query_all_iter(
f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
)
for record in records:
record_with_extension = record["Id"] + self.get_file_extension(
record["attributes"]["type"]
)
files_list.append(
FileData(
connector_type=CONNECTOR_TYPE,
identifier=record["Id"],
source_identifiers=SourceIdentifiers(
filename=record_with_extension,
fullpath=f"{record['attributes']['type']}/{record_with_extension}",
),
metadata=DataSourceMetadata(
url=record["attributes"]["url"],
version=str(parser.parse(record["SystemModstamp"]).timestamp()),
date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
date_modified=str(
parser.parse(record["LastModifiedDate"]).timestamp()
),
record_locator={"id": record["Id"]},
),
additional_metadata={"record_type": record["attributes"]["type"]},
)
)
except SalesforceMalformedRequest as e:
raise SalesforceMalformedRequest(f"Problem with Salesforce query: {e}")
return files_list
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
for f in self.list_files():
yield f
@dataclass
class SalesforceDownloaderConfig(DownloaderConfig):
pass
@dataclass
class SalesforceDownloader(Downloader):
connection_config: SalesforceConnectionConfig
download_config: SalesforceDownloaderConfig = field(
default_factory=lambda: SalesforceDownloaderConfig()
)
connector_type: str = CONNECTOR_TYPE
def get_download_path(self, file_data: FileData) -> Path:
rel_path = file_data.source_identifiers.relative_path
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
return self.download_dir / Path(rel_path)
def _xml_for_record(self, record: OrderedDict) -> str:
"""Creates partitionable xml file from a record"""
import xml.etree.ElementTree as ET
def create_xml_doc(data, parent, prefix=""):
for key, value in data.items():
if isinstance(value, OrderedDict):
create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
else:
item = ET.Element("item")
item.text = f"{prefix}{key}: {value}"
parent.append(item)
root = ET.Element("root")
create_xml_doc(record, root)
xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
return xml_string
def _eml_for_record(self, email_json: dict[str, Any]) -> str:
"""Recreates standard expected .eml format using template."""
eml = EMAIL_TEMPLATE.substitute(
date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
message_identifier=email_json.get("MessageIdentifier"),
subject=email_json.get("Subject"),
from_email=email_json.get("FromAddress"),
to_email=email_json.get("ToAddress"),
textbody=email_json.get("TextBody"),
htmlbody=email_json.get("HtmlBody"),
)
return dedent(eml)
@SourceConnectionNetworkError.wrap
def _get_response(self, file_data: FileData) -> OrderedDict:
client = self.connection_config.get_client()
return client.query(
f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'", # noqa: E501
)
def get_record(self, file_data: FileData) -> OrderedDict:
# Get record from Salesforce based on id
response = self._get_response(file_data)
logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
records = response["records"]
if not records:
raise ValueError(
f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
)
record_json = records[0]
return record_json
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
record = self.get_record(file_data)
try:
if file_data.additional_metadata["record_type"] == "EmailMessage":
document = self._eml_for_record(record)
else:
document = self._xml_for_record(record)
download_path = self.get_download_path(file_data=file_data)
download_path.parent.mkdir(parents=True, exist_ok=True)
with open(download_path, "w") as page_file:
page_file.write(document)
except Exception as e:
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
return self.generate_download_response(file_data=file_data, download_path=download_path)
salesforce_source_entry = SourceRegistryEntry(
connection_config=SalesforceConnectionConfig,
indexer_config=SalesforceIndexerConfig,
indexer=SalesforceIndexer,
downloader_config=SalesforceDownloaderConfig,
downloader=SalesforceDownloader,
)