mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
rfctr [P6M-398]: salesforce connector v2 (#3344)
Updates salesforce source connector to v2.
This commit is contained in:
parent
176875bf26
commit
db1e6993a8
@ -1,4 +1,4 @@
|
||||
## 0.14.11-dev2
|
||||
## 0.14.11-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,98 +1,110 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "0773da7069b4dacf1ec43e7a1d39f26e",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:34:16",
|
||||
"date_modified": "2023-08-21T14:34:16",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1"
|
||||
},
|
||||
"text": "Jane. This is a test of sending you an email from Salesforce!",
|
||||
"type": "NarrativeText"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErPIAU"
|
||||
},
|
||||
"date_created": "1692542056.0",
|
||||
"date_modified": "1692628456.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ad17b8af68c2c668b2219926d2fecde9",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:34:16",
|
||||
"date_modified": "2023-08-21T14:34:16",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1"
|
||||
},
|
||||
"text": "_____________________________________________________________________",
|
||||
"type": "UncategorizedText"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErPIAU"
|
||||
},
|
||||
"date_created": "1692542056.0",
|
||||
"date_modified": "1692628456.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "02b3c747f7c5e0e553feb0f99c8a561f",
|
||||
"text": "Powered by Salesforce",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:34:16",
|
||||
"date_modified": "2023-08-21T14:34:16",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1"
|
||||
},
|
||||
"text": "Powered by Salesforce",
|
||||
"type": "Title"
|
||||
"subject": "Test of email 1",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErPIAU"
|
||||
},
|
||||
"date_created": "1692542056.0",
|
||||
"date_modified": "1692628456.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "07a4a605bad4cb7206377108a949ce59",
|
||||
"text": "http://www.salesforce.com/",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:34:16",
|
||||
"date_modified": "2023-08-21T14:34:16",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1"
|
||||
},
|
||||
"text": "http://www.salesforce.com/",
|
||||
"type": "Title"
|
||||
"subject": "Test of email 1",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErPIAU"
|
||||
},
|
||||
"date_created": "1692542056.0",
|
||||
"date_modified": "1692628456.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -1,170 +1,191 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "bec7e6ad2ef7e1e8f16ef452637b26b5",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "Hey Sean.",
|
||||
"type": "Title"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a3e4b2201edb4fecdc7ef055e871310e",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "Testing email parsing here.",
|
||||
"type": "NarrativeText"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "ff15ab0b7f162b02420efbb16866a2fe",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "Type: email",
|
||||
"type": "Title"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7b314c7653fd2890729cd4910778ad04",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "Just testing the email system",
|
||||
"type": "NarrativeText"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "c8805f36ed5571165d1f36ce32514b9a",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "_____________________________________________________________________",
|
||||
"type": "UncategorizedText"
|
||||
"metadata": {
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "d600397e8498fd001b8fc9efac09a97a",
|
||||
"text": "Powered by Salesforce",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "Powered by Salesforce",
|
||||
"type": "Title"
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "10c6ea4dc1b92ab41aefb37d7ca73013",
|
||||
"text": "http://www.salesforce.com/",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-08-20T14:35:49",
|
||||
"date_modified": "2023-08-20T14:35:55",
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "2023-09-14T11:40:03.000+0000"
|
||||
},
|
||||
"filetype": "message/rfc822",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
],
|
||||
"sent_to": [
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2"
|
||||
},
|
||||
"text": "http://www.salesforce.com/",
|
||||
"type": "Title"
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
"record_locator": {
|
||||
"id": "02sHu00001efErQIAU"
|
||||
},
|
||||
"date_created": "1692542149.0",
|
||||
"date_modified": "1692542155.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -54,7 +54,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--num-processes "$max_processes" \
|
||||
--preserve-downloads \
|
||||
--recursive \
|
||||
--reprocess \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--verbose \
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.11-dev2" # pragma: no cover
|
||||
__version__ = "0.14.11-dev3" # pragma: no cover
|
||||
|
||||
@ -19,6 +19,7 @@ from .mongodb import mongodb_dest_cmd
|
||||
from .onedrive import onedrive_drive_src_cmd
|
||||
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
|
||||
from .pinecone import pinecone_dest_cmd
|
||||
from .salesforce import salesforce_src_cmd
|
||||
from .sharepoint import sharepoint_drive_src_cmd
|
||||
from .singlestore import singlestore_dest_cmd
|
||||
from .sql import sql_dest_cmd
|
||||
@ -35,6 +36,7 @@ src_cmds = [
|
||||
onedrive_drive_src_cmd,
|
||||
opensearch_src_cmd,
|
||||
s3_src_cmd,
|
||||
salesforce_src_cmd,
|
||||
sharepoint_drive_src_cmd,
|
||||
sftp_src_cmd,
|
||||
]
|
||||
|
||||
79
unstructured/ingest/v2/cli/cmds/salesforce.py
Normal file
79
unstructured/ingest/v2/cli/cmds/salesforce.py
Normal file
@ -0,0 +1,79 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
import click
|
||||
|
||||
from unstructured.ingest.v2.cli.base import SrcCmd
|
||||
from unstructured.ingest.v2.cli.interfaces import CliConfig
|
||||
from unstructured.ingest.v2.cli.utils import DelimitedString
|
||||
from unstructured.ingest.v2.processes.connectors.salesforce import (
|
||||
ACCEPTED_CATEGORIES,
|
||||
CONNECTOR_TYPE,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceCliConnectionConfig(CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> list[click.Option]:
|
||||
options = [
|
||||
click.Option(
|
||||
["--username"],
|
||||
required=True,
|
||||
type=str,
|
||||
help="Salesforce username usually looks like an email.",
|
||||
),
|
||||
click.Option(
|
||||
["--consumer-key"],
|
||||
required=True,
|
||||
type=str,
|
||||
help="For the Salesforce JWT auth. Found in Consumer Details.",
|
||||
),
|
||||
click.Option(
|
||||
["--private-key"],
|
||||
required=True,
|
||||
type=str,
|
||||
help="Path to the private key or its contents for the Salesforce JWT auth. "
|
||||
"Key file is usually named server.key.",
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceCliIndexerConfig(CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> list[click.Option]:
|
||||
possible_categories = ACCEPTED_CATEGORIES
|
||||
options = [
|
||||
click.Option(
|
||||
["--categories"],
|
||||
default=None,
|
||||
required=True,
|
||||
type=DelimitedString(choices=possible_categories),
|
||||
help="Comma-delimited salesforce categories to download. "
|
||||
"Currently only {}.".format(", ".join(possible_categories)),
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceCliDownloadConfig(CliConfig):
|
||||
@staticmethod
|
||||
def get_cli_options() -> list[click.Option]:
|
||||
options = [
|
||||
click.Option(
|
||||
["--download-dir"],
|
||||
help="Where files are downloaded to, defaults to a location at"
|
||||
"`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
|
||||
),
|
||||
]
|
||||
return options
|
||||
|
||||
|
||||
salesforce_src_cmd = SrcCmd(
|
||||
cmd_name=CONNECTOR_TYPE,
|
||||
connection_config=SalesforceCliConnectionConfig,
|
||||
indexer_config=SalesforceCliIndexerConfig,
|
||||
downloader_config=SalesforceCliDownloadConfig,
|
||||
)
|
||||
43
unstructured/ingest/v2/examples/example_salesforce.py
Normal file
43
unstructured/ingest/v2/examples/example_salesforce.py
Normal file
@ -0,0 +1,43 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from unstructured.ingest.v2.interfaces import ProcessorConfig
|
||||
from unstructured.ingest.v2.logger import logger
|
||||
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
|
||||
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
|
||||
from unstructured.ingest.v2.processes.connectors.local import (
|
||||
LocalUploaderConfig,
|
||||
)
|
||||
from unstructured.ingest.v2.processes.connectors.salesforce import (
|
||||
SalesforceAccessConfig,
|
||||
SalesforceConnectionConfig,
|
||||
SalesforceDownloaderConfig,
|
||||
SalesforceIndexerConfig,
|
||||
)
|
||||
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
|
||||
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
|
||||
|
||||
base_path = Path(__file__).parent.parent.parent.parent.parent
|
||||
docs_path = base_path / "example-docs"
|
||||
work_dir = base_path / "tmp_ingest"
|
||||
output_path = work_dir / "output"
|
||||
download_path = work_dir / "download"
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger.info(f"Writing all content in: {work_dir.resolve()}")
|
||||
Pipeline.from_configs(
|
||||
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
|
||||
indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]),
|
||||
downloader_config=SalesforceDownloaderConfig(download_dir=download_path),
|
||||
source_connection_config=SalesforceConnectionConfig(
|
||||
SalesforceAccessConfig(
|
||||
consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"),
|
||||
private_key=os.getenv("SALESFORCE_PRIVATE_KEY"),
|
||||
),
|
||||
username=os.getenv("SALESFORCE_USERNAME"),
|
||||
),
|
||||
partitioner_config=PartitionerConfig(strategy="fast"),
|
||||
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
|
||||
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
|
||||
uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
|
||||
).run()
|
||||
@ -22,6 +22,8 @@ from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
|
||||
from .onedrive import onedrive_source_entry
|
||||
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
|
||||
from .opensearch import opensearch_destination_entry, opensearch_source_entry
|
||||
from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
|
||||
from .salesforce import salesforce_source_entry
|
||||
from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
|
||||
from .sql import sql_destination_entry
|
||||
from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
|
||||
@ -48,6 +50,8 @@ add_destination_entry(
|
||||
destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
|
||||
)
|
||||
|
||||
add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
|
||||
|
||||
add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
|
||||
|
||||
add_destination_entry(
|
||||
|
||||
293
unstructured/ingest/v2/processes/connectors/salesforce.py
Normal file
293
unstructured/ingest/v2/processes/connectors/salesforce.py
Normal file
@ -0,0 +1,293 @@
|
||||
"""
|
||||
Salesforce Connector
|
||||
Able to download Account, Case, Campaign, EmailMessage, Lead
|
||||
Salesforce returns everything as a list of json.
|
||||
This saves each entry as a separate file to be partitioned.
|
||||
Using JWT authorization
|
||||
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm
|
||||
https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm
|
||||
"""
|
||||
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
from dataclasses import dataclass, field
|
||||
from email.utils import formatdate
|
||||
from pathlib import Path
|
||||
from string import Template
|
||||
from textwrap import dedent
|
||||
from typing import TYPE_CHECKING, Any, Generator, Type
|
||||
|
||||
from dateutil import parser
|
||||
|
||||
from unstructured.documents.elements import DataSourceMetadata
|
||||
from unstructured.ingest.enhanced_dataclass import enhanced_field
|
||||
from unstructured.ingest.error import SourceConnectionNetworkError
|
||||
from unstructured.ingest.v2.interfaces import (
|
||||
AccessConfig,
|
||||
ConnectionConfig,
|
||||
Downloader,
|
||||
DownloaderConfig,
|
||||
DownloadResponse,
|
||||
FileData,
|
||||
Indexer,
|
||||
IndexerConfig,
|
||||
SourceIdentifiers,
|
||||
)
|
||||
from unstructured.ingest.v2.logger import logger
|
||||
from unstructured.ingest.v2.processes.connector_registry import (
|
||||
SourceRegistryEntry,
|
||||
)
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
class MissingCategoryError(Exception):
|
||||
"""There are no categories with that name."""
|
||||
|
||||
|
||||
CONNECTOR_TYPE = "salesforce"
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from simple_salesforce import Salesforce
|
||||
|
||||
SALESFORCE_API_VERSION = "57.0"
|
||||
|
||||
# TODO: Add more categories as needed
|
||||
ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"]
|
||||
|
||||
# Generic minimal email template used only
|
||||
# to process EmailMessage records as .eml files
|
||||
EMAIL_TEMPLATE = Template(
|
||||
"""MIME-Version: 1.0
|
||||
Date: $date
|
||||
Message-ID: $message_identifier
|
||||
Subject: $subject
|
||||
From: $from_email
|
||||
To: $to_email
|
||||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/plain; charset="UTF-8"
|
||||
$textbody
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
$htmlbody
|
||||
--00000000000095c9b205eff92630--
|
||||
""",
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceAccessConfig(AccessConfig):
|
||||
consumer_key: str
|
||||
private_key: str
|
||||
|
||||
@requires_dependencies(["cryptography"])
|
||||
def get_private_key_value_and_type(self) -> tuple[str, Type]:
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
|
||||
try:
|
||||
serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None)
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
return self.private_key, str
|
||||
|
||||
if Path(self.private_key).is_file():
|
||||
return self.private_key, Path
|
||||
|
||||
raise ValueError("private_key does not contain PEM private key or path")
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceConnectionConfig(ConnectionConfig):
|
||||
username: str
|
||||
access_config: SalesforceAccessConfig = enhanced_field(sensitive=True)
|
||||
|
||||
@requires_dependencies(["simple_salesforce"], extras="salesforce")
|
||||
def get_client(self) -> "Salesforce":
|
||||
from simple_salesforce import Salesforce
|
||||
|
||||
pkey_value, pkey_type = self.access_config.get_private_key_value_and_type()
|
||||
|
||||
return Salesforce(
|
||||
username=self.username,
|
||||
consumer_key=self.access_config.consumer_key,
|
||||
privatekey_file=pkey_value if pkey_type is Path else None,
|
||||
privatekey=pkey_value if pkey_type is str else None,
|
||||
version=SALESFORCE_API_VERSION,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceIndexerConfig(IndexerConfig):
|
||||
categories: list[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceIndexer(Indexer):
|
||||
connection_config: SalesforceConnectionConfig
|
||||
index_config: SalesforceIndexerConfig
|
||||
|
||||
def __post_init__(self):
|
||||
for record_type in self.index_config.categories:
|
||||
if record_type not in ACCEPTED_CATEGORIES:
|
||||
raise ValueError(f"{record_type} not currently an accepted Salesforce category")
|
||||
|
||||
def get_file_extension(self, record_type) -> str:
|
||||
if record_type == "EmailMessage":
|
||||
extension = ".eml"
|
||||
elif record_type in ["Account", "Lead", "Case", "Campaign"]:
|
||||
extension = ".xml"
|
||||
else:
|
||||
raise MissingCategoryError(
|
||||
f"There are no categories with the name: {record_type}",
|
||||
)
|
||||
return extension
|
||||
|
||||
@requires_dependencies(["simple_salesforce"], extras="salesforce")
|
||||
def list_files(self) -> list[FileData]:
|
||||
"""Get Salesforce Ids for the records.
|
||||
Send them to next phase where each doc gets downloaded into the
|
||||
appropriate format for partitioning.
|
||||
"""
|
||||
from simple_salesforce.exceptions import SalesforceMalformedRequest
|
||||
|
||||
client = self.connection_config.get_client()
|
||||
|
||||
files_list = []
|
||||
for record_type in self.index_config.categories:
|
||||
try:
|
||||
# Get ids from Salesforce
|
||||
records = client.query_all_iter(
|
||||
f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}",
|
||||
)
|
||||
for record in records:
|
||||
record_with_extension = record["Id"] + self.get_file_extension(
|
||||
record["attributes"]["type"]
|
||||
)
|
||||
files_list.append(
|
||||
FileData(
|
||||
connector_type=CONNECTOR_TYPE,
|
||||
identifier=record["Id"],
|
||||
source_identifiers=SourceIdentifiers(
|
||||
filename=record_with_extension,
|
||||
fullpath=f"{record['attributes']['type']}/{record_with_extension}",
|
||||
),
|
||||
metadata=DataSourceMetadata(
|
||||
url=record["attributes"]["url"],
|
||||
version=str(parser.parse(record["SystemModstamp"]).timestamp()),
|
||||
date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
|
||||
date_modified=str(
|
||||
parser.parse(record["LastModifiedDate"]).timestamp()
|
||||
),
|
||||
record_locator={"id": record["Id"]},
|
||||
),
|
||||
additional_metadata={"record_type": record["attributes"]["type"]},
|
||||
)
|
||||
)
|
||||
except SalesforceMalformedRequest as e:
|
||||
raise SalesforceMalformedRequest(f"Problem with Salesforce query: {e}")
|
||||
|
||||
return files_list
|
||||
|
||||
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
||||
for f in self.list_files():
|
||||
yield f
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceDownloaderConfig(DownloaderConfig):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class SalesforceDownloader(Downloader):
|
||||
connection_config: SalesforceConnectionConfig
|
||||
download_config: SalesforceDownloaderConfig = field(
|
||||
default_factory=lambda: SalesforceDownloaderConfig()
|
||||
)
|
||||
connector_type: str = CONNECTOR_TYPE
|
||||
|
||||
def get_download_path(self, file_data: FileData) -> Path:
|
||||
rel_path = file_data.source_identifiers.relative_path
|
||||
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
|
||||
return self.download_dir / Path(rel_path)
|
||||
|
||||
def _xml_for_record(self, record: OrderedDict) -> str:
|
||||
"""Creates partitionable xml file from a record"""
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def create_xml_doc(data, parent, prefix=""):
|
||||
for key, value in data.items():
|
||||
if isinstance(value, OrderedDict):
|
||||
create_xml_doc(value, parent, prefix=f"{prefix}{key}.")
|
||||
else:
|
||||
item = ET.Element("item")
|
||||
item.text = f"{prefix}{key}: {value}"
|
||||
parent.append(item)
|
||||
|
||||
root = ET.Element("root")
|
||||
create_xml_doc(record, root)
|
||||
|
||||
xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode()
|
||||
return xml_string
|
||||
|
||||
def _eml_for_record(self, email_json: dict[str, Any]) -> str:
|
||||
"""Recreates standard expected .eml format using template."""
|
||||
eml = EMAIL_TEMPLATE.substitute(
|
||||
date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()),
|
||||
message_identifier=email_json.get("MessageIdentifier"),
|
||||
subject=email_json.get("Subject"),
|
||||
from_email=email_json.get("FromAddress"),
|
||||
to_email=email_json.get("ToAddress"),
|
||||
textbody=email_json.get("TextBody"),
|
||||
htmlbody=email_json.get("HtmlBody"),
|
||||
)
|
||||
return dedent(eml)
|
||||
|
||||
@SourceConnectionNetworkError.wrap
|
||||
def _get_response(self, file_data: FileData) -> OrderedDict:
|
||||
client = self.connection_config.get_client()
|
||||
return client.query(
|
||||
f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'", # noqa: E501
|
||||
)
|
||||
|
||||
def get_record(self, file_data: FileData) -> OrderedDict:
|
||||
# Get record from Salesforce based on id
|
||||
response = self._get_response(file_data)
|
||||
logger.debug(f"response was returned for salesforce record id: {file_data.identifier}")
|
||||
records = response["records"]
|
||||
if not records:
|
||||
raise ValueError(
|
||||
f"No record found with record id {file_data.identifier}: {json.dumps(response)}"
|
||||
)
|
||||
record_json = records[0]
|
||||
return record_json
|
||||
|
||||
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
||||
record = self.get_record(file_data)
|
||||
|
||||
try:
|
||||
if file_data.additional_metadata["record_type"] == "EmailMessage":
|
||||
document = self._eml_for_record(record)
|
||||
else:
|
||||
document = self._xml_for_record(record)
|
||||
download_path = self.get_download_path(file_data=file_data)
|
||||
download_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(download_path, "w") as page_file:
|
||||
page_file.write(document)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
|
||||
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
|
||||
|
||||
return self.generate_download_response(file_data=file_data, download_path=download_path)
|
||||
|
||||
|
||||
salesforce_source_entry = SourceRegistryEntry(
|
||||
connection_config=SalesforceConnectionConfig,
|
||||
indexer_config=SalesforceIndexerConfig,
|
||||
indexer=SalesforceIndexer,
|
||||
downloader_config=SalesforceDownloaderConfig,
|
||||
downloader=SalesforceDownloader,
|
||||
)
|
||||
Loading…
x
Reference in New Issue
Block a user