feat: adds Outlook connector (#939)

* bonus: fixes issue with email partitioning where From field was being assigned the To field value.
This commit is contained in:
David Potter 2023-07-25 21:09:26 -07:00 committed by GitHub
parent d694cd53bf
commit f7e46af22f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 623 additions and 26 deletions

View File

@ -192,7 +192,10 @@ jobs:
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
run: |
source .venv/bin/activate
sudo apt-get update
@ -212,6 +215,7 @@ jobs:
make install-ingest-github
make install-ingest-gitlab
make install-ingest-onedrive
make install-ingest-outlook
make install-ingest-slack
make install-ingest-wikipedia
./test_unstructured_ingest/test-ingest.sh

View File

@ -67,6 +67,9 @@ jobs:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
OVERWRITE_FIXTURES: "true"
run: |
source .venv/bin/activate
@ -87,6 +90,7 @@ jobs:
make install-ingest-github
make install-ingest-gitlab
make install-ingest-onedrive
make install-ingest-outlook
make install-ingest-slack
make install-ingest-wikipedia
./test_unstructured_ingest/test-ingest.sh

View File

@ -1,3 +1,15 @@
## 0.8.3
### Enhancements
### Features
* Adds Outlook connector
### Fixes
* Fixes issue with email partitioning where From field was being assigned the To field value.
## 0.8.2-dev7
### Enhancements

View File

@ -12,3 +12,5 @@ include requirements/ingest-reddit.in
include requirements/ingest-slack.in
include requirements/ingest-wikipedia.in
include requirements/ingest-google-drive.in
include requirements/ingest-outlook.in
include requirements/ingest-onedrive.in

View File

@ -90,6 +90,10 @@ install-ingest-gitlab:
install-ingest-onedrive:
python3 -m pip install -r requirements/ingest-onedrive.txt
.PHONY: install-ingest-outlook
install-ingest-outlook:
python3 -m pip install -r requirements/ingest-outlook.txt
.PHONY: install-ingest-reddit
install-ingest-reddit:
python3 -m pip install -r requirements/ingest-reddit.txt

View File

@ -25,7 +25,7 @@ NOTE: Keep in mind that you will need to have all the appropriate extras and dep
--------------------
You can batch process documents stored in your Azure Blob Container using the `Azure Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/azure.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/azure/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[azure]``
To install all dependencies for this connector run: ``pip install "unstructured[azure]"``
``BioMed Connector``
@ -37,49 +37,49 @@ You can process `National Center for Biotechnology Information <https://www.ncbi
----------------------
You can preprocess your Discord channel using the `Discord Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/discord.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/discord/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[discord]``
To install all dependencies for this connector run: ``pip install "unstructured[discord]"``
``Dropbox Connector``
----------------------
You can batch process unstructured documents in your Dropbox by using the `Dropbox Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/dropbox.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/dropbox/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[dropbox]``
To install all dependencies for this connector run: ``pip install "unstructured[dropbox]"``
``Elasticsearch Connector``
----------------------------
You can preprocess documents stored in Elasticsearch by using the `Elasticsearch Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/elasticsearch.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/elasticsearch/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[elasticsearch]``
To install all dependencies for this connector run: ``pip install "unstructured[elasticsearch]"``
``Google Cloud Storage Connector``
----------------------------------
You can batch load the files you have stored in Google Cloud Storage with the `GCS Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gcs.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_cloud_storage/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[gcs]``
To install all dependencies for this connector run: ``pip install "unstructured[gcs]"``
``Github Connector``
---------------------
You can process files in a Github repository using the `Github Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/github.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/github/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[github]``
To install all dependencies for this connector run: ``pip install "unstructured[github]"``
``Gitlab Connector``
---------------------
You can batch load files in a Gitlab repository using the `Gitlab Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gitlab.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/gitlab/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[gitlab]``
To install all dependencies for this connector run: ``pip install "unstructured[gitlab]"``
``Google Drive Connector``
--------------------------
You can batch process documents stored in your Google Drive with the `Google Drive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/google_drive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_drive/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[google-drive]``
To install all dependencies for this connector run: ``pip install "unstructured[google-drive]"``
``Local Connector``
@ -89,34 +89,42 @@ You can batch load your unstructured files in a local directory for preprocessin
``OneDrive Connector``
----------------------
You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/onedrive.sh>`_.
You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install "unstructured[onedrive]"``
``Outlook Connector``
---------------------
You can batch process email stored in Microsoft Outlook with the `Outlook Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/outlook.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/outlook/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install "unstructured[outlook]"``
To install all dependencies for this connector run: ``pip install unstructured[onedrive]``
``Reddit Connector``
---------------------
You can use the `Reddit Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/reddit.py>`_ to preprocess a Reddit thread. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/reddit/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[reddit]``
To install all dependencies for this connector run: ``pip install "unstructured[reddit]"``
``S3 Connector``
---------------------
You can process your files stored in S3 in batch using the `S3 Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/s3.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/s3-small-batch/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[s3]``
To install all dependencies for this connector run: ``pip install "unstructured[s3]"``
``Slack Connector``
---------------------
Using the `Slack Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/slack.py>`_ you can batch process a channel. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/slack/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[slack]``
To install all dependencies for this connector run: ``pip install "unstructured[slack]"``
``Wikipedia Connector``
-----------------------
You can load and process a Wikipedia page using the `Wikipedia Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/wikipedia.py>`_ to preprocess for your model. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/wikipedia/ingest.sh>`_.
To install all dependencies for this connector run: ``pip install unstructured[wikipedia]``
To install all dependencies for this connector run: ``pip install "unstructured[wikipedia]"``

View File

@ -3,7 +3,7 @@ Date: Fri, 16 Dec 2022 17:04:16 -0500
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
Subject: Test Email
From: Matthew Robinson <mrobinson@unstructured.io>
To: Matthew Robinson <mrobinson@unstructured.io>
To: NotMatthew <NotMatthew@notunstructured.com>
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630

View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Processes Outlook emails through Unstructured's library. Does not download attachments.
# Structured outputs are stored in outlook-output/
# NOTE: this script is not ready-to-run!
# Before running, export the Azure AD app credentials and mailbox it reads:
#   MS_CLIENT_ID    - client id of the Azure AD app
#   MS_CLIENT_CRED  - client secret of the Azure AD app
#   MS_TENANT_ID    - tenant id
#   MS_USER_EMAIL   - email address of the mailbox to read
# To get the credentials for your Azure AD app, follow these steps:
# https://learn.microsoft.com/en-us/graph/auth-register-app-v2
# https://learn.microsoft.com/en-us/graph/auth-v2-service
# Assign the necessary permissions for the application to read mail.
# https://learn.microsoft.com/en-us/graph/permissions-reference
# Resolve this script's own directory so it can be launched from anywhere.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Run from three levels up, where ./unstructured/ingest/main.py is resolved from.
cd "$SCRIPT_DIR"/../../.. || exit 1
# Folder names are comma separated; a name containing spaces must be quoted.
PYTHONPATH=. ./unstructured/ingest/main.py \
--ms-client-id "$MS_CLIENT_ID" \
--ms-client-cred "$MS_CLIENT_CRED" \
--ms-tenant "$MS_TENANT_ID" \
--ms-user-email "$MS_USER_EMAIL" \
--ms-outlook-folders Inbox,"Sent Items" \
--structured-output-dir outlook-output \
--num-processes 2 \
--recursive \
--verbose

View File

@ -0,0 +1,5 @@
-c constraints.in
-c base.txt
msal
Office365-REST-Python-Client
cryptography==41.0.2

View File

@ -0,0 +1,55 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-outlook.in
#
certifi==2023.5.7
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
cffi==1.15.1
# via
# -c requirements/base.txt
# cryptography
charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.2
# via
# -c requirements/base.txt
# -r requirements/ingest-outlook.in
# msal
# pyjwt
idna==3.4
# via
# -c requirements/base.txt
# requests
msal==1.22.0
# via
# -r requirements/ingest-outlook.in
# office365-rest-python-client
office365-rest-python-client==2.4.2
# via -r requirements/ingest-outlook.in
pycparser==2.21
# via
# -c requirements/base.txt
# cffi
pyjwt[crypto]==2.7.0
# via msal
pytz==2023.3
# via
# -c requirements/base.txt
# office365-rest-python-client
requests==2.31.0
# via
# -c requirements/base.txt
# msal
# office365-rest-python-client
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests

View File

@ -85,6 +85,8 @@ setup(
"gcs": load_requirements("requirements/ingest-gcs.in"),
"elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"),
"dropbox": load_requirements("requirements/ingest-dropbox.in"),
"onedrive": load_requirements("requirements/ingest-onedrive.in"),
"outlook": load_requirements("requirements/ingest-outlook.in"),
"confluence": load_requirements("requirements/ingest-confluence.in"),
},
package_dir={"unstructured": "unstructured"},

View File

@ -290,7 +290,7 @@ def test_partition_email_from_file_with_header():
def test_partition_email_from_filename_has_metadata():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0
assert (
@ -302,7 +302,7 @@ def test_partition_email_from_filename_has_metadata():
page_number=None,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
subject="Test Email",
filetype="message/rfc822",
).to_dict()
@ -310,7 +310,7 @@ def test_partition_email_from_filename_has_metadata():
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
assert elements[0].metadata.get_date() == expected_dt
for element in elements:
assert element.metadata.filename == "fake-email-header.eml"
assert element.metadata.filename == "fake-email.eml"
def test_extract_email_text_matches_html():

View File

@ -0,0 +1,20 @@
[
{
"type": "Title",
"element_id": "a0f48ad299334e5716f85d225bfe2a16",
"metadata": {
"data_source": {},
"filename": "21be155fb0c95885.eml",
"date": "2023-07-15T08:35:51-07:00",
"filetype": "message/rfc822",
"sent_from": [
"David Potter <potterdavidm@gmail.com>"
],
"sent_to": [
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "integration test email 1"
},
"text": "integration test email"
}
]

View File

@ -0,0 +1,20 @@
[
{
"type": "NarrativeText",
"element_id": "cebc4803f41f12981b808ffd79d7b480",
"metadata": {
"data_source": {},
"filename": "497eba8c81c801c6.eml",
"date": "2023-07-24T18:25:52-07:00",
"filetype": "message/rfc822",
"sent_from": [
"Ryan Nikolaidis <ryan@unstructured.io>"
],
"sent_to": [
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "subfolder1_1"
},
"text": "this is a message for the subfolder1_1"
}
]

View File

@ -0,0 +1,20 @@
[
{
"type": "NarrativeText",
"element_id": "007ec3bff83ee17497e490b86a36e0dd",
"metadata": {
"data_source": {},
"filename": "4a16a411f162ebbb.eml",
"date": "2023-07-09T20:38:47-07:00",
"filetype": "message/rfc822",
"sent_from": [
"David Potter <potterdavidm@gmail.com>"
],
"sent_to": [
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "message for subfolder"
},
"text": "this is a message for the subfolder"
}
]

View File

@ -17,8 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--download-dir "$DOWNLOAD_DIR" \
--ms-client-cred "$MS_CLIENT_CRED" \
--ms-client-id "$MS_CLIENT_ID" \
--ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \
--ms-user-pname "devops@unstructuredio.onmicrosoft.com" \
--ms-tenant "$MS_TENANT_ID" \
--ms-user-pname "$MS_USER_PNAME" \
--ms-onedrive-folder '/utic-test-ingest-fixtures' \
--metadata-exclude file_directory,metadata.data_source.date_processed \
--num-processes 2 \

View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Integration test for the Outlook connector: ingests the "IntegrationTest"
# mail folder and then hands the output folder name to
# check-diff-expected-output.sh (presumably a comparison against the
# checked-in expected output — confirm in that script).
set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
# Run from the parent of this script's directory so ./unstructured/ingest/main.py resolves.
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=outlook
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# Without credentials the test is skipped with a success exit code rather than failed.
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
exit 0
fi
# file_directory and date_processed are excluded from the emitted metadata
# (presumably because they vary between runs — confirm).
PYTHONPATH=. ./unstructured/ingest/main.py \
--download-dir "$DOWNLOAD_DIR" \
--ms-client-cred "$MS_CLIENT_CRED" \
--ms-client-id "$MS_CLIENT_ID" \
--ms-tenant "$MS_TENANT_ID" \
--ms-user-email "$MS_USER_EMAIL" \
--ms-outlook-folders IntegrationTest \
--metadata-exclude file_directory,metadata.data_source.date_processed \
--num-processes 2 \
--preserve-downloads \
--recursive \
--reprocess \
--structured-output-dir "$OUTPUT_DIR"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -22,6 +22,7 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-against-api.sh
./test_unstructured_ingest/test-ingest-gcs.sh
./test_unstructured_ingest/test-ingest-onedrive.sh
./test_unstructured_ingest/test-ingest-outlook.sh
./test_unstructured_ingest/test-ingest-elasticsearch.sh
./test_unstructured_ingest/test-ingest-confluence-diff.sh
./test_unstructured_ingest/test-ingest-confluence-large.sh

View File

@ -0,0 +1,115 @@
from dataclasses import dataclass
from pathlib import Path
from unstructured.ingest.connector.dropbox import (
DropboxIngestDoc,
)
from unstructured.ingest.connector.fsspec import (
FsspecIngestDoc,
)
from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
StandardConnectorConfig,
)
@dataclass
class FakeConfigDropboxRoot:
    """Stand-in config whose remote directory is the Dropbox root.

    The values are plain class attributes (none are annotated), so the class
    itself is handed to the ingest docs in place of a config instance.
    """

    # A single space is the value these tests use for the root folder.
    dir_path = " "
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
@dataclass
class FakeConfigFolder:
    """Stand-in config whose remote directory is a named folder.

    The values are plain class attributes (none are annotated), so the class
    itself is handed to the ingest docs in place of a config instance.
    """

    dir_path = "fake_folder"
    download_dir = "/fakeuser/fake_download"
    output_dir = "/fakeuser/fake_output"
def test_dropbox_root_succeeds():
    """Paths are joined correctly for a file in the Dropbox root.

    The remote path carries a leading slash in this case.
    """
    doc = DropboxIngestDoc(
        remote_file_path="/fake_file.txt",
        standard_config=FakeConfigDropboxRoot,
        config=FakeConfigDropboxRoot,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
def test_dropbox_root_succeeds2():
    """Paths are joined correctly for a file in the Dropbox root.

    Here the remote path has no leading slash; the resulting paths are the
    same as in the leading-slash case.
    """
    doc = DropboxIngestDoc(
        remote_file_path="fake_file.txt",
        standard_config=FakeConfigDropboxRoot,
        config=FakeConfigDropboxRoot,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
def test_dropbox_folder_succeeds():
    """Paths are joined correctly when the config points at a Dropbox folder.

    The remote path has no leading slash.
    """
    doc = DropboxIngestDoc(
        remote_file_path="fake_file2.txt",
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
def test_dropbox_folder_fails():
    """Pin the WRONG paths produced when the remote path starts with a slash.

    With a folder config and a leading slash, the joined paths lose the
    output/download directories entirely; this test records that behaviour.
    """
    doc = DropboxIngestDoc(
        remote_file_path="/fake_file2.txt",
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    # Note the missing "/fakeuser/..." prefixes.
    assert actual_output == Path("/fake_file2.txt.json")
    assert actual_download == Path("/fake_file2.txt")
def test_fsspec_folder_succeeds():
    """Paths are joined correctly for an fsspec doc under a folder config.

    The remote path has no leading slash.
    """
    doc = FsspecIngestDoc(
        remote_file_path="fake_file2.txt",
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
    assert actual_output == expected_output
    assert actual_download == expected_download
def test_fsspec_folder_fails():
    """Pin the WRONG paths an fsspec doc produces for a leading-slash path.

    With a folder config and a leading slash, the joined paths lose the
    output/download directories entirely; this test records that behaviour.
    """
    doc = FsspecIngestDoc(
        remote_file_path="/fake_file2.txt",
        standard_config=FakeConfigFolder,
        config=FakeConfigFolder,
    )
    actual_output = doc._output_filename
    actual_download = doc._tmp_download_file()

    # Note the missing "/fakeuser/..." prefixes.
    assert actual_output == Path("/fake_file2.txt.json")
    assert actual_download == Path("/fake_file2.txt")

View File

@ -1 +1 @@
__version__ = "0.8.2-dev7" # pragma: no cover
__version__ = "0.8.3" # pragma: no cover

View File

@ -166,9 +166,9 @@ class FsspecConnector(ConnectorCleanupMixin, BaseConnector):
def get_ingest_docs(self):
return [
self.ingest_doc_cls(
self.standard_config,
self.config,
file,
standard_config=self.standard_config,
config=self.config,
remote_file_path=file,
)
for file in self._list_files()
]

View File

@ -0,0 +1,230 @@
import hashlib
import os
from collections import defaultdict
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import List
from office365.onedrive.driveitems.driveItem import DriveItem
from unstructured.ingest.interfaces import (
BaseConnector,
BaseConnectorConfig,
BaseIngestDoc,
ConnectorCleanupMixin,
IngestDocCleanupMixin,
StandardConnectorConfig,
)
from unstructured.ingest.logger import logger
from unstructured.utils import requires_dependencies
# Passed to `.top()` when a folder's messages are listed, with the intent that
# one request returns the whole folder instead of a first page.
# NOTE(review): the service may cap the page size below this value — confirm
# that large folders are not silently truncated.
MAX_NUM_EMAILS = 1000000 # Maximum number of emails per folder
class MissingFolderError(Exception):
    """Raised when none of the requested folder names match a root mail folder."""
@dataclass
class SimpleOutlookConfig(BaseConnectorConfig):
    """Settings for the Outlook connector, plus the token factory it uses.

    After construction, ``token_factory`` is a zero-argument callable that
    requests an application token for Microsoft Graph with the stored client
    id/credential against ``{authority_url}/{tenant}``.

    Raises:
        ValueError: if ``client_id``, ``client_credential`` or ``user_email``
            is missing or empty.
    """

    client_id: str
    client_credential: str = field(repr=False)
    user_email: str
    tenant: str = field(repr=False)
    authority_url: str = field(repr=False)
    ms_outlook_folders: List[str]
    recursive: bool = False

    def __post_init__(self):
        # All three values are required; the flag names match the CLI options.
        if not (self.client_id and self.client_credential and self.user_email):
            raise ValueError(
                "Please provide all of the following mandatory values:"
                "\n--ms-client-id\n--ms-client-cred\n--ms-user-email",
            )
        self.token_factory = self._acquire_token

    @requires_dependencies(["msal"])
    def _acquire_token(self):
        """Request a client-credentials token scoped to Microsoft Graph.

        Returns whatever ``acquire_token_for_client`` returns. A ``ValueError``
        raised while setting up the client is logged and re-raised unchanged.
        """
        from msal import ConfidentialClientApplication

        try:
            app = ConfidentialClientApplication(
                authority=f"{self.authority_url}/{self.tenant}",
                client_id=self.client_id,
                client_credential=self.client_credential,
            )
            token = app.acquire_token_for_client(
                scopes=["https://graph.microsoft.com/.default"],
            )
        except ValueError:
            logger.error("Couldn't set up credentials for Outlook")
            raise
        return token

    @staticmethod
    def parse_folders(folder_str: str) -> List[str]:
        """Parse a comma separated string of Outlook folders into a list.

        Surrounding whitespace is stripped from each name and empty entries
        (e.g. from a trailing comma) are dropped. ``None`` or an empty string
        yields an empty list instead of raising, so an omitted
        ``--ms-outlook-folders`` option does not crash here.
        """
        if not folder_str:
            return []
        stripped = (part.strip() for part in folder_str.split(","))
        return [name for name in stripped if name]
@dataclass
class OutlookIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
    """One Outlook message: where it is downloaded to and where its output goes.

    Both file names are derived from a 16-character hash of the message id:
    ``<hash>.eml`` under the download directory and ``<hash>.eml.json`` under
    the output directory.
    """

    config: SimpleOutlookConfig
    # NOTE(review): annotated as DriveItem, but OutlookConnector.get_ingest_docs
    # passes objects fetched from the user's `messages` collection — confirm
    # the intended type.
    file: DriveItem

    def __post_init__(self):
        self._set_download_paths()

    def hash_mail_name(self, id):
        """Outlook email ids are 152 char long. Hash to shorten to 16."""
        return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16]

    def _set_download_paths(self) -> None:
        """Creates paths for downloading and parsing."""
        # Hash once; the download and output names share the same stem.
        mail_hash = self.hash_mail_name(self.file.id)
        download_path = Path(f"{self.standard_config.download_dir}")
        output_path = Path(f"{self.standard_config.output_dir}")

        self.download_dir = download_path
        self.download_filepath = (download_path / f"{mail_hash}.eml").resolve()

        self.output_dir = output_path
        self.output_filepath = (output_path / f"{mail_hash}.eml.json").resolve()

    @property
    def filename(self):
        """Resolved local path of the downloaded ``.eml`` file."""
        return Path(self.download_filepath).resolve()

    @property
    def _output_filename(self):
        """Resolved local path of the structured ``.eml.json`` output."""
        return Path(self.output_filepath).resolve()

    def _discard_partial_download(self) -> None:
        """Remove the local .eml left behind by a failed download, if any."""
        try:
            os.remove(self.download_filepath)
        except FileNotFoundError:
            # Nothing was created (e.g. the failure happened before open()).
            return
        except OSError as cleanup_error:
            logger.error(
                f"Could not remove partial download {self.download_filepath}: {cleanup_error}",
            )

    @BaseIngestDoc.skip_if_file_exists
    @requires_dependencies(["office365"])
    def get_file(self):
        """Relies on Office365 python sdk message object to do the download.

        Failures are logged and swallowed (best effort). Because the target
        file is opened for writing before the download starts, a failure would
        otherwise leave an empty or truncated .eml on disk, so it is removed.
        """
        try:
            if not self.download_dir.is_dir():
                logger.debug(f"Creating directory: {self.download_dir}")
                self.download_dir.mkdir(parents=True, exist_ok=True)

            with open(self.download_filepath, "wb") as local_file:
                self.file.download(
                    local_file,
                ).execute_query()  # download MIME representation of a message

        except Exception as e:
            logger.error(
                f"Error while downloading and saving file: {self.file.subject}.",
            )
            logger.error(e)
            self._discard_partial_download()
            return
        logger.info(f"File downloaded: {self.file.subject}")
        return
class OutlookConnector(ConnectorCleanupMixin, BaseConnector):
    """Lists the messages of selected Outlook mail folders as ingest docs.

    Construction builds the Graph client and immediately resolves the requested
    folder names to folder ids (including ids of nested subfolders).
    """

    config: SimpleOutlookConfig

    def __init__(
        self,
        standard_config: StandardConnectorConfig,
        config: SimpleOutlookConfig,
    ):
        super().__init__(standard_config, config)
        self._set_client()
        self.get_folder_ids()

    @requires_dependencies(["office365"])
    def _set_client(self):
        """Create the Graph client from the config's token factory."""
        from office365.graph_client import GraphClient

        self.client = GraphClient(self.config.token_factory)

    def initialize(self):
        pass

    def _mailbox(self):
        """Return the client's user entry for the configured email address."""
        return self.client.users[self.config.user_email]

    def recurse_folders(self, folder_id, main_folder_dict):
        """Append the ids of all folders nested under `folder_id`.

        A folder only reports how many children it has, so each level costs an
        extra query. A child id is appended to every id list in
        `main_folder_dict` that already contains its parent's id.
        """
        children = self._mailbox().mail_folders[folder_id].child_folders.get().execute_query()
        for child in children:
            for known_ids in main_folder_dict.values():
                if child.get_property("parentFolderId") in known_ids:
                    known_ids.append(child.id)
            if child.get_property("childFolderCount") > 0:
                self.recurse_folders(child.id, main_folder_dict)

    def get_folder_ids(self):
        """Sets the mail folder ids and subfolder ids for requested root mail folders.

        Raises:
            MissingFolderError: if no root folder matches a requested name
                (names are compared case-insensitively).
        """
        # NOTE(review): subfolders are always walked here; config.recursive is
        # not consulted — confirm whether that is intended.
        self.root_folders = defaultdict(list)
        top_level = self._mailbox().mail_folders.get().execute_query()

        parents = []
        for folder in top_level:
            self.root_folders[folder.display_name].append(folder.id)
            if folder.get_property("childFolderCount") > 0:
                parents.append(folder.id)

        for parent_id in parents:
            self.recurse_folders(parent_id, self.root_folders)

        # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested.
        wanted = {name.lower() for name in self.config.ms_outlook_folders}
        selected = []
        for display_name, folder_ids in self.root_folders.items():
            if display_name.lower() in wanted:
                selected.extend(folder_ids)
        self.selected_folder_ids = selected

        if not self.selected_folder_ids:
            raise MissingFolderError(
                f"There are no root folders with the names: {self.config.ms_outlook_folders}",
            )

    def get_ingest_docs(self):
        """Returns a list of all the message objects that are in the requested root folder(s)."""
        # Get all the relevant messages in the selected folders/subfolders.
        per_folder = []
        for folder_id in self.selected_folder_ids:
            listing = (
                self._mailbox()
                .mail_folders[folder_id]
                .messages.get()
                .top(MAX_NUM_EMAILS)  # Prevents the return from paging
                .execute_query()
            )
            # Skip empty list if there are no messages in folder.
            if listing:
                per_folder.append(listing)

        # Filtered messages have an un-downloadable resource path.
        # So we get each message object individually.
        fetched = [
            self._mailbox().messages[stub.id].get().execute_query()
            for stub in list(chain.from_iterable(per_folder))
        ]
        return [OutlookIngestDoc(self.standard_config, self.config, msg) for msg in fetched]

View File

@ -427,6 +427,17 @@ class MainProcess:
default=None,
help="Folder to start parsing files from.",
)
@click.option(
"--ms-user-email",
default=None,
help="Outlook email to download messages from.",
)
@click.option(
"--ms-outlook-folders",
default=None,
help="Comma separated list of folders to download email messages from. "
"Do not specify subfolders. Use quotes if spaces in folder names.",
)
@click.option(
"--elasticsearch-url",
default=None,
@ -568,6 +579,8 @@ def main(
ms_tenant,
ms_user_pname,
ms_onedrive_folder,
ms_user_email,
ms_outlook_folders,
elasticsearch_url,
elasticsearch_index_name,
jq_query,
@ -681,6 +694,8 @@ def main(
hashed_dir_name = hashlib.sha256(
f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
)
elif ms_user_email:
hashed_dir_name = hashlib.sha256(ms_user_email.encode("utf-8"))
elif confluence_url:
hashed_dir_name = hashlib.sha256(
f"{confluence_url}".encode("utf-8"),
@ -910,7 +925,7 @@ def main(
decay=biomed_decay,
),
)
elif ms_client_id or ms_user_pname:
elif ms_client_id and ms_user_pname:
from unstructured.ingest.connector.onedrive import (
OneDriveConnector,
SimpleOneDriveConfig,
@ -929,6 +944,25 @@ def main(
),
)
elif ms_client_id and ms_user_email:
from unstructured.ingest.connector.outlook import (
OutlookConnector,
SimpleOutlookConfig,
)
doc_connector = OutlookConnector( # type: ignore
standard_config=standard_config,
config=SimpleOutlookConfig(
client_id=ms_client_id,
client_credential=ms_client_cred,
user_email=ms_user_email,
tenant=ms_tenant,
authority_url=ms_authority_url,
ms_outlook_folders=SimpleOutlookConfig.parse_folders(ms_outlook_folders),
recursive=recursive,
),
)
elif local_input_path:
from unstructured.ingest.connector.local import (
LocalConnector,

View File

@ -111,7 +111,7 @@ def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetada
if email_date is not None:
email_date = convert_to_iso_8601(email_date)
sent_from = header_dict.get("To")
sent_from = header_dict.get("From")
if sent_from is not None:
sent_from = [sender.strip() for sender in sent_from.split(",")]