mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	feat: adds Outlook connector (#939)
* bonus: fixes issue with email partitioning where From field was being assigned the To field value.
This commit is contained in:
		
							parent
							
								
									d694cd53bf
								
							
						
					
					
						commit
						f7e46af22f
					
				
							
								
								
									
										4
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							@ -192,7 +192,10 @@ jobs:
 | 
				
			|||||||
        GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
 | 
					        GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
 | 
				
			||||||
        MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
 | 
					        MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
 | 
				
			||||||
        MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
 | 
					        MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
 | 
				
			||||||
 | 
					        MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
 | 
				
			||||||
 | 
					        MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
 | 
				
			||||||
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
					        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
				
			||||||
 | 
					        MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
 | 
				
			||||||
      run: |
 | 
					      run: |
 | 
				
			||||||
        source .venv/bin/activate
 | 
					        source .venv/bin/activate
 | 
				
			||||||
        sudo apt-get update
 | 
					        sudo apt-get update
 | 
				
			||||||
@ -212,6 +215,7 @@ jobs:
 | 
				
			|||||||
        make install-ingest-github
 | 
					        make install-ingest-github
 | 
				
			||||||
        make install-ingest-gitlab
 | 
					        make install-ingest-gitlab
 | 
				
			||||||
        make install-ingest-onedrive
 | 
					        make install-ingest-onedrive
 | 
				
			||||||
 | 
					        make install-ingest-outlook
 | 
				
			||||||
        make install-ingest-slack
 | 
					        make install-ingest-slack
 | 
				
			||||||
        make install-ingest-wikipedia
 | 
					        make install-ingest-wikipedia
 | 
				
			||||||
        ./test_unstructured_ingest/test-ingest.sh
 | 
					        ./test_unstructured_ingest/test-ingest.sh
 | 
				
			||||||
 | 
				
			|||||||
@ -67,6 +67,9 @@ jobs:
 | 
				
			|||||||
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
					          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
				
			||||||
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
 | 
					          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
 | 
				
			||||||
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
 | 
					          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
 | 
				
			||||||
 | 
					          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
 | 
				
			||||||
 | 
					          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
 | 
				
			||||||
 | 
					          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
 | 
				
			||||||
          OVERWRITE_FIXTURES: "true"
 | 
					          OVERWRITE_FIXTURES: "true"
 | 
				
			||||||
        run: |
 | 
					        run: |
 | 
				
			||||||
          source .venv/bin/activate
 | 
					          source .venv/bin/activate
 | 
				
			||||||
@ -87,6 +90,7 @@ jobs:
 | 
				
			|||||||
          make install-ingest-github
 | 
					          make install-ingest-github
 | 
				
			||||||
          make install-ingest-gitlab
 | 
					          make install-ingest-gitlab
 | 
				
			||||||
          make install-ingest-onedrive
 | 
					          make install-ingest-onedrive
 | 
				
			||||||
 | 
					          make install-ingest-outlook
 | 
				
			||||||
          make install-ingest-slack
 | 
					          make install-ingest-slack
 | 
				
			||||||
          make install-ingest-wikipedia
 | 
					          make install-ingest-wikipedia
 | 
				
			||||||
          ./test_unstructured_ingest/test-ingest.sh
 | 
					          ./test_unstructured_ingest/test-ingest.sh
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										12
									
								
								CHANGELOG.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								CHANGELOG.md
									
									
									
									
									
								
							@ -1,3 +1,15 @@
 | 
				
			|||||||
 | 
					## 0.8.3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Enhancements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Features
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* Adds Outlook connector
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Fixes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* Fixes issue with email partitioning where From field was being assigned the To field value.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## 0.8.2-dev7
 | 
					## 0.8.2-dev7
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Enhancements
 | 
					### Enhancements
 | 
				
			||||||
 | 
				
			|||||||
@ -12,3 +12,5 @@ include requirements/ingest-reddit.in
 | 
				
			|||||||
include requirements/ingest-slack.in
 | 
					include requirements/ingest-slack.in
 | 
				
			||||||
include requirements/ingest-wikipedia.in
 | 
					include requirements/ingest-wikipedia.in
 | 
				
			||||||
include requirements/ingest-google-drive.in
 | 
					include requirements/ingest-google-drive.in
 | 
				
			||||||
 | 
					include requirements/ingest-outlook.in
 | 
				
			||||||
 | 
					include requirements/ingest-onedrive.in
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										4
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								Makefile
									
									
									
									
									
								
							@ -90,6 +90,10 @@ install-ingest-gitlab:
 | 
				
			|||||||
install-ingest-onedrive:
 | 
					install-ingest-onedrive:
 | 
				
			||||||
	python3 -m pip install -r requirements/ingest-onedrive.txt
 | 
						python3 -m pip install -r requirements/ingest-onedrive.txt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.PHONY: install-ingest-outlook
 | 
				
			||||||
 | 
					install-ingest-outlook:
 | 
				
			||||||
 | 
						python3 -m pip install -r requirements/ingest-outlook.txt
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.PHONY: install-ingest-reddit
 | 
					.PHONY: install-ingest-reddit
 | 
				
			||||||
install-ingest-reddit:
 | 
					install-ingest-reddit:
 | 
				
			||||||
	python3 -m pip install -r requirements/ingest-reddit.txt
 | 
						python3 -m pip install -r requirements/ingest-reddit.txt
 | 
				
			||||||
 | 
				
			|||||||
@ -25,7 +25,7 @@ NOTE: Keep in mind that you will need to have all the appropriate extras and dep
 | 
				
			|||||||
--------------------
 | 
					--------------------
 | 
				
			||||||
You can batch process documents stored in your Azure Blob Container using the `Azure Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/azure.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/azure/ingest.sh>`_.
 | 
					You can batch process documents stored in your Azure Blob Container using the `Azure Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/azure.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/azure/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[azure]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[azure]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``BioMed Connector``
 | 
					``BioMed Connector``
 | 
				
			||||||
@ -37,49 +37,49 @@ You can process `National Center for Biotechnology Information <https://www.ncbi
 | 
				
			|||||||
----------------------
 | 
					----------------------
 | 
				
			||||||
You can preprocess your Discord channel using the `Discord Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/discord.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/discord/ingest.sh>`_.
 | 
					You can preprocess your Discord channel using the `Discord Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/discord.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/discord/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[discord]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[discord]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Dropbox Connector``
 | 
					``Dropbox Connector``
 | 
				
			||||||
----------------------
 | 
					----------------------
 | 
				
			||||||
You can batch process unstructured documents in your Dropbox by using the `Dropbox Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/dropbox.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/dropbox/ingest.sh>`_.
 | 
					You can batch process unstructured documents in your Dropbox by using the `Dropbox Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/dropbox.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/dropbox/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[dropbox]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[dropbox]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Elasticsearch Connector``
 | 
					``Elasticsearch Connector``
 | 
				
			||||||
----------------------------
 | 
					----------------------------
 | 
				
			||||||
You can preprocess documents stored in Elasticsearch by using the `Elasticsearch Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/elasticsearch.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/elasticsearch/ingest.sh>`_.
 | 
					You can preprocess documents stored in Elasticsearch by using the `Elasticsearch Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/elasticsearch.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/elasticsearch/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[elasticsearch]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[elasticsearch]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Google Cloud Storage Connector``
 | 
					``Google Cloud Storage Connector``
 | 
				
			||||||
----------------------------------
 | 
					----------------------------------
 | 
				
			||||||
You can batch load the files you have stored in Google Cloud Storage with the `GCS Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gcs.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_cloud_storage/ingest.sh>`_.
 | 
					You can batch load the files you have stored in Google Cloud Storage with the `GCS Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gcs.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_cloud_storage/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[gcs]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[gcs]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Github Connector``
 | 
					``Github Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can process files in a Github repository using the `Github Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/github.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/github/ingest.sh>`_.
 | 
					You can process files in a Github repository using the `Github Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/github.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/github/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[github]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[github]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Gitlab Connector``
 | 
					``Gitlab Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can batch load files in a Gitlab repository using the `Gitlab Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gitlab.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/gitlab/ingest.sh>`_.
 | 
					You can batch load files in a Gitlab repository using the `Gitlab Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/gitlab.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/gitlab/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[gitlab]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[gitlab]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Google Drive Connector``
 | 
					``Google Drive Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can batch process documents stored in your Google Drive with the `Google Drive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/google_drive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_drive/ingest.sh>`_.
 | 
					You can batch process documents stored in your Google Drive with the `Google Drive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/google_drive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/google_drive/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[google-drive]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[google-drive]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Local Connector``
 | 
					``Local Connector``
 | 
				
			||||||
@ -89,34 +89,42 @@ You can batch load your unstructured files in a local directory for preprocessin
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
``OneDrive Connector``
 | 
					``OneDrive Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/onedrive.sh>`_.
 | 
					You can batch process documents stored in Microsoft OneDrive with the `OneDrive Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/onedrive.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/onedrive/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[onedrive]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					``Outlook Connector``
 | 
				
			||||||
 | 
					---------------------
 | 
				
			||||||
 | 
					You can batch process email stored in Microsoft Outlook with the `Outlook Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/outlook.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/main/examples/ingest/outlook/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[outlook]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[onedrive]``
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Reddit Connector``
 | 
					``Reddit Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can use the `Reddit Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/reddit.py>`_ to preprocess a Reddit thread. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/reddit/ingest.sh>`_.
 | 
					You can use the `Reddit Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/reddit.py>`_ to preprocess a Reddit thread. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/reddit/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[reddit]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[reddit]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``S3 Connector``
 | 
					``S3 Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can process your files stored in S3 in batch using the `S3 Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/s3.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/s3-small-batch/ingest.sh>`_.
 | 
					You can process your files stored in S3 in batch using the `S3 Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/s3.py>`_. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/s3-small-batch/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[s3]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[s3]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Slack Connector``
 | 
					``Slack Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
Using the `Slack Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/slack.py>`_ you can batch process a channel. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/slack/ingest.sh>`_.
 | 
					Using the `Slack Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/slack.py>`_ you can batch process a channel. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/slack/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[slack]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[slack]"``
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
``Wikipedia Connector``
 | 
					``Wikipedia Connector``
 | 
				
			||||||
---------------------
 | 
					---------------------
 | 
				
			||||||
You can load and process a Wikipedia page using the `Wikipedia Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/wikipedia.py>`_ to preprocess for your model. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/wikipedia/ingest.sh>`_.
 | 
					You can load and process a Wikipedia page using the `Wikipedia Connector <https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/connector/wikipedia.py>`_ to preprocess for your model. You can find an example of how to use it `here <https://github.com/Unstructured-IO/unstructured/blob/f5541c7b0b1e2fc47ec88da5e02080d60e1441e2/examples/ingest/wikipedia/ingest.sh>`_.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
To install all dependencies for this connector run: ``pip install unstructured[wikipedia]``
 | 
					To install all dependencies for this connector run: ``pip install "unstructured[wikipedia]"``
 | 
				
			||||||
 | 
				
			|||||||
@ -3,7 +3,7 @@ Date: Fri, 16 Dec 2022 17:04:16 -0500
 | 
				
			|||||||
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
 | 
					Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
 | 
				
			||||||
Subject: Test Email
 | 
					Subject: Test Email
 | 
				
			||||||
From: Matthew Robinson <mrobinson@unstructured.io>
 | 
					From: Matthew Robinson <mrobinson@unstructured.io>
 | 
				
			||||||
To: Matthew Robinson <mrobinson@unstructured.io>
 | 
					To: NotMatthew <NotMatthew@notunstructured.com>
 | 
				
			||||||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
 | 
					Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
--00000000000095c9b205eff92630
 | 
					--00000000000095c9b205eff92630
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										31
									
								
								examples/ingest/outlook/ingest.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										31
									
								
								examples/ingest/outlook/ingest.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,31 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env bash
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Processes Outlook emails through Unstructured's library. Does not download attachments.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Structured outputs are stored in outlook-output/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# NOTE, this script is not ready-to-run!
 | 
				
			||||||
 | 
					# You must enter an Azure AD app client-id, client secret, tenant-id, and email
 | 
				
			||||||
 | 
					# before running. 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# To get the credentials for your Azure AD app, follow these steps:
 | 
				
			||||||
 | 
					# https://learn.microsoft.com/en-us/graph/auth-register-app-v2
 | 
				
			||||||
 | 
					# https://learn.microsoft.com/en-us/graph/auth-v2-service
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Assign the necessary permissions for the application to read from mail.
 | 
				
			||||||
 | 
					# https://learn.microsoft.com/en-us/graph/permissions-reference
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 | 
				
			||||||
 | 
					cd "$SCRIPT_DIR"/../../.. || exit 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
				
			||||||
 | 
					    --ms-client-id "$MS_CLIENT_ID" \
 | 
				
			||||||
 | 
					    --ms-client-cred "$MS_CLIENT_CRED" \
 | 
				
			||||||
 | 
					    --ms-tenant "$MS_TENANT_ID" \
 | 
				
			||||||
 | 
					    --ms-user-email "$MS_USER_EMAIL" \
 | 
				
			||||||
 | 
					    --ms-outlook-folders Inbox,"Sent Items" \
 | 
				
			||||||
 | 
					    --structured-output-dir outlook-output \
 | 
				
			||||||
 | 
					    --num-processes 2 \
 | 
				
			||||||
 | 
					    --recursive \
 | 
				
			||||||
 | 
					    --verbose
 | 
				
			||||||
							
								
								
									
										5
									
								
								requirements/ingest-outlook.in
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								requirements/ingest-outlook.in
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,5 @@
 | 
				
			|||||||
 | 
					-c constraints.in
 | 
				
			||||||
 | 
					-c base.txt
 | 
				
			||||||
 | 
					msal
 | 
				
			||||||
 | 
					Office365-REST-Python-Client
 | 
				
			||||||
 | 
					cryptography==41.0.2
 | 
				
			||||||
							
								
								
									
										55
									
								
								requirements/ingest-outlook.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								requirements/ingest-outlook.txt
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,55 @@
 | 
				
			|||||||
 | 
					#
 | 
				
			||||||
 | 
					# This file is autogenerated by pip-compile with Python 3.8
 | 
				
			||||||
 | 
					# by the following command:
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					#    pip-compile requirements/ingest-outlook.in
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					certifi==2023.5.7
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   -c requirements/constraints.in
 | 
				
			||||||
 | 
					    #   requests
 | 
				
			||||||
 | 
					cffi==1.15.1
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   cryptography
 | 
				
			||||||
 | 
					charset-normalizer==3.2.0
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   requests
 | 
				
			||||||
 | 
					cryptography==41.0.2
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   -r requirements/ingest-outlook.in
 | 
				
			||||||
 | 
					    #   msal
 | 
				
			||||||
 | 
					    #   pyjwt
 | 
				
			||||||
 | 
					idna==3.4
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   requests
 | 
				
			||||||
 | 
					msal==1.22.0
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -r requirements/ingest-outlook.in
 | 
				
			||||||
 | 
					    #   office365-rest-python-client
 | 
				
			||||||
 | 
					office365-rest-python-client==2.4.2
 | 
				
			||||||
 | 
					    # via -r requirements/ingest-outlook.in
 | 
				
			||||||
 | 
					pycparser==2.21
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   cffi
 | 
				
			||||||
 | 
					pyjwt[crypto]==2.7.0
 | 
				
			||||||
 | 
					    # via msal
 | 
				
			||||||
 | 
					pytz==2023.3
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   office365-rest-python-client
 | 
				
			||||||
 | 
					requests==2.31.0
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   msal
 | 
				
			||||||
 | 
					    #   office365-rest-python-client
 | 
				
			||||||
 | 
					urllib3==1.26.16
 | 
				
			||||||
 | 
					    # via
 | 
				
			||||||
 | 
					    #   -c requirements/base.txt
 | 
				
			||||||
 | 
					    #   -c requirements/constraints.in
 | 
				
			||||||
 | 
					    #   requests
 | 
				
			||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							@ -85,6 +85,8 @@ setup(
 | 
				
			|||||||
        "gcs": load_requirements("requirements/ingest-gcs.in"),
 | 
					        "gcs": load_requirements("requirements/ingest-gcs.in"),
 | 
				
			||||||
        "elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"),
 | 
					        "elasticsearch": load_requirements("requirements/ingest-elasticsearch.in"),
 | 
				
			||||||
        "dropbox": load_requirements("requirements/ingest-dropbox.in"),
 | 
					        "dropbox": load_requirements("requirements/ingest-dropbox.in"),
 | 
				
			||||||
 | 
					        "onedrive": load_requirements("requirements/ingest-onedrive.in"),
 | 
				
			||||||
 | 
					        "outlook": load_requirements("requirements/ingest-outlook.in"),
 | 
				
			||||||
        "confluence": load_requirements("requirements/ingest-confluence.in"),
 | 
					        "confluence": load_requirements("requirements/ingest-confluence.in"),
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    package_dir={"unstructured": "unstructured"},
 | 
					    package_dir={"unstructured": "unstructured"},
 | 
				
			||||||
 | 
				
			|||||||
@ -290,7 +290,7 @@ def test_partition_email_from_file_with_header():
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_partition_email_from_filename_has_metadata():
 | 
					def test_partition_email_from_filename_has_metadata():
 | 
				
			||||||
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
 | 
					    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
 | 
				
			||||||
    elements = partition_email(filename=filename)
 | 
					    elements = partition_email(filename=filename)
 | 
				
			||||||
    assert len(elements) > 0
 | 
					    assert len(elements) > 0
 | 
				
			||||||
    assert (
 | 
					    assert (
 | 
				
			||||||
@ -302,7 +302,7 @@ def test_partition_email_from_filename_has_metadata():
 | 
				
			|||||||
            page_number=None,
 | 
					            page_number=None,
 | 
				
			||||||
            url=None,
 | 
					            url=None,
 | 
				
			||||||
            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
 | 
					            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
 | 
				
			||||||
            sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
 | 
					            sent_to=["NotMatthew <NotMatthew@notunstructured.com>"],
 | 
				
			||||||
            subject="Test Email",
 | 
					            subject="Test Email",
 | 
				
			||||||
            filetype="message/rfc822",
 | 
					            filetype="message/rfc822",
 | 
				
			||||||
        ).to_dict()
 | 
					        ).to_dict()
 | 
				
			||||||
@ -310,7 +310,7 @@ def test_partition_email_from_filename_has_metadata():
 | 
				
			|||||||
    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
 | 
					    expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
 | 
				
			||||||
    assert elements[0].metadata.get_date() == expected_dt
 | 
					    assert elements[0].metadata.get_date() == expected_dt
 | 
				
			||||||
    for element in elements:
 | 
					    for element in elements:
 | 
				
			||||||
        assert element.metadata.filename == "fake-email-header.eml"
 | 
					        assert element.metadata.filename == "fake-email.eml"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_extract_email_text_matches_html():
 | 
					def test_extract_email_text_matches_html():
 | 
				
			||||||
 | 
				
			|||||||
@ -0,0 +1,20 @@
 | 
				
			|||||||
 | 
					[
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "type": "Title",
 | 
				
			||||||
 | 
					    "element_id": "a0f48ad299334e5716f85d225bfe2a16",
 | 
				
			||||||
 | 
					    "metadata": {
 | 
				
			||||||
 | 
					      "data_source": {},
 | 
				
			||||||
 | 
					      "filename": "21be155fb0c95885.eml",
 | 
				
			||||||
 | 
					      "date": "2023-07-15T08:35:51-07:00",
 | 
				
			||||||
 | 
					      "filetype": "message/rfc822",
 | 
				
			||||||
 | 
					      "sent_from": [
 | 
				
			||||||
 | 
					        "David Potter <potterdavidm@gmail.com>"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "sent_to": [
 | 
				
			||||||
 | 
					        "devops@unstructuredio.onmicrosoft.com"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "subject": "integration test email 1"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					    "text": "integration test email"
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
@ -0,0 +1,20 @@
 | 
				
			|||||||
 | 
					[
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "type": "NarrativeText",
 | 
				
			||||||
 | 
					    "element_id": "cebc4803f41f12981b808ffd79d7b480",
 | 
				
			||||||
 | 
					    "metadata": {
 | 
				
			||||||
 | 
					      "data_source": {},
 | 
				
			||||||
 | 
					      "filename": "497eba8c81c801c6.eml",
 | 
				
			||||||
 | 
					      "date": "2023-07-24T18:25:52-07:00",
 | 
				
			||||||
 | 
					      "filetype": "message/rfc822",
 | 
				
			||||||
 | 
					      "sent_from": [
 | 
				
			||||||
 | 
					        "Ryan Nikolaidis <ryan@unstructured.io>"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "sent_to": [
 | 
				
			||||||
 | 
					        "devops@unstructuredio.onmicrosoft.com"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "subject": "subfolder1_1"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					    "text": "this is a message for the subfolder1_1"
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
@ -0,0 +1,20 @@
 | 
				
			|||||||
 | 
					[
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "type": "NarrativeText",
 | 
				
			||||||
 | 
					    "element_id": "007ec3bff83ee17497e490b86a36e0dd",
 | 
				
			||||||
 | 
					    "metadata": {
 | 
				
			||||||
 | 
					      "data_source": {},
 | 
				
			||||||
 | 
					      "filename": "4a16a411f162ebbb.eml",
 | 
				
			||||||
 | 
					      "date": "2023-07-09T20:38:47-07:00",
 | 
				
			||||||
 | 
					      "filetype": "message/rfc822",
 | 
				
			||||||
 | 
					      "sent_from": [
 | 
				
			||||||
 | 
					        "David Potter <potterdavidm@gmail.com>"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "sent_to": [
 | 
				
			||||||
 | 
					        "devops@unstructuredio.onmicrosoft.com"
 | 
				
			||||||
 | 
					      ],
 | 
				
			||||||
 | 
					      "subject": "message for subfolder"
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					    "text": "this is a message for the subfolder"
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
@ -17,8 +17,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
 | 
				
			|||||||
    --download-dir "$DOWNLOAD_DIR" \
 | 
					    --download-dir "$DOWNLOAD_DIR" \
 | 
				
			||||||
    --ms-client-cred "$MS_CLIENT_CRED" \
 | 
					    --ms-client-cred "$MS_CLIENT_CRED" \
 | 
				
			||||||
    --ms-client-id "$MS_CLIENT_ID" \
 | 
					    --ms-client-id "$MS_CLIENT_ID" \
 | 
				
			||||||
    --ms-tenant "3d60a7e5-1e32-414e-839b-1c6e6782613d" \
 | 
					    --ms-tenant "$MS_TENANT_ID" \
 | 
				
			||||||
    --ms-user-pname "devops@unstructuredio.onmicrosoft.com" \
 | 
					    --ms-user-pname "$MS_USER_PNAME" \
 | 
				
			||||||
    --ms-onedrive-folder '/utic-test-ingest-fixtures' \
 | 
					    --ms-onedrive-folder '/utic-test-ingest-fixtures' \
 | 
				
			||||||
    --metadata-exclude file_directory,metadata.data_source.date_processed \
 | 
					    --metadata-exclude file_directory,metadata.data_source.date_processed \
 | 
				
			||||||
    --num-processes 2 \
 | 
					    --num-processes 2 \
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										30
									
								
								test_unstructured_ingest/test-ingest-outlook.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										30
									
								
								test_unstructured_ingest/test-ingest-outlook.sh
									
									
									
									
									
										Executable file
									
								
							@ -0,0 +1,30 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/env bash

					# Runs the ingest CLI against the Outlook "IntegrationTest" folder and compares the
					# structured output with the expected fixtures.

					set -e

					SCRIPT_DIR=$(dirname "$(realpath "$0")")
					cd "$SCRIPT_DIR"/.. || exit 1
					OUTPUT_FOLDER_NAME=outlook
					OUTPUT_DIR="$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME"
					DOWNLOAD_DIR="$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME"

					# Skip (exit 0) when any required credential variable is empty or unset.
					for required_var in MS_CLIENT_ID MS_CLIENT_CRED MS_TENANT_ID MS_USER_EMAIL; do
					    if [ -z "${!required_var}" ]; then
					        echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
					        exit 0
					    fi
					done

					PYTHONPATH=. ./unstructured/ingest/main.py \
					    --download-dir "$DOWNLOAD_DIR" \
					    --ms-client-cred "$MS_CLIENT_CRED" \
					    --ms-client-id "$MS_CLIENT_ID" \
					    --ms-tenant "$MS_TENANT_ID" \
					    --ms-user-email "$MS_USER_EMAIL" \
					    --ms-outlook-folders IntegrationTest \
					    --metadata-exclude file_directory,metadata.data_source.date_processed \
					    --num-processes 2 \
					    --preserve-downloads \
					    --recursive \
					    --reprocess \
					    --structured-output-dir "$OUTPUT_DIR"

					# Compare the generated output with the expected output for this connector.
					sh "$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"
 | 
				
			||||||
@ -22,6 +22,7 @@ export OMP_THREAD_LIMIT=1
 | 
				
			|||||||
./test_unstructured_ingest/test-ingest-against-api.sh
 | 
					./test_unstructured_ingest/test-ingest-against-api.sh
 | 
				
			||||||
./test_unstructured_ingest/test-ingest-gcs.sh
 | 
					./test_unstructured_ingest/test-ingest-gcs.sh
 | 
				
			||||||
./test_unstructured_ingest/test-ingest-onedrive.sh
 | 
					./test_unstructured_ingest/test-ingest-onedrive.sh
 | 
				
			||||||
 | 
					./test_unstructured_ingest/test-ingest-outlook.sh
 | 
				
			||||||
./test_unstructured_ingest/test-ingest-elasticsearch.sh
 | 
					./test_unstructured_ingest/test-ingest-elasticsearch.sh
 | 
				
			||||||
./test_unstructured_ingest/test-ingest-confluence-diff.sh
 | 
					./test_unstructured_ingest/test-ingest-confluence-diff.sh
 | 
				
			||||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
 | 
					./test_unstructured_ingest/test-ingest-confluence-large.sh
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										115
									
								
								test_unstructured_ingest/unit/test_paths.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										115
									
								
								test_unstructured_ingest/unit/test_paths.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,115 @@
 | 
				
			|||||||
 | 
					from dataclasses import dataclass
 | 
				
			||||||
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from unstructured.ingest.connector.dropbox import (
 | 
				
			||||||
 | 
					    DropboxIngestDoc,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from unstructured.ingest.connector.fsspec import (
 | 
				
			||||||
 | 
					    FsspecIngestDoc,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from unstructured.ingest.interfaces import (
 | 
				
			||||||
 | 
					    BaseConnectorConfig,
 | 
				
			||||||
 | 
					    BaseIngestDoc,
 | 
				
			||||||
 | 
					    StandardConnectorConfig,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
					class FakeConfigDropboxRoot:
					    """Stand-in config for the Dropbox-root test cases.

					    The tests in this module pass the class itself as both ``config`` and
					    ``standard_config``, so the values are read as class attributes. The fields are
					    annotated so that ``@dataclass`` really registers them and an instance can
					    override any of them through the constructor.
					    """

					    output_dir: str = "/fakeuser/fake_output"
					    # A single space, not an empty string. NOTE(review): presumably how the Dropbox
					    # root is expressed by the connector -- confirm against the Dropbox connector.
					    dir_path: str = " "
					    download_dir: str = "/fakeuser/fake_download"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
					class FakeConfigFolder:
					    """Stand-in config for the test cases that use a named folder (``fake_folder``).

					    The tests in this module pass the class itself as both ``config`` and
					    ``standard_config``, so the values are read as class attributes. The fields are
					    annotated so that ``@dataclass`` really registers them and an instance can
					    override any of them through the constructor.
					    """

					    output_dir: str = "/fakeuser/fake_output"
					    dir_path: str = "fake_folder"
					    download_dir: str = "/fakeuser/fake_download"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_dropbox_root_succeeds():
					    """Dropbox root config with a slash-prefixed remote path.

					    The output and download paths are expected to land directly under the fake
					    output and download directories.
					    """
					    doc = DropboxIngestDoc(
					        remote_file_path="/fake_file.txt",
					        standard_config=FakeConfigDropboxRoot,
					        config=FakeConfigDropboxRoot,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
					    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_dropbox_root_succeeds2():
					    """Dropbox root config with a remote path that has no leading slash.

					    The same paths are expected as in the slash-prefixed case.
					    """
					    doc = DropboxIngestDoc(
					        remote_file_path="fake_file.txt",
					        standard_config=FakeConfigDropboxRoot,
					        config=FakeConfigDropboxRoot,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    expected_output = Path("/fakeuser/fake_output/fake_file.txt.json")
					    expected_download = Path("/fakeuser/fake_download/fake_file.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_dropbox_folder_succeeds():
					    """Folder config with a remote path that has no leading slash.

					    The output and download paths are expected under the fake output and download
					    directories.
					    """
					    doc = DropboxIngestDoc(
					        remote_file_path="fake_file2.txt",
					        standard_config=FakeConfigFolder,
					        config=FakeConfigFolder,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
					    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_dropbox_folder_fails():
					    """Folder config with a slash-prefixed remote path yields the WRONG paths.

					    Path joining is sensitive to the leading slash: the resulting paths are MISSING
					    the output and download folders. This test pins that behavior.
					    """
					    doc = DropboxIngestDoc(
					        remote_file_path="/fake_file2.txt",
					        standard_config=FakeConfigFolder,
					        config=FakeConfigFolder,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    # Both expected paths sit at the filesystem root; the configured dirs are lost.
					    expected_output = Path("/fake_file2.txt.json")
					    expected_download = Path("/fake_file2.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_fsspec_folder_succeeds():
					    """Fsspec doc, folder config, remote path without a leading slash.

					    The output and download paths are expected under the fake output and download
					    directories.
					    """
					    doc = FsspecIngestDoc(
					        remote_file_path="fake_file2.txt",
					        standard_config=FakeConfigFolder,
					        config=FakeConfigFolder,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    expected_output = Path("/fakeuser/fake_output/fake_file2.txt.json")
					    expected_download = Path("/fakeuser/fake_download/fake_file2.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_fsspec_folder_fails():
					    """Fsspec doc, folder config, slash-prefixed remote path yields the WRONG paths.

					    Path joining is sensitive to the leading slash: the resulting paths are MISSING
					    the output and download folders. This test pins that behavior.
					    """
					    doc = FsspecIngestDoc(
					        remote_file_path="/fake_file2.txt",
					        standard_config=FakeConfigFolder,
					        config=FakeConfigFolder,
					    )
					    actual_output = doc._output_filename
					    actual_download = doc._tmp_download_file()

					    # Both expected paths sit at the filesystem root; the configured dirs are lost.
					    expected_output = Path("/fake_file2.txt.json")
					    expected_download = Path("/fake_file2.txt")
					    assert actual_output == expected_output
					    assert actual_download == expected_download
 | 
				
			||||||
@ -1 +1 @@
 | 
				
			|||||||
__version__ = "0.8.2-dev7"  # pragma: no cover
 | 
					__version__ = "0.8.3"  # pragma: no cover
 | 
				
			||||||
 | 
				
			|||||||
@ -166,9 +166,9 @@ class FsspecConnector(ConnectorCleanupMixin, BaseConnector):
 | 
				
			|||||||
    def get_ingest_docs(self):
 | 
					    def get_ingest_docs(self):
 | 
				
			||||||
        return [
 | 
					        return [
 | 
				
			||||||
            self.ingest_doc_cls(
 | 
					            self.ingest_doc_cls(
 | 
				
			||||||
                self.standard_config,
 | 
					                standard_config=self.standard_config,
 | 
				
			||||||
                self.config,
 | 
					                config=self.config,
 | 
				
			||||||
                file,
 | 
					                remote_file_path=file,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            for file in self._list_files()
 | 
					            for file in self._list_files()
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										230
									
								
								unstructured/ingest/connector/outlook.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										230
									
								
								unstructured/ingest/connector/outlook.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,230 @@
 | 
				
			|||||||
 | 
					import hashlib
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					from collections import defaultdict
 | 
				
			||||||
 | 
					from dataclasses import dataclass, field
 | 
				
			||||||
 | 
					from itertools import chain
 | 
				
			||||||
 | 
					from pathlib import Path
 | 
				
			||||||
 | 
					from typing import List
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from office365.onedrive.driveitems.driveItem import DriveItem
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from unstructured.ingest.interfaces import (
 | 
				
			||||||
 | 
					    BaseConnector,
 | 
				
			||||||
 | 
					    BaseConnectorConfig,
 | 
				
			||||||
 | 
					    BaseIngestDoc,
 | 
				
			||||||
 | 
					    ConnectorCleanupMixin,
 | 
				
			||||||
 | 
					    IngestDocCleanupMixin,
 | 
				
			||||||
 | 
					    StandardConnectorConfig,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from unstructured.ingest.logger import logger
 | 
				
			||||||
 | 
					from unstructured.utils import requires_dependencies
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Maximum number of emails requested per folder (passed to ``.top()`` when listing).
					MAX_NUM_EMAILS = 1_000_000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class MissingFolderError(Exception):
					    """Raised when none of the requested names match a root mail folder."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
					class SimpleOutlookConfig(BaseConnectorConfig):
					    """Settings for the Outlook connector, plus the callback that acquires its token.

					    After validation, ``token_factory`` is set to the bound ``_acquire_token``
					    method so it can be handed to the Graph client as a token callback.
					    """

					    client_id: str
					    client_credential: str = field(repr=False)
					    user_email: str
					    tenant: str = field(repr=False)
					    authority_url: str = field(repr=False)
					    ms_outlook_folders: List[str]
					    recursive: bool = False

					    def __post_init__(self):
					        """Validate the mandatory values and expose ``token_factory``.

					        Raises:
					            ValueError: if ``client_id``, ``client_credential`` or ``user_email`` is
					                empty or missing.
					        """
					        if not (self.client_id and self.client_credential and self.user_email):
					            # All three values are required; flag names match the CLI options.
					            raise ValueError(
					                "Please provide all of the following mandatory values:"
					                "\n--ms-client-id\n--ms-client-cred\n--ms-user-email",
					            )
					        self.token_factory = self._acquire_token

					    @requires_dependencies(["msal"])
					    def _acquire_token(self):
					        """Acquire a Microsoft Graph token with the configured client id and credential.

					        Returns:
					            Whatever ``ConfidentialClientApplication.acquire_token_for_client`` returns.

					        Raises:
					            ValueError: re-raised (after logging) when msal rejects the configuration.
					        """
					        from msal import ConfidentialClientApplication

					        try:
					            app = ConfidentialClientApplication(
					                authority=f"{self.authority_url}/{self.tenant}",
					                client_id=self.client_id,
					                client_credential=self.client_credential,
					            )
					            token = app.acquire_token_for_client(
					                scopes=["https://graph.microsoft.com/.default"],
					            )
					        except ValueError:
					            logger.error("Couldn't set up credentials for Outlook")
					            # Bare raise keeps the original traceback intact.
					            raise
					        return token

					    @staticmethod
					    def parse_folders(folder_str: str) -> List[str]:
					        """Parses a comma separated string of Outlook folders into a list.

					        Whitespace around each name is stripped. Blank entries (for example from a
					        trailing comma) are dropped, and an empty or missing string gives ``[]``.
					        """
					        if not folder_str:
					            return []
					        stripped_names = (part.strip() for part in folder_str.split(","))
					        return [name for name in stripped_names if name]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@dataclass
					class OutlookIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
					    """One Outlook message, downloaded as an ``.eml`` file for processing.

					    The download and output file names are built from a short hash of the message id.
					    """

					    config: SimpleOutlookConfig
					    # The message object to download; the annotation says DriveItem.
					    file: DriveItem

					    def __post_init__(self):
					        """Compute the download and output paths when the doc is created."""
					        self._set_download_paths()

					    def hash_mail_name(self, id):
					        """Return the first 16 hex characters of the SHA-256 digest of ``id``.

					        Outlook email ids are long (152 characters), so they are shortened before
					        being used in file names.
					        """
					        digest = hashlib.sha256(id.encode("utf-8")).hexdigest()
					        return digest[:16]

					    def _set_download_paths(self) -> None:
					        """Set the download/output directories and the per-message file paths."""
					        self.download_dir = Path(f"{self.standard_config.download_dir}")
					        self.output_dir = Path(f"{self.standard_config.output_dir}")

					        eml_name = f"{self.hash_mail_name(self.file.id)}.eml"
					        self.download_filepath = (self.download_dir / eml_name).resolve()
					        self.output_filepath = (self.output_dir / f"{eml_name}.json").resolve()

					    @property
					    def filename(self):
					        """Resolved path of the downloaded ``.eml`` file."""
					        return Path(self.download_filepath).resolve()

					    @property
					    def _output_filename(self):
					        """Resolved path of the structured output ``.eml.json`` file."""
					        return Path(self.output_filepath).resolve()

					    @BaseIngestDoc.skip_if_file_exists
					    @requires_dependencies(["office365"])
					    def get_file(self):
					        """Download the message into the download directory via the Office365 SDK.

					        Any exception is logged and swallowed, so a failed download does not raise.
					        """
					        try:
					            if not self.download_dir.is_dir():
					                logger.debug(f"Creating directory: {self.download_dir}")
					                self.download_dir.mkdir(parents=True, exist_ok=True)

					            target = os.path.join(
					                self.download_dir,
					                self.hash_mail_name(self.file.id) + ".eml",
					            )
					            with open(target, "wb") as local_file:
					                # Downloads the MIME representation of the message into the handle.
					                download_query = self.file.download(local_file)
					                download_query.execute_query()
					        except Exception as e:
					            logger.error(
					                f"Error while downloading and saving file: {self.file.subject}.",
					            )
					            logger.error(e)
					        else:
					            logger.info(f"File downloaded: {self.file.subject}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class OutlookConnector(ConnectorCleanupMixin, BaseConnector):
					    """Collects the messages in the requested Outlook root folders and their subfolders."""

					    config: SimpleOutlookConfig

					    def __init__(
					        self,
					        standard_config: StandardConnectorConfig,
					        config: SimpleOutlookConfig,
					    ):
					        """Create the Graph client and resolve the requested folder ids right away.

					        Raises:
					            MissingFolderError: if no root folder matches ``config.ms_outlook_folders``.
					        """
					        super().__init__(standard_config, config)
					        self._set_client()
					        self.get_folder_ids()

					    @requires_dependencies(["office365"])
					    def _set_client(self):
					        """Build the Graph client using the config's token callback."""
					        from office365.graph_client import GraphClient

					        self.client = GraphClient(self.config.token_factory)

					    def initialize(self):
					        """No-op: the client and folder ids are already set up in ``__init__``."""

					    def recurse_folders(self, folder_id, main_folder_dict):
					        """Add the ids of all folders nested under ``folder_id`` to ``main_folder_dict``.

					        We only get a count of subfolders for any folder, so additional calls are
					        needed to get the subfolder ids. Each subfolder id is appended to the list
					        that already contains its parent's id; ``main_folder_dict`` is mutated in place.
					        """
					        subfolders = (
					            self.client.users[self.config.user_email]
					            .mail_folders[folder_id]
					            .child_folders.get()
					            .execute_query()
					        )
					        for subfolder in subfolders:
					            parent_id = subfolder.get_property("parentFolderId")
					            for folder_ids in main_folder_dict.values():
					                if parent_id in folder_ids:
					                    folder_ids.append(subfolder.id)
					            if subfolder.get_property("childFolderCount") > 0:
					                self.recurse_folders(subfolder.id, main_folder_dict)

					    def get_folder_ids(self):
					        """Sets the mail folder ids and subfolder ids for requested root mail folders.

					        Populates ``self.root_folders`` (display name -> folder ids, subfolders
					        included) and ``self.selected_folder_ids`` (ids under the requested names,
					        matched case-insensitively).

					        Raises:
					            MissingFolderError: if none of the requested names match a root folder.
					        """
					        # NOTE(review): ``self.config.recursive`` is not consulted here; subfolders are
					        # always traversed. Confirm whether that is intended.
					        self.root_folders = defaultdict(list)
					        root_folders_with_subfolders = []
					        root_folders = (
					            self.client.users[self.config.user_email].mail_folders.get().execute_query()
					        )

					        for folder in root_folders:
					            self.root_folders[folder.display_name].append(folder.id)
					            if folder.get_property("childFolderCount") > 0:
					                root_folders_with_subfolders.append(folder.id)

					        for root_folder_id in root_folders_with_subfolders:
					            self.recurse_folders(root_folder_id, self.root_folders)

					        # Narrow down all mail folder ids (plus all subfolders) to the ones that were
					        # requested. The lower-cased names are computed once, not per root folder.
					        requested_names = {name.lower() for name in self.config.ms_outlook_folders}
					        self.selected_folder_ids = list(
					            chain.from_iterable(
					                folder_ids
					                for display_name, folder_ids in self.root_folders.items()
					                if display_name.lower() in requested_names
					            ),
					        )
					        if not self.selected_folder_ids:
					            raise MissingFolderError(
					                f"There are no root folders with the names: {self.config.ms_outlook_folders}",
					            )

					    def get_ingest_docs(self):
					        """Returns one ``OutlookIngestDoc`` per message in the selected folders."""
					        filtered_messages = []

					        # Get all the relevant messages in the selected folders/subfolders.
					        for folder_id in self.selected_folder_ids:
					            messages = (
					                self.client.users[self.config.user_email]
					                .mail_folders[folder_id]
					                .messages.get()
					                .top(MAX_NUM_EMAILS)  # Prevents the return from paging
					                .execute_query()
					            )
					            # Skip empty list if there are no messages in folder.
					            if messages:
					                filtered_messages.append(messages)

					        # Filtered messages have an un-downloadable resource path.
					        # So we get each message object individually. The listing is materialized
					        # first so that no per-message query runs while the collections are iterated.
					        listed_messages = list(chain.from_iterable(filtered_messages))
					        individual_messages = [
					            self.client.users[self.config.user_email].messages[m.id].get().execute_query()
					            for m in listed_messages
					        ]

					        return [OutlookIngestDoc(self.standard_config, self.config, f) for f in individual_messages]
 | 
				
			||||||
@ -427,6 +427,17 @@ class MainProcess:
 | 
				
			|||||||
    default=None,
 | 
					    default=None,
 | 
				
			||||||
    help="Folder to start parsing files from.",
 | 
					    help="Folder to start parsing files from.",
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					@click.option(
 | 
				
			||||||
 | 
					    "--ms-user-email",
 | 
				
			||||||
 | 
					    default=None,
 | 
				
			||||||
 | 
					    help="Outlook email to download messages from.",
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					@click.option(
 | 
				
			||||||
 | 
					    "--ms-outlook-folders",
 | 
				
			||||||
 | 
					    default=None,
 | 
				
			||||||
 | 
					    help="Comma separated list of folders to download email messages from. "
 | 
				
			||||||
 | 
					    "Do not specify subfolders. Use quotes if spaces in folder names.",
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
@click.option(
 | 
					@click.option(
 | 
				
			||||||
    "--elasticsearch-url",
 | 
					    "--elasticsearch-url",
 | 
				
			||||||
    default=None,
 | 
					    default=None,
 | 
				
			||||||
@ -568,6 +579,8 @@ def main(
 | 
				
			|||||||
    ms_tenant,
 | 
					    ms_tenant,
 | 
				
			||||||
    ms_user_pname,
 | 
					    ms_user_pname,
 | 
				
			||||||
    ms_onedrive_folder,
 | 
					    ms_onedrive_folder,
 | 
				
			||||||
 | 
					    ms_user_email,
 | 
				
			||||||
 | 
					    ms_outlook_folders,
 | 
				
			||||||
    elasticsearch_url,
 | 
					    elasticsearch_url,
 | 
				
			||||||
    elasticsearch_index_name,
 | 
					    elasticsearch_index_name,
 | 
				
			||||||
    jq_query,
 | 
					    jq_query,
 | 
				
			||||||
@ -681,6 +694,8 @@ def main(
 | 
				
			|||||||
            hashed_dir_name = hashlib.sha256(
 | 
					            hashed_dir_name = hashlib.sha256(
 | 
				
			||||||
                f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
 | 
					                f"{ms_tenant}_{ms_user_pname}".encode("utf-8"),
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					        elif ms_user_email:
 | 
				
			||||||
 | 
					            hashed_dir_name = hashlib.sha256(ms_user_email.encode("utf-8"))
 | 
				
			||||||
        elif confluence_url:
 | 
					        elif confluence_url:
 | 
				
			||||||
            hashed_dir_name = hashlib.sha256(
 | 
					            hashed_dir_name = hashlib.sha256(
 | 
				
			||||||
                f"{confluence_url}".encode("utf-8"),
 | 
					                f"{confluence_url}".encode("utf-8"),
 | 
				
			||||||
@ -910,7 +925,7 @@ def main(
 | 
				
			|||||||
                decay=biomed_decay,
 | 
					                decay=biomed_decay,
 | 
				
			||||||
            ),
 | 
					            ),
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    elif ms_client_id or ms_user_pname:
 | 
					    elif ms_client_id and ms_user_pname:
 | 
				
			||||||
        from unstructured.ingest.connector.onedrive import (
 | 
					        from unstructured.ingest.connector.onedrive import (
 | 
				
			||||||
            OneDriveConnector,
 | 
					            OneDriveConnector,
 | 
				
			||||||
            SimpleOneDriveConfig,
 | 
					            SimpleOneDriveConfig,
 | 
				
			||||||
@ -929,6 +944,25 @@ def main(
 | 
				
			|||||||
            ),
 | 
					            ),
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    elif ms_client_id and ms_user_email:
 | 
				
			||||||
 | 
					        from unstructured.ingest.connector.outlook import (
 | 
				
			||||||
 | 
					            OutlookConnector,
 | 
				
			||||||
 | 
					            SimpleOutlookConfig,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        doc_connector = OutlookConnector(  # type: ignore
 | 
				
			||||||
 | 
					            standard_config=standard_config,
 | 
				
			||||||
 | 
					            config=SimpleOutlookConfig(
 | 
				
			||||||
 | 
					                client_id=ms_client_id,
 | 
				
			||||||
 | 
					                client_credential=ms_client_cred,
 | 
				
			||||||
 | 
					                user_email=ms_user_email,
 | 
				
			||||||
 | 
					                tenant=ms_tenant,
 | 
				
			||||||
 | 
					                authority_url=ms_authority_url,
 | 
				
			||||||
 | 
					                ms_outlook_folders=SimpleOutlookConfig.parse_folders(ms_outlook_folders),
 | 
				
			||||||
 | 
					                recursive=recursive,
 | 
				
			||||||
 | 
					            ),
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    elif local_input_path:
 | 
					    elif local_input_path:
 | 
				
			||||||
        from unstructured.ingest.connector.local import (
 | 
					        from unstructured.ingest.connector.local import (
 | 
				
			||||||
            LocalConnector,
 | 
					            LocalConnector,
 | 
				
			||||||
 | 
				
			|||||||
@ -111,7 +111,7 @@ def build_email_metadata(msg: Message, filename: Optional[str]) -> ElementMetada
 | 
				
			|||||||
    if email_date is not None:
 | 
					    if email_date is not None:
 | 
				
			||||||
        email_date = convert_to_iso_8601(email_date)
 | 
					        email_date = convert_to_iso_8601(email_date)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    sent_from = header_dict.get("To")
 | 
					    sent_from = header_dict.get("From")
 | 
				
			||||||
    if sent_from is not None:
 | 
					    if sent_from is not None:
 | 
				
			||||||
        sent_from = [sender.strip() for sender in sent_from.split(",")]
 | 
					        sent_from = [sender.strip() for sender in sent_from.split(",")]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user