diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8084f3fdd..ea6b3b300 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -287,6 +287,7 @@ jobs: SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} run: | source .venv/bin/activate sudo apt-get update @@ -311,6 +312,7 @@ jobs: make install-ingest-outlook make install-ingest-slack make install-ingest-wikipedia + make install-ingest-notion ./test_unstructured_ingest/test-ingest.sh test_unstructured_api_unit: diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index fbd214f18..a9b873c0b 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -78,6 +78,7 @@ jobs: SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} OVERWRITE_FIXTURES: "true" run: | source .venv/bin/activate @@ -103,6 +104,7 @@ jobs: make install-ingest-outlook make install-ingest-slack make install-ingest-wikipedia + make install-ingest-notion ./test_unstructured_ingest/test-ingest.sh - name: Save branch name to environment file diff --git a/CHANGELOG.md b/CHANGELOG.md index 31d359c1a..948ef5881 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ * Update `partition_xlsx` to always use `soupparser_fromstring` to parse `html text` * Add functionality to switch `html` text parser based on whether the `html` text contains emoji * Add functionality to check if a string contains any emoji characters +* Add CI tests around Notion ### Features @@ -91,6 +92,8 @@ - better caching of models - another version of detectron2 available, though the default layout model is unchanged * Added UUID option for element_id +* Added UUID option for element_id +* CI improvements to run ingest tests in parallel ### Features diff --git a/Dockerfile b/Dockerfile index cb3a9b709..dd7da6a00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,11 +21,18 @@ RUN python3.8 -m pip install pip==${PIP_VERSION} && \ pip install --no-cache -r requirements/test.txt && \ pip install --no-cache -r requirements/huggingface.txt && \ pip install --no-cache -r requirements/dev.txt && \ - pip install --no-cache -r requirements/ingest-azure.txt && \ pip install --no-cache -r requirements/ingest-box.txt && \ + pip install --no-cache -r requirements/ingest-confluence.txt && \ + pip install --no-cache -r requirements/ingest-discord.txt && \ + pip install --no-cache -r requirements/ingest-dropbox.txt && \ + pip install --no-cache -r requirements/ingest-elasticsearch.txt && \ + pip install --no-cache -r requirements/ingest-gcs.txt && \ pip install --no-cache -r requirements/ingest-github.txt && \ pip install --no-cache -r requirements/ingest-gitlab.txt && \ pip install --no-cache -r requirements/ingest-google-drive.txt && \ + pip install --no-cache -r requirements/ingest-notion.txt && \ + pip install --no-cache -r requirements/ingest-onedrive.txt && \ + pip install --no-cache -r requirements/ingest-outlook.txt && \ pip install --no-cache -r requirements/ingest-reddit.txt && \ pip install --no-cache -r requirements/ingest-s3.txt && \ pip install --no-cache -r requirements/ingest-slack.txt && \ diff --git a/Makefile b/Makefile index 6aa3af451..8cf8f9ffe 100644 --- a/Makefile +++ b/Makefile @@ -132,6 +132,10 @@ install-ingest-discord: install-ingest-github: python3 -m pip install -r requirements/ingest-github.txt +.PHONY: install-ingest-biomed +install-ingest-biomed: + python3 -m pip install -r requirements/ingest-biomed.txt + .PHONY: install-ingest-gitlab install-ingest-gitlab: python3 -m pip install -r requirements/ingest-gitlab.txt @@ -172,6 +176,14 @@ install-ingest-airtable: install-ingest-sharepoint: python3 -m pip install -r requirements/ingest-sharepoint.txt +.PHONY: install-ingest-local +install-ingest-local: + echo "no unique dependencies for local connector" + +.PHONY: install-ingest-notion +install-ingest-notion: + python3 -m pip install -r requirements/ingest-notion.txt + .PHONY: install-unstructured-inference install-unstructured-inference: python3 -m pip install -r requirements/local-inference.txt @@ -211,6 +223,7 @@ pip-compile: # sphinx docs looks for additional requirements cp requirements/build.txt docs/requirements.txt pip-compile --upgrade requirements/ingest-s3.in + pip-compile --upgrade requirements/ingest-biomed.in pip-compile --upgrade requirements/ingest-box.in pip-compile --upgrade requirements/ingest-gcs.in pip-compile --upgrade requirements/ingest-dropbox.in diff --git a/requirements/ingest-biomed.in b/requirements/ingest-biomed.in new file mode 100644 index 000000000..0e87b2ac1 --- /dev/null +++ b/requirements/ingest-biomed.in @@ -0,0 +1,3 @@ +-c constraints.in +-c base.txt +bs4 diff --git a/requirements/ingest-biomed.txt b/requirements/ingest-biomed.txt new file mode 100644 index 000000000..92f9fdccc --- /dev/null +++ b/requirements/ingest-biomed.txt @@ -0,0 +1,12 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements/ingest-biomed.in +# +beautifulsoup4==4.12.2 + # via bs4 +bs4==0.0.1 + # via -r requirements/ingest-biomed.in +soupsieve==2.4.1 + # via beautifulsoup4 diff --git a/requirements/ingest-gcs.in b/requirements/ingest-gcs.in index 8c527ae9f..de522c3f4 100644 --- a/requirements/ingest-gcs.in +++ b/requirements/ingest-gcs.in @@ -2,3 +2,4 @@ -c base.txt gcsfs fsspec +bs4 diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 328bd62cf..91f8eac08 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -12,6 +12,10 @@ async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via aiohttp +beautifulsoup4==4.12.2 + # via bs4 +bs4==0.0.1 + # via -r requirements/ingest-gcs.in cachetools==5.3.1 # via google-auth certifi==2023.7.22 @@ -94,6 +98,8 @@ rsa==4.9 # via google-auth six==1.16.0 # via google-auth +soupsieve==2.4.1 + # via beautifulsoup4 urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-onedrive.in b/requirements/ingest-onedrive.in index d5cd7d03c..fa72a6fd4 100644 --- a/requirements/ingest-onedrive.in +++ b/requirements/ingest-onedrive.in @@ -2,3 +2,4 @@ -c base.txt msal Office365-REST-Python-Client<2.4.3 +bs4 diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index c9c0c921d..aae9e6ce2 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -4,6 +4,12 @@ # # pip-compile requirements/ingest-onedrive.in # +beautifulsoup4==4.12.2 + # via + # -c requirements/base.txt + # bs4 +bs4==0.0.1 + # via -r requirements/ingest-onedrive.in certifi==2023.7.22 # via # -c requirements/base.txt @@ -42,6 +48,10 @@ requests==2.31.0 # -c requirements/base.txt # msal # office365-rest-python-client +soupsieve==2.4.1 + # via + # -c requirements/base.txt + # beautifulsoup4 urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/setup.py b/setup.py index d0bbf3a89..d2e9809c3 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,7 @@ setup( # Extra requirements for data connectors "s3": load_requirements("requirements/ingest-s3.in"), "azure": load_requirements("requirements/ingest-azure.in"), + "biomed": load_requirements("requirements/ingest-biomed.in"), "discord": load_requirements("requirements/ingest-discord.in"), "github": load_requirements("requirements/ingest-github.in"), "gitlab": load_requirements("requirements/ingest-gitlab.in"), diff --git a/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json b/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json new file mode 100644 index 000000000..6304ffa3f --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/122b2c22-996b-435b-9de2-ee0e9d2b04bc.json @@ -0,0 +1,981 @@ +[ + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "7b544ee99a84930c8049d5c91f8e7541", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "New Page" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "a3bc48c9c0c00bd86bfcefcb833d3fd4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Morale Events" + }, + { + "type": "Title", + "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Policies" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Policies" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "9eca9d6f69bb98c4ec616c4aec38d0d2", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T19:02:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "c502fd59c2cdff4881f98c3ce019dc77", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "New Page With Verification" + }, + { + "type": "UncategorizedText", + "element_id": "20079ac60749535ee21512b3091a61e0", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "expired" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "expired\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-08-11T04:00:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "c911244e369f9ee203656a820c260e4d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Vacation Policy" + }, + { + "type": "Title", + "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Policies" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Policies" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "94efbf7307081f8f45b11a183ad99254", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Mission, Vision, Values" + }, + { + "type": "UncategorizedText", + "element_id": "575d595cf4830f838cc79edf3a4bd5fc", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Vision", + "Company Updates" + ], + "emphasized_text_tags": [ + "span", + "span" + ] + }, + "text": "Vision\n \n \n Company Updates" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "b2d356b3e28717647c73b8767da6c485", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Recent Press" + }, + { + "type": "Title", + "element_id": "67538900b235164b3f1debd8a8d80b44", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Company Updates" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Company Updates" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "d00eca1bae6742803906ab42a831e8b5", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Getting Started" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "a9e87d3147c54fd5fa061709e15ed0bf", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-17T18:48:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "5687503bd741f54090d4c0557c0eea1a", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Page with every block" + }, + { + "type": "UncategorizedText", + "element_id": "d7501f757bf490f053005b707829e343", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Company Updates", + "Policies" + ], + "emphasized_text_tags": [ + "span", + "span" + ] + }, + "text": "Company Updates\n \n \n Policies" + }, + { + "type": "UncategorizedText", + "element_id": "aad0d1a0dbac83ea1906db66ecbff086", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "verified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "verified\n \n \n Roman Isecke\n \n \n 2023-08-04T04:00:00.000Z - 2023-11-02T04:00:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "b2c1cf36a9b45cdefac07d1899b96ff1", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Corporate Travel" + }, + { + "type": "Title", + "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Policies" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Policies" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "d8d2a2140ba63413c452dbefe499f90b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-04T18:31:00.000Z" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ], + "emphasized_text_contents": [ + "Roman Isecke" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Roman Isecke" + }, + { + "type": "Title", + "element_id": "8bcdb5d9bc2bda33af04bae4495f5e37", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Benefits Policies" + }, + { + "type": "Title", + "element_id": "d3ad1f1f8c9c4f5a4a593571085513a4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Policies" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Policies" + }, + { + "type": "Title", + "element_id": "97b7e2db799e2b79e65f418b42a7d305", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:44:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "unverified" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "unverified" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json b/test_unstructured_ingest/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json new file mode 100644 index 000000000..134ff5d80 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/438dbc49-2e06-4f01-8031-bf283be58a60.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "7b544ee99a84930c8049d5c91f8e7541", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "New Page" + }, + { + "type": "NarrativeText", + "element_id": "be23aed1a36d4a5aa33b4dc454eff351", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json b/test_unstructured_ingest/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json new file mode 100644 index 000000000..d7f0bfc80 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/4695ea53-f2b3-45b4-8638-2212fd054d73.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "a3bc48c9c0c00bd86bfcefcb833d3fd4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Morale Events" + }, + { + "type": "NarrativeText", + "element_id": "89f7608e949e257e04b601964f7fab2d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Morale events increase employee satisfaction, motivation, and well-being, while promoting community and teamwork, resulting in higher productivity and retention rates." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json b/test_unstructured_ingest/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json new file mode 100644 index 000000000..07cbd9635 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/5481f29c-799a-4d7b-93ce-b11bcaede531.json @@ -0,0 +1,41 @@ +[ + { + "type": "Title", + "element_id": "c502fd59c2cdff4881f98c3ce019dc77", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "New Page With Verification" + }, + { + "type": "NarrativeText", + "element_id": "be23aed1a36d4a5aa33b4dc454eff351", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you." + }, + { + "type": "NarrativeText", + "element_id": "b086a1d1099369a71964546dc89b2323", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T19:02:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: An owner of a page can verify it by clicking on the verification button above and choosing to verify the page for either a set amount of time or indefinitely!" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json new file mode 100644 index 000000000..a12a7d39d --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "c911244e369f9ee203656a820c260e4d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Vacation Policy" + }, + { + "type": "NarrativeText", + "element_id": "94bc9e2e465cfac3060a7f7ab8082e89", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Vacation policies are crucial for employee well-being and productivity. They provide rest and recharge, reduce burnout and increase job satisfaction." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json b/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json new file mode 100644 index 000000000..cffc024be --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/898538f2-26e1-4de7-81e6-354045d4d007.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "94efbf7307081f8f45b11a183ad99254", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Mission, Vision, Values" + }, + { + "type": "NarrativeText", + "element_id": "f116dc480f737022b3eef55d2095d808", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: A company mission provides direction and purpose, aligning actions and decisions towards a common goal. It also helps attract like-minded individuals who share the same values and vision for the company." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json b/test_unstructured_ingest/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/8d8bee42-2167-441c-af6c-7b2cff268809.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json b/test_unstructured_ingest/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json new file mode 100644 index 000000000..d1911cf19 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/8db7ccc9-0a9c-4168-94c3-f997e60cb8cf.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "b2d356b3e28717647c73b8767da6c485", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Recent Press" + }, + { + "type": "NarrativeText", + "element_id": "22f92b2ebdefec36664fc1cb69221f2b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Telling employees about news about your company is important because it helps them stay informed about the direction of the company and their role in it." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json b/test_unstructured_ingest/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json new file mode 100644 index 000000000..1ed01f379 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/9e20be3d-cbe0-4e28-ad46-2170d40a8d37.json @@ -0,0 +1,15 @@ +[ + { + "type": "Title", + "element_id": "18e350f89256491ebe1f8cce73a45231", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Sprint 3" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json b/test_unstructured_ingest/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json new file mode 100644 index 000000000..d3ee5a0d7 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/b2a12157-721e-4207-b3b7-527762b782c2.json @@ -0,0 +1,293 @@ +[ + { + "type": "Title", + "element_id": "d00eca1bae6742803906ab42a831e8b5", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Getting Started" + }, + { + "type": "NarrativeText", + "element_id": "be23aed1a36d4a5aa33b4dc454eff351", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: When creating a page, it's important to give it a clear title and provide some content. This could include verifying the information, summarizing the topic, or sharing your thoughts and opinions on something that matters to you." + }, + { + "type": "Title", + "element_id": "a5e729fb76c8c30039cbdb4c1a1f631f", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "The Basics" + }, + { + "type": "Title", + "element_id": "c0afdd6fd0720c2d378ca3ea772d5746", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Create a Page" + }, + { + "type": "NarrativeText", + "element_id": "6f7bd5b1aa870cb1bc65673b13c4f443", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Workspace" + ], + "emphasized_text_tags": [ + "b" + ] + }, + "text": "\n In your sidebar, click the \n " + }, + { + "type": "Title", + "element_id": "a3e7821cf681efe356a3725fb74e5afb", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Headings" + }, + { + "type": "NarrativeText", + "element_id": "de4d0201434c605453f8882bc60ed14f", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "You can add headings and subheadings in one of two ways:" + }, + { + "type": "ListItem", + "element_id": "e29f621fe1abe6865d3a18f3b9366bbb", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Type \n \n /heading\n \n or \n \n /h1\n \n , \n \n /h2\n \n , or \n \n /h3\n \n to choose the heading size you want." + }, + { + "type": "ListItem", + "element_id": "9ff65a4dda597336f4b5800282db7239", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Use Markdown shortcuts, like \n \n #\n \n , \n \n ##\n \n , and \n \n ###\n \n ." + }, + { + "type": "ListItem", + "element_id": "a64d13d5e52c832a2b5f7b5f6cfefc09", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Create inline code by wrapping text with \n \n `\n \n (or with the shortcut \n \n cmd/ctrl + e\n \n )." + }, + { + "type": "Title", + "element_id": "8b218d87a7b0992b03cbba22ed07d466", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Toggle Lists" + }, + { + "type": "NarrativeText", + "element_id": "de273473055b8b700d97642a29e6a787", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Toggle lists streamline your content. Click the arrow to open." + }, + { + "type": "Title", + "element_id": "660cca99d9a1ad75e7c0d12644c68d71", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Callout Blocks" + }, + { + "type": "Title", + "element_id": "60d3ab7e0452a72d68e57a722376ab19", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Code Blocks" + }, + { + "type": "NarrativeText", + "element_id": "7b82449d1e44001326797d90184eae33", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "You can add code notation to any Notion page:" + }, + { + "type": "ListItem", + "element_id": "66d838acacfc8d48cdd7714882dfc4ef", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Your teammates can select any code to comment on it." + }, + { + "type": "Title", + "element_id": "d7b5c2955f161528fd051e6273cfe1aa", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Organizing Pages" + }, + { + "type": "NarrativeText", + "element_id": "0a296a1b32283aa06e695312b049892b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "\n Instead of using folders, Notion lets you nest pages inside pages. Type \n " + }, + { + "type": "Title", + "element_id": "54844ea8733bd62ab0d7c3b35bde1bc2", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Advanced Techniques" + }, + { + "type": "NarrativeText", + "element_id": "b8736e38d7aeb7d60b821eb715ac9bbc", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" + ], + "link_texts": [ + "\n Notion Editor 101\n " + ] + }, + "text": "\n Check out this \n " + }, + { + "type": "UncategorizedText", + "element_id": "23b9158a48adb937cd37eae997f2dbe6", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/notion/Notion-editor-101-create-and-edit-68c7c67047494fdb87d50185429df93e" + ], + "link_texts": [ + "\n Notion Editor 101\n " + ] + }, + "text": "Notion Editor 101\n \n guide for more advanced tips and how-to's." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json new file mode 100644 index 000000000..ea8a504a4 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json @@ -0,0 +1,704 @@ +[ + { + "type": "Title", + "element_id": "5687503bd741f54090d4c0557c0eea1a", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Page with every block" + }, + { + "type": "NarrativeText", + "element_id": "197dece4986f325b5d51a9e9d50eb0a6", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Tag pages to let collaborators know what they can expect to use the page for. You can add one or many tags to any page in a wiki." + }, + { + "type": "Title", + "element_id": "3291955e900530da1940bdce97fa0f94", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Heading 2" + }, + { + "type": "NarrativeText", + "element_id": "74ee7ebc5b884a20b9995eef3f2e90bb", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "This is some new text" + }, + { + "type": "NarrativeText", + "element_id": "0e417d4322de881c074174e590eb664f", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "link_texts": [ + "\n text\n " + ], + "emphasized_text_contents": [ + "formatted" + ], + "emphasized_text_tags": [ + "b" + ] + }, + "text": "\n Some/less → \n more\n \n " + }, + { + "type": "UncategorizedText", + "element_id": "e302f58e2f42e9bcf16d78f5829ac32d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "link_texts": [ + "\n text\n " + ] + }, + "text": "text\n \n with other" + }, + { + "type": "UncategorizedText", + "element_id": "8864784f943d9f832a3dce22ef8bcf01", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "content" + ], + "emphasized_text_tags": [ + "b" + ] + }, + "text": "c1r1 \n \n content" + }, + { + "type": "UncategorizedText", + "element_id": "6f75c9d2993dbb3981c019741c7962a9", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "c2r1 table \n 2023-08-08T09:00:00.000-04:00\n cell" + }, + { + "type": "Title", + "element_id": "5687503bd741f54090d4c0557c0eea1a", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/c47a45664c7a488bac2a1292ee507fcb" + ], + "link_texts": [ + "\n Page with every block \n " + ] + }, + "text": "Page with every block" + }, + { + "type": "UncategorizedText", + "element_id": "13686520a51e25584bb06ab189b38552", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "/122b2c22996b435b9de2ee0e9d2b04bc" + ], + "link_texts": [ + "\n content\n " + ] + }, + "text": "c1r2 more \n \n content" + }, + { + "type": "UncategorizedText", + "element_id": "cf236cfe4b4c0ef644c37b4e491a4aa8", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "cell" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "c2r2 table \n \n cell" + }, + { + "type": "Title", + "element_id": "f59ab8d1331b7b16952fbd388258f856", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "link_texts": [ + "\n Untitled\n " + ] + }, + "text": "Untitled" + }, + { + "type": "NarrativeText", + "element_id": "7d96ce60a66271ef79da4c492ca7db8a", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "this is some green text" + }, + { + "type": "NarrativeText", + "element_id": "2d77a706008eebaf1f7c4e116bbe08b4", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "an", + "equation" + ], + "emphasized_text_tags": [ + "b", + "b" + ] + }, + "text": "this is \n \n an \n \n \n equation" + }, + { + "type": "Title", + "element_id": "f59ab8d1331b7b16952fbd388258f856", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/a1a5dff426f34b8f9a709d51b2a00c73" + ], + "link_texts": [ + "\n Untitled\n " + ] + }, + "text": "Untitled" + }, + { + "type": "UncategorizedText", + "element_id": "7e921a403f1840728e2887990cfe640d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "text2" + ], + "emphasized_text_tags": [ + "i" + ] + }, + "text": "text1\n\n\n \n text2\n \n \n\nMultiline cell" + }, + { + "type": "Title", + "element_id": "7013d5bb5a17e0e782e8971e23640bdb", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Another cell" + }, + { + "type": "Title", + "element_id": "f59ab8d1331b7b16952fbd388258f856", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://www.notion.so/84002066546448d0a030aa79b8d400b0" + ], + "link_texts": [ + "\n Untitled\n " + ] + }, + "text": "Untitled" + }, + { + "type": "UncategorizedText", + "element_id": "07d8cee827eb828e2a5eb7de833bbbe5", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "E = {mc^2}" + }, + { + "type": "ListItem", + "element_id": "cc62f6c7b2a82fd1677dd9f8bd7d22a0", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Numbered list" + }, + { + "type": "ListItem", + "element_id": "6834fb0bd1686c896a94d373f3b4b775", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "A number child" + }, + { + "type": "ListItem", + "element_id": "ca680621d39d5b8acce02dfc6a97a722", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "A number grandchild" + }, + { + "type": "ListItem", + "element_id": "87765da6ccf0668238c1d27c35692e11", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "great" + }, + { + "type": "ListItem", + "element_id": "5fe0c7e554be4e8a7c481fda450b4891", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "super great" + }, + { + "type": "ListItem", + "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "" + }, + { + "type": "ListItem", + "element_id": "49e55c65ff5dc6e829e6ecc0c70ebadd", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "with test text" + }, + { + "type": "ListItem", + "element_id": "fb92b200afd22befce69ed16445527cf", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Bullet one" + }, + { + "type": "ListItem", + "element_id": "0428d7ddb4428600794364fabf9f4c1d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "A child bullet" + }, + { + "type": "ListItem", + "element_id": "fa1e5246d82a38dbdfc9190a05a99dfd", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "A grandchild bullet" + }, + { + "type": "ListItem", + "element_id": "87765da6ccf0668238c1d27c35692e11", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "great" + }, + { + "type": "ListItem", + "element_id": "5fe0c7e554be4e8a7c481fda450b4891", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "super great" + }, + { + "type": "ListItem", + "element_id": "8adaf826c6c8658a2bebd16af3130ebe", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Bullet two" + }, + { + "type": "NarrativeText", + "element_id": "2ee54c373bd30284b127eba65fd6b949", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "I quote myself testings Notion" + }, + { + "type": "NarrativeText", + "element_id": "20c9899ff94a00676943e33f204aa191", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2, + "link_urls": [ + "https://www.notion.so/icons/airplane_brown.svg" + ], + "link_texts": [ + "\n https://www.notion.so/icons/airplane_brown.svg\n " + ] + }, + "text": "https://www.notion.so/icons/airplane_brown.svg\n \n I call this out" + }, + { + "type": "Title", + "element_id": "9a9d50ec8631eafde241ce5b6991ebb9", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2, + "link_urls": [ + "https://www.wikipedia.org/" + ], + "link_texts": [ + "\n https://www.wikipedia.org/\n " + ] + }, + "text": "https://www.wikipedia.org/" + }, + { + "type": "Title", + "element_id": "9f92a0dd3dc5f16dc2f03319a9593bd8", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2, + "link_urls": [ + "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" + ], + "link_texts": [ + "\n https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk\n " + ] + }, + "text": "https://play-lh.googleusercontent.com/KwUBNPbMTk9jDXYS2AeX3illtVRTkrKVh5xR1Mg4WHd0CG2tV4mrh1z3kXi5z_warlk" + }, + { + "type": "Title", + "element_id": "7b7f33d452d3a0e4110676710b2fa20c", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "Child Database:" + }, + { + "type": "Title", + "element_id": "94c116ee118a72998db6cd10b586a9ef", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2, + "link_urls": [ + "https://www.notion.so/d1fad658f1cf4eedb0b5ee72b9f0b530" + ], + "link_texts": [ + "\n Analytics\n " + ] + }, + "text": "Analytics" + }, + { + "type": "Title", + "element_id": "f1807acca7417b3a1017baa0f6786223", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "Child Page:" + }, + { + "type": "Title", + "element_id": "f59ab8d1331b7b16952fbd388258f856", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2, + "link_urls": [ + "https://www.notion.so/9ba4d6da8a574cfc81ebceac1fde52bd" + ], + "link_texts": [ + "\n Untitled\n " + ] + }, + "text": "Untitled" + }, + { + "type": "NarrativeText", + "element_id": "7416417e6b88ffae5fafbcb1f29adaa8", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "This is my code caption" + }, + { + "type": "NarrativeText", + "element_id": "2263d8dd95ccfe1ad45d732c6eaaf59b", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "This is some text" + }, + { + "type": "NarrativeText", + "element_id": "13a9d985856d4b7da7ce4cd1acf2f87c", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "This is text in next column" + }, + { + "type": "Title", + "element_id": "5ebfbce3fcf25d957554aed984ab57ed", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "Final text in column" + }, + { + "type": "NarrativeText", + "element_id": "700cbb1f3f71b5a21cda3fd79a9175e7", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "Heading 1 content" + }, + { + "type": "UncategorizedText", + "element_id": "18905f6924829bb0d7ca89b5dfaa33ba", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "page_number": 2 + }, + "text": "d3d87fc6-61cc-4bb5-89ed-e9dff0df1526" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json new file mode 100644 index 000000000..e037e6cfe --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json @@ -0,0 +1,653 @@ +[ + { + "type": "UncategorizedText", + "element_id": "d6ec04f65fbb09dbefa4210ef201c9c0", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-14 - 2023-08-27" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ] + }, + "text": "Roman Isecke" + }, + { + "type": "UncategorizedText", + "element_id": "9dfe062b68f15b3623944bd8ebb71b24", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-02T20:36:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "dece647865149e5a86e06c1af7c64aa5", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-31" + }, + { + "type": "EmailAddress", + "element_id": "3357b6f2cc3b8584f1b7e66afbb46d34", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "email@custom.domaine" + }, + { + "type": "Title", + "element_id": "f6c286e4b3078307fc8ae3635b4b2f5b", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "DevOps-Bot" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "DevOps-Bot" + }, + { + "type": "Title", + "element_id": "60a33e6cf5151f2d52eddae9685cfa27", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "False" + }, + { + "type": "Title", + "element_id": "fae2db093e1dd31042e8ab9427e8673a", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Option 1" + ], + "emphasized_text_tags": [ + "span" + ] + }, + "text": "Option 1" + }, + { + "type": "UncategorizedText", + "element_id": "6b51d431df5d7f141cbececcf79edf3d", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "12" + }, + { + "type": "Title", + "element_id": "31eac5f6d8daefa258fc494a7e020bc8", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "SPRI1-2" + }, + { + "type": "Title", + "element_id": "1ff57a29d7c9d11bdf61c1b80f2b289b", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Next" + }, + { + "type": "Title", + "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "notion://sprints/sprint_task_relation" + }, + { + "type": "UncategorizedText", + "element_id": "188dc9ca72be97b25c9fff24f24ae74b", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-28 - 2023-09-10" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ] + }, + "text": "Roman Isecke" + }, + { + "type": "UncategorizedText", + "element_id": "9dfe062b68f15b3623944bd8ebb71b24", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-02T20:36:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "c9b64468a792bcbf76cdce6d7ecc3bb9", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00" + }, + { + "type": "Title", + "element_id": "982d9e3eb996f559e633f4d194def376", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "text" + }, + { + "type": "UncategorizedText", + "element_id": "f5cbeacfbddd0de7391bc723762001a6", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "/51243b9d55dc4131b2ae03eff1ef1783" + ], + "link_texts": [ + "\n link\n " + ], + "emphasized_text_contents": [ + "More", + "text", + "text", + "with", + "link" + ], + "emphasized_text_tags": [ + "span", + "span", + "span", + "span", + "span" + ] + }, + "text": "More \n \n \n \n text\n \n \n \n with \n \n \n \n link" + }, + { + "type": "Title", + "element_id": "60a33e6cf5151f2d52eddae9685cfa27", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "False" + }, + { + "type": "UncategorizedText", + "element_id": "710375baee13b41d02266bd01d5f6b34", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "45666645345465454" + }, + { + "type": "Title", + "element_id": "41a49f786d133c212cf1a35177700394", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "option 1" + }, + { + "type": "Title", + "element_id": "502bb591b927c74b9f12ef78df9d5b1b", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "SPRI1-3" + }, + { + "type": "Title", + "element_id": "18e350f89256491ebe1f8cce73a45231", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Sprint 3" + }, + { + "type": "Title", + "element_id": "61636cdef547228389f0260d1dbb952b", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Future" + }, + { + "type": "Title", + "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "notion://sprints/sprint_task_relation" + }, + { + "type": "UncategorizedText", + "element_id": "a30a043314fa89294fa2c1c989a01fbb", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "0.25" + }, + { + "type": "UncategorizedText", + "element_id": "3dee6959f0ef3a4e6147de48fc70a814", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-07-31 - 2023-08-13" + }, + { + "type": "Title", + "element_id": "548b1cea7491191a12465d055db621f4", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100" + ], + "link_texts": [ + "\n Roman Isecke\n " + ] + }, + "text": "Roman Isecke" + }, + { + "type": "UncategorizedText", + "element_id": "9dfe062b68f15b3623944bd8ebb71b24", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-02T20:36:00.000Z" + }, + { + "type": "UncategorizedText", + "element_id": "7e4059ebb0ebf24caae1f12cb79b8c9c", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "2023-08-07" + }, + { + "type": "EmailAddress", + "element_id": "1ae8f7599f4f616683d2a69d29658afa", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "roman@unstructured.io" + }, + { + "type": "UncategorizedText", + "element_id": "5d2e9bcd00123dd21fc54731fef97129", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "https://lh3.googleusercontent.com/a/AAcHTtf2bisNRhNNHsC5OPcmNuCkfjmi4nmdyZxgYv27=s100", + "https://lh3.googleusercontent.com/a/AAcHTtdiriiUNnUcm1dkAp7cbmmQyeO-acsViQHFS9v0=s100" + ], + "link_texts": [ + "\n Roman Isecke\n ", + "\n Jason Scheirer\n " + ], + "emphasized_text_contents": [ + "Roman Isecke", + "Jason Scheirer" + ], + "emphasized_text_tags": [ + "span", + "span" + ] + }, + "text": "Roman Isecke\n \n \n \n \n Jason Scheirer" + }, + { + "type": "NarrativeText", + "element_id": "495e614a2084bd7c40e34b0b69534e67", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "This is some", + "formatted", + "formatted", + "text" + ], + "emphasized_text_tags": [ + "span", + "span", + "b", + "span" + ] + }, + "text": "This is some \n \n \n \n formatted\n \n \n \n text" + }, + { + "type": "Title", + "element_id": "3cbc87c7681f34db4617feaa2c880193", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "True" + }, + { + "type": "UncategorizedText", + "element_id": "4b2e896ce5416db25c44f6918648d0f4", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "emphasized_text_contents": [ + "Option 2", + "Option 1" + ], + "emphasized_text_tags": [ + "span", + "span" + ] + }, + "text": "Option 2\n \n \n Option 1" + }, + { + "type": "UncategorizedText", + "element_id": "e29c9c180c6279b0b02abd6a1801c7c0", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "32" + }, + { + "type": "UncategorizedText", + "element_id": "03ac674216f3e15c761ee1a5e255f067", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "1234" + }, + { + "type": "Title", + "element_id": "209ef9fc4dfe2166bcf2460b80334276", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "option 2" + }, + { + "type": "Title", + "element_id": "3fafa60b6782f5d52caf7be755d82232", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "SPRI1-1" + }, + { + "type": "Title", + "element_id": "f931bdb912a40a788890924578a0cff7", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Sprint 1" + }, + { + "type": "Title", + "element_id": "e0d1b68224bf0b31ef16b206c65b5f8f", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Current" + }, + { + "type": "Title", + "element_id": "dfcd7f16dd2d92ee4ec22516fb45abd6", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "notion://sprints/sprint_task_relation" + }, + { + "type": "Title", + "element_id": "191347bfe55d0ca9a574db77bc864827", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "link_urls": [ + "www.google.com" + ], + "link_texts": [ + "\n www.google.com\n " + ] + }, + "text": "www.google.com" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json b/test_unstructured_ingest/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json new file mode 100644 index 000000000..2c2856213 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/dfcbe584-30b1-4551-b533-e6a5759af842.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "b2c1cf36a9b45cdefac07d1899b96ff1", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Corporate Travel" + }, + { + "type": "NarrativeText", + "element_id": "ea6b271473e6accb79547f998e68f3d2", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: A corporate travel policy is crucial for controlling costs, ensuring compliance, and guaranteeing the safety of employees when traveling for the company." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json b/test_unstructured_ingest/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json new file mode 100644 index 000000000..ead749e93 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/feccfcd4-8ca0-4638-8212-1a5726461029.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "8bcdb5d9bc2bda33af04bae4495f5e37", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Benefits Policies" + }, + { + "type": "NarrativeText", + "element_id": "eef6bb1dab52f5a27ddff24998d3b614", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Benefits policies can attract and retain employees, promote well-being, create positive culture, differentiate from competitors, and increase morale and satisfaction." + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json b/test_unstructured_ingest/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json new file mode 100644 index 000000000..57fd1c153 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/fee2149e-6240-4431-8e98-a04a2e460a66.json @@ -0,0 +1,15 @@ +[ + { + "type": "Title", + "element_id": "f931bdb912a40a788890924578a0cff7", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Sprint 1" + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh new file mode 100755 index 000000000..4a13ab28f --- /dev/null +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -e + +SCRIPT_DIR=$(dirname "$(realpath "$0")") +cd "$SCRIPT_DIR"/.. || exit 1 +OUTPUT_FOLDER_NAME=notion +OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME + +if [ -z "$NOTION_API_KEY" ]; then + echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set." + exit 0 +fi + +PYTHONPATH=. ./unstructured/ingest/main.py \ + notion \ + --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed \ + --download-dir "$DOWNLOAD_DIR" \ + --api-key "$NOTION_API_KEY" \ + --structured-output-dir "$OUTPUT_DIR" \ + --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ + --num-processes 2 \ + --recursive \ + --verbose + + +sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index a7ed550db..a12c57d23 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -33,6 +33,7 @@ export OMP_THREAD_LIMIT=1 ./test_unstructured_ingest/test-ingest-local-single-file.sh ./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh ./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +./test_unstructured_ingest/test-ingest-notion.sh # NOTE(yuming): The following test should be put after any tests with --preserve-downloads option ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh ./test_unstructured_ingest/test-ingest-sharepoint.sh \ No newline at end of file diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py index 636fdf4fd..9b5a7b8b4 100644 --- a/unstructured/ingest/connector/discord.py +++ b/unstructured/ingest/connector/discord.py @@ -107,6 +107,7 @@ class DiscordIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): bot.run(self.token) + self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) with open(self._tmp_download_file(), "w") as f: for m in messages: f.write(m.content + "\n") @@ -131,7 +132,7 @@ class DiscordConnector(ConnectorCleanupMixin, BaseConnector): def initialize(self): """Verify that can get metadata for an object, validates connections info.""" - os.mkdir(self.standard_config.download_dir) + pass def get_ingest_docs(self): return [ diff --git a/unstructured/ingest/connector/notion/client.py b/unstructured/ingest/connector/notion/client.py index 1b8fcba96..138b516be 100644 --- a/unstructured/ingest/connector/notion/client.py +++ b/unstructured/ingest/connector/notion/client.py @@ -1,5 +1,6 @@ from typing import Any, Generator, List, Tuple +import httpx from notion_client import Client as NotionClient from notion_client.api_endpoints import ( BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint, @@ -7,6 +8,7 @@ from notion_client.api_endpoints import ( from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint +from notion_client.errors import RequestTimeoutError from unstructured.ingest.connector.notion.types.block import Block from unstructured.ingest.connector.notion.types.database import Database @@ -42,6 +44,18 @@ class DatabasesEndpoint(NotionDatabasesEndpoint): resp: dict = super().retrieve(database_id=database_id, **kwargs) # type: ignore return Database.from_dict(data=resp) + def retrieve_status(self, database_id: str, **kwargs) -> int: + request = self.parent._build_request( + method="HEAD", + path=f"databases/{database_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = self.parent.client.send(request) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. @@ -81,6 +95,18 @@ class PagesEndpoint(NotionPagesEndpoint): resp: dict = super().retrieve(page_id=page_id, **kwargs) # type: ignore return Page.from_dict(data=resp) + def retrieve_status(self, page_id: str, **kwargs) -> int: + request = self.parent._build_request( + method="HEAD", + path=f"pages/{page_id}", + auth=kwargs.get("auth"), + ) + try: + response: httpx.Response = self.parent.client.send(request) # type: ignore + return response.status_code + except httpx.TimeoutException: + raise RequestTimeoutError() + class Client(NotionClient): def __init__(self, *args: Any, **kwargs: Any) -> None: diff --git a/unstructured/ingest/connector/notion/connector.py b/unstructured/ingest/connector/notion/connector.py index dcc571c96..8c8f33aa7 100644 --- a/unstructured/ingest/connector/notion/connector.py +++ b/unstructured/ingest/connector/notion/connector.py @@ -289,6 +289,7 @@ class NotionConnector(ConnectorCleanupMixin, BaseConnector): config=config, ) + @requires_dependencies(dependencies=["notion_client"]) def initialize(self): """Verify that can get metadata for an object, validates connections info.""" pass @@ -302,6 +303,13 @@ class NotionConnector(ConnectorCleanupMixin, BaseConnector): client = NotionClient(auth=self.config.api_key, logger=self.config.get_logger()) + # sanity check that database id is valid + resp_code = client.pages.retrieve_status(page_id=page_id) + if resp_code != 200: + raise ValueError( + f"page associated with page id could not be found: {page_id}", + ) + child_content = get_recursive_content_from_page( client=client, page_id=page_id, @@ -333,6 +341,13 @@ class NotionConnector(ConnectorCleanupMixin, BaseConnector): client = NotionClient(auth=self.config.api_key, logger=self.config.get_logger()) + # sanity check that database id is valid + resp_code = client.databases.retrieve_status(database_id=database_id) + if resp_code != 200: + raise ValueError( + f"database associated with database id could not be found: {database_id}", + ) + child_content = get_recursive_content_from_database( client=client, database_id=database_id, @@ -363,6 +378,7 @@ class NotionConnector(ConnectorCleanupMixin, BaseConnector): for database_id in self.config.database_ids ] if self.config.recursive: + self.config.get_logger().info("Getting recursive content") child_pages = [] child_databases = [] for page_id in self.config.page_ids: diff --git a/unstructured/ingest/connector/notion/helpers.py b/unstructured/ingest/connector/notion/helpers.py index 5bc1c849e..91ffe2f28 100644 --- a/unstructured/ingest/connector/notion/helpers.py +++ b/unstructured/ingest/connector/notion/helpers.py @@ -20,6 +20,7 @@ from htmlBuilder.tags import ( Tr, Ul, ) +from notion_client.errors import APIResponseError import unstructured.ingest.connector.notion.types.blocks as notion_blocks from unstructured.ingest.connector.notion.client import Client @@ -160,9 +161,9 @@ def extract_database_html( logger.debug(f"Creating {len(all_pages)} rows") for page in all_pages: - if is_database_url(page.url): + if is_database_url(client=client, url=page.url): child_databases.append(page.id) - if is_page_url(page.url): + if is_page_url(client=client, url=page.url): child_pages.append(page.id) properties = page.properties inner_html = [properties.get(k).get_html() for k in property_keys] # type: ignore @@ -229,90 +230,138 @@ def get_recursive_content( logger: logging.Logger, ) -> ChildExtractionResponse: parents: List[QueueEntry] = [init_entry] - child_pages = [] - child_dbs = [] - processed = [] + child_pages: List[str] = [] + child_dbs: List[str] = [] + processed: List[str] = [] while len(parents) > 0: parent: QueueEntry = parents.pop() - processed.append(parent.id) + processed.append(str(parent.id)) if parent.type == QueueEntryType.PAGE: logger.debug(f"Getting child data from page: {parent.id}") - for children in client.blocks.children.iterate_list( # type: ignore - block_id=str(parent.id), - ): - child_pages_from_page = [ - c for c in children if isinstance(c.block, notion_blocks.ChildPage) - ] - if child_pages_from_page: - child_page_blocks: List[notion_blocks.ChildPage] = [ - p.block - for p in child_pages_from_page - if isinstance(p.block, notion_blocks.ChildPage) - ] - logger.debug( - "found child pages from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_page_blocks]), - ), - ) - new_pages = [p.id for p in child_pages_from_page if p.id not in processed] - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) + page_children = [] + try: + for children_block in client.blocks.children.iterate_list( # type: ignore + block_id=str(parent.id), + ): + page_children.extend(children_block) + except APIResponseError as api_error: + logger.error(f"failed to get page with id {parent.id}: {api_error}") + if str(parent.id) in child_pages: + child_pages.remove(str(parent.id)) + continue + if not page_children: + continue - child_dbs_from_page = [ - c for c in children if isinstance(c.block, notion_blocks.ChildDatabase) + # Extract child pages + child_pages_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildPage) + ] + if child_pages_from_page: + child_page_blocks: List[notion_blocks.ChildPage] = [ + p.block + for p in child_pages_from_page + if isinstance(p.block, notion_blocks.ChildPage) ] - if child_dbs_from_page: - child_db_blocks: List[notion_blocks.ChildDatabase] = [ - c.block - for c in children - if isinstance(c.block, notion_blocks.ChildDatabase) - ] - logger.debug( - "found child database from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_db_blocks]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + logger.debug( + "found child pages from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_page_blocks]), + ), ) + new_pages = [p.id for p in child_pages_from_page if p.id not in processed] + new_pages = list(set(new_pages)) + child_pages.extend(new_pages) + parents.extend( + [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], + ) + + # Extract child databases + child_dbs_from_page = [ + c for c in page_children if isinstance(c.block, notion_blocks.ChildDatabase) + ] + if child_dbs_from_page: + child_db_blocks: List[notion_blocks.ChildDatabase] = [ + c.block + for c in page_children + if isinstance(c.block, notion_blocks.ChildDatabase) + ] + logger.debug( + "found child database from parent page {}: {}".format( + parent.id, + ", ".join([block.title for block in child_db_blocks]), + ), + ) + new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] + new_dbs = list(set(new_dbs)) + child_dbs.extend(new_dbs) + parents.extend( + [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + ) + + linked_to_others: List[notion_blocks.LinkToPage] = [ + c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage) + ] + for link in linked_to_others: + if (page_id := link.page_id) and ( + page_id not in processed and page_id not in child_pages + ): + child_pages.append(page_id) + parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))) + if (database_id := link.database_id) and ( + database_id not in processed and database_id not in child_dbs + ): + child_dbs.append(database_id) + parents.append( + QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), + ) + elif parent.type == QueueEntryType.DATABASE: logger.debug(f"Getting child data from database: {parent.id}") - for page_entries in client.databases.iterate_query( # type: ignore - database_id=str(parent.id), - ): - child_pages_from_db = [p for p in page_entries if is_page_url(p.url)] - if child_pages_from_db: - logger.debug( - "found child pages from parent database {}: {}".format( - parent.id, - ", ".join([p.url for p in child_pages_from_db]), - ), - ) - new_pages = [p.id for p in child_pages_from_db if p.id not in processed] - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) + database_pages = [] + try: + for page_entries in client.databases.iterate_query( # type: ignore + database_id=str(parent.id), + ): + database_pages.extend(page_entries) + except APIResponseError as api_error: + logger.error(f"failed to get database with id {parent.id}: {api_error}") + if str(parent.id) in child_dbs: + child_dbs.remove(str(parent.id)) + continue + if not database_pages: + continue - child_dbs_from_db = [p for p in page_entries if is_database_url(p.url)] - if child_dbs_from_db: - logger.debug( - "found child database from parent database {}: {}".format( - parent.id, - ", ".join([db.url for db in child_dbs_from_db]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_db if db.id not in processed] - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + child_pages_from_db = [ + p for p in database_pages if is_page_url(client=client, url=p.url) + ] + if child_pages_from_db: + logger.debug( + "found child pages from parent database {}: {}".format( + parent.id, + ", ".join([p.url for p in child_pages_from_db]), + ), ) + new_pages = [p.id for p in child_pages_from_db if p.id not in processed] + child_pages.extend(new_pages) + parents.extend( + [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], + ) + + child_dbs_from_db = [ + p for p in database_pages if is_database_url(client=client, url=p.url) + ] + if child_dbs_from_db: + logger.debug( + "found child database from parent database {}: {}".format( + parent.id, + ", ".join([db.url for db in child_dbs_from_db]), + ), + ) + new_dbs = [db.id for db in child_dbs_from_db if db.id not in processed] + child_dbs.extend(new_dbs) + parents.extend( + [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], + ) return ChildExtractionResponse( child_pages=child_pages, @@ -328,25 +377,39 @@ def is_valid_uuid(uuid_str: str) -> bool: return False -def is_page_url(url: str): +def get_uuid_from_url(path: str) -> Optional[str]: + strings = path.split("-") + if len(strings) > 0 and is_valid_uuid(strings[-1]): + return strings[-1] + return None + + +def is_page_url(client: Client, url: str): parsed_url = urlparse(url) path = parsed_url.path.split("/")[-1] if parsed_url.netloc != "www.notion.so": return False - if is_valid_uuid(path): + page_uuid = get_uuid_from_url(path=path) + if not page_uuid: return False - strings = path.split("-") - if len(strings) > 0 and is_valid_uuid(strings[-1]): + check_resp = client.pages.retrieve_status(page_id=page_uuid) + if check_resp == 200: return True return False -def is_database_url(url: str): +def is_database_url(client: Client, url: str): parsed_url = urlparse(url) path = parsed_url.path.split("/")[-1] if parsed_url.netloc != "www.notion.so": return False - return is_valid_uuid(path) + database_uuid = get_uuid_from_url(path=path) + if not database_uuid: + return False + check_resp = client.databases.retrieve_status(database_id=database_uuid) + if check_resp == 200: + return True + return False @dataclass diff --git a/unstructured/ingest/connector/notion/types/block.py b/unstructured/ingest/connector/notion/types/block.py index b5d36e318..e0059fdd7 100644 --- a/unstructured/ingest/connector/notion/types/block.py +++ b/unstructured/ingest/connector/notion/types/block.py @@ -32,6 +32,7 @@ block_type_mapping = { "file": blocks.File, "image": blocks.Image, "link_preview": blocks.LinkPreview, + "link_to_page": blocks.LinkToPage, "numbered_list_item": blocks.NumberedListItem, "paragraph": blocks.Paragraph, "pdf": blocks.PDF, diff --git a/unstructured/ingest/connector/notion/types/blocks/__init__.py b/unstructured/ingest/connector/notion/types/blocks/__init__.py index 14e046778..5cd158bc8 100644 --- a/unstructured/ingest/connector/notion/types/blocks/__init__.py +++ b/unstructured/ingest/connector/notion/types/blocks/__init__.py @@ -13,6 +13,7 @@ from .file import File from .heading import Heading from .image import Image from .link_preview import LinkPreview +from .link_to_page import LinkToPage from .numbered_list import NumberedListItem from .paragraph import Paragraph from .pdf import PDF @@ -43,6 +44,7 @@ __all__ = [ "Heading", "Image", "LinkPreview", + "LinkToPage", "NumberedListItem", "Paragraph", "PDF", diff --git a/unstructured/ingest/connector/notion/types/blocks/link_to_page.py b/unstructured/ingest/connector/notion/types/blocks/link_to_page.py new file mode 100644 index 000000000..ed9156d26 --- /dev/null +++ b/unstructured/ingest/connector/notion/types/blocks/link_to_page.py @@ -0,0 +1,29 @@ +# https://developers.notion.com/reference/block#link-to-page +from dataclasses import dataclass +from typing import Optional + +from htmlBuilder.tags import Div, HtmlTag + +from unstructured.ingest.connector.notion.interfaces import BlockBase + + +@dataclass +class LinkToPage(BlockBase): + type: str + page_id: Optional[str] = None + database_id: Optional[str] = None + + @staticmethod + def can_have_children() -> bool: + return False + + @classmethod + def from_dict(cls, data: dict): + return cls(**data) + + def get_html(self) -> Optional[HtmlTag]: + if page_id := self.page_id: + return Div([], page_id) + if database_id := self.database_id: + return Div([], database_id) + return None diff --git a/unstructured/ingest/runner/airtable.py b/unstructured/ingest/runner/airtable.py index f1536f483..bf1d95662 100644 --- a/unstructured/ingest/runner/airtable.py +++ b/unstructured/ingest/runner/airtable.py @@ -22,6 +22,7 @@ def airtable( personal_access_token.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="airtable", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/azure.py b/unstructured/ingest/runner/azure.py index 93bb7968b..d808c3943 100644 --- a/unstructured/ingest/runner/azure.py +++ b/unstructured/ingest/runner/azure.py @@ -26,6 +26,7 @@ def azure( ) connector_config.download_dir = update_download_dir_remote_url( + connector_name="azure", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/biomed.py b/unstructured/ingest/runner/biomed.py index ca92e9215..f38af5015 100644 --- a/unstructured/ingest/runner/biomed.py +++ b/unstructured/ingest/runner/biomed.py @@ -37,6 +37,7 @@ def biomed( ) connector_config.download_dir = update_download_dir_hash( + connector_name="biomed", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/box.py b/unstructured/ingest/runner/box.py index 0b9961ba1..f313b7478 100644 --- a/unstructured/ingest/runner/box.py +++ b/unstructured/ingest/runner/box.py @@ -19,6 +19,7 @@ def box( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) connector_config.download_dir = update_download_dir_remote_url( + connector_name="box", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/confluence.py b/unstructured/ingest/runner/confluence.py index 366ecaf15..c4fba39fc 100644 --- a/unstructured/ingest/runner/confluence.py +++ b/unstructured/ingest/runner/confluence.py @@ -26,6 +26,7 @@ def confluence( url.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="confluence", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/discord.py b/unstructured/ingest/runner/discord.py index f94546aa8..e116bd55d 100644 --- a/unstructured/ingest/runner/discord.py +++ b/unstructured/ingest/runner/discord.py @@ -23,6 +23,7 @@ def discord( channels.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="discord", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/dropbox.py b/unstructured/ingest/runner/dropbox.py index 5e7aeee56..fb7e956c1 100644 --- a/unstructured/ingest/runner/dropbox.py +++ b/unstructured/ingest/runner/dropbox.py @@ -19,6 +19,7 @@ def dropbox( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) connector_config.download_dir = update_download_dir_remote_url( + connector_name="dropbox", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/elasticsearch.py b/unstructured/ingest/runner/elasticsearch.py index f6b066b28..537514814 100644 --- a/unstructured/ingest/runner/elasticsearch.py +++ b/unstructured/ingest/runner/elasticsearch.py @@ -25,6 +25,7 @@ def elasticsearch( ), ) connector_config.download_dir = update_download_dir_hash( + connector_name="elasticsearch", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/fsspec.py b/unstructured/ingest/runner/fsspec.py index a82b10b94..3da8f4346 100644 --- a/unstructured/ingest/runner/fsspec.py +++ b/unstructured/ingest/runner/fsspec.py @@ -19,6 +19,7 @@ def fsspec( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) connector_config.download_dir = update_download_dir_remote_url( + connector_name="fsspec", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/gcs.py b/unstructured/ingest/runner/gcs.py index d309d2364..4b330da7d 100644 --- a/unstructured/ingest/runner/gcs.py +++ b/unstructured/ingest/runner/gcs.py @@ -19,6 +19,7 @@ def gcs( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) connector_config.download_dir = update_download_dir_remote_url( + connector_name="gcs", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/github.py b/unstructured/ingest/runner/github.py index 46d8bc829..aceb7a4e2 100644 --- a/unstructured/ingest/runner/github.py +++ b/unstructured/ingest/runner/github.py @@ -26,6 +26,7 @@ def github( ), ) connector_config.download_dir = update_download_dir_hash( + connector_name="github", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/gitlab.py b/unstructured/ingest/runner/gitlab.py index a75bb0df7..d883dbc11 100644 --- a/unstructured/ingest/runner/gitlab.py +++ b/unstructured/ingest/runner/gitlab.py @@ -26,6 +26,7 @@ def gitlab( ), ) connector_config.download_dir = update_download_dir_hash( + connector_name="gitlab", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/google_drive.py b/unstructured/ingest/runner/google_drive.py index ed1863b2c..4bb420fbd 100644 --- a/unstructured/ingest/runner/google_drive.py +++ b/unstructured/ingest/runner/google_drive.py @@ -24,6 +24,7 @@ def gdrive( drive_id.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="gdrive", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/notion.py b/unstructured/ingest/runner/notion.py index 00f5c03bf..837efe51a 100644 --- a/unstructured/ingest/runner/notion.py +++ b/unstructured/ingest/runner/notion.py @@ -37,6 +37,7 @@ def notion( else: raise ValueError("could not create local cache directory name") connector_config.download_dir = update_download_dir_hash( + connector_name="notion", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/onedrive.py b/unstructured/ingest/runner/onedrive.py index 4bc6cf97d..c2e1584af 100644 --- a/unstructured/ingest/runner/onedrive.py +++ b/unstructured/ingest/runner/onedrive.py @@ -27,6 +27,7 @@ def onedrive( f"{tenant}_{user_pname}".encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="onedrive", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/outlook.py b/unstructured/ingest/runner/outlook.py index 9afb65ab5..09180faa6 100644 --- a/unstructured/ingest/runner/outlook.py +++ b/unstructured/ingest/runner/outlook.py @@ -25,6 +25,7 @@ def outlook( hashed_dir_name = hashlib.sha256(user_email.encode("utf-8")) connector_config.download_dir = update_download_dir_hash( + connector_name="outlook", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/reddit.py b/unstructured/ingest/runner/reddit.py index 1ea52207b..a4e48f4be 100644 --- a/unstructured/ingest/runner/reddit.py +++ b/unstructured/ingest/runner/reddit.py @@ -26,6 +26,7 @@ def reddit( subreddit_name.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="reddit", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/s3.py b/unstructured/ingest/runner/s3.py index f07a7abe5..ef3628d9d 100644 --- a/unstructured/ingest/runner/s3.py +++ b/unstructured/ingest/runner/s3.py @@ -18,6 +18,7 @@ def s3( ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) connector_config.download_dir = update_download_dir_remote_url( + connector_name="s3", connector_config=connector_config, remote_url=remote_url, logger=logger, diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py index 6eb7c8ed3..839119e1d 100644 --- a/unstructured/ingest/runner/sharepoint.py +++ b/unstructured/ingest/runner/sharepoint.py @@ -25,6 +25,7 @@ def sharepoint( f"{site}_{path}".encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="sharepoint", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/slack.py b/unstructured/ingest/runner/slack.py index 0607eb0a4..c2b16b963 100644 --- a/unstructured/ingest/runner/slack.py +++ b/unstructured/ingest/runner/slack.py @@ -24,6 +24,7 @@ def slack( channels.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="slack", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, diff --git a/unstructured/ingest/runner/utils.py b/unstructured/ingest/runner/utils.py index 9e59b388c..1e14bcbbf 100644 --- a/unstructured/ingest/runner/utils.py +++ b/unstructured/ingest/runner/utils.py @@ -10,12 +10,14 @@ from unstructured.ingest.interfaces import ( def update_download_dir_remote_url( + connector_name: str, connector_config: StandardConnectorConfig, remote_url: str, logger: logging.Logger, ) -> str: hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8")) return update_download_dir_hash( + connector_name=connector_name, connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger, @@ -23,6 +25,7 @@ def update_download_dir_remote_url( def update_download_dir_hash( + connector_name: str, connector_config: StandardConnectorConfig, hashed_dir_name: hashlib._Hash, logger: logging.Logger, @@ -32,7 +35,7 @@ def update_download_dir_hash( cache_path = Path.home() / ".cache" / "unstructured" / "ingest" if not cache_path.exists(): cache_path.mkdir(parents=True, exist_ok=True) - download_dir = cache_path / hashed_dir_name.hexdigest()[:10] + download_dir = cache_path / connector_name / hashed_dir_name.hexdigest()[:10] if connector_config.preserve_downloads: logger.warning( f"Preserving downloaded files but download_dir is not specified," diff --git a/unstructured/ingest/runner/wikipedia.py b/unstructured/ingest/runner/wikipedia.py index f5fce8314..1bff507ea 100644 --- a/unstructured/ingest/runner/wikipedia.py +++ b/unstructured/ingest/runner/wikipedia.py @@ -21,6 +21,7 @@ def wikipedia( page_title.encode("utf-8"), ) connector_config.download_dir = update_download_dir_hash( + connector_name="wikipedia", connector_config=connector_config, hashed_dir_name=hashed_dir_name, logger=logger,