diff --git a/CHANGELOG.md b/CHANGELOG.md index 97965f180..70aa21e17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.26-dev1 + +### Enhancements + +- **Add support for images in html partitioner** `` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload`=True then the `image_base64` will be included for images that specify the base64 data (rather than url) as the source. + +### Features + +### Fixes + ## 0.16.25 ### Enhancements diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py index 0ca830eec..4df535319 100644 --- a/test_unstructured/partition/html/test_partition.py +++ b/test_unstructured/partition/html/test_partition.py @@ -6,7 +6,7 @@ from __future__ import annotations import io import pathlib -from typing import Any +from typing import Any, Optional import pytest from lxml import etree @@ -24,6 +24,7 @@ from unstructured.cleaners.core import clean_extra_whitespace from unstructured.documents.elements import ( Address, CompositeElement, + ElementType, ListItem, NarrativeText, Table, @@ -296,6 +297,68 @@ def test_it_does_not_extract_text_in_style_tags(): assert element.text == "Lorem ipsum dolor" +# -- image parsing behaviors --------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("extract_to_payload", "extract_types", "expect_base64"), + [ + (True, ["Image"], True), + (True, [], False), + (True, None, False), + (False, ["Image"], False), + ], +) +def test_partition_html_base64_for_images( + opts_args: dict[str, Any], + extract_to_payload: bool, + extract_types: Optional[list[str]], + expect_base64: bool, +): + base64 = ( + "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/" + "w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" + ) + src = "data:image/png;base64," + base64 + alt_text = "Base64 Image" + + html = f""" +
+ {alt_text} +
+ """ + opts_args["text"] = html + opts_args["extract_image_block_to_payload"] = extract_to_payload + opts_args["extract_image_block_types"] = extract_types + opts = HtmlPartitionerOptions(**opts_args) + (element,) = list(_HtmlPartitioner.iter_elements(opts)) + + assert element.category == ElementType.IMAGE + assert element.text == alt_text + assert element.metadata.image_mime_type == "image/png" + if expect_base64: + assert element.metadata.image_base64 == base64 + else: + assert element.metadata.image_base64 is None + + +def test_partition_html_includes_url_for_images(): + url = "https://example.com/image.png" + alt_text = "URL Image" + # language=HTML + html = f""" +
+ {alt_text} +
+ """ + (image,) = partition_html( + text=html, + ) + assert image.category == ElementType.IMAGE + assert image.text == alt_text + assert image.metadata.url == url + + # -- table parsing behaviors --------------------------------------------------------------------- diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 1179600aa..07a6ae73a 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -210,7 +210,7 @@ def test_auto_partition_epub_from_filename(): elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 - assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports") def test_auto_partition_epub_from_file(): @@ -218,7 +218,7 @@ def test_auto_partition_epub_from_file(): elements = partition(file=f, strategy=PartitionStrategy.HI_RES) assert len(elements) > 0 - assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports") # ================================================================================================ @@ -430,7 +430,7 @@ def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path): def test_partition_md_from_url_works_with_embedded_html(): url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md" elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES) - assert "unstructured" in elements[0].text + assert "unstructured" in elements[1].text # ================================================================================================ diff --git a/test_unstructured/partition/test_epub.py b/test_unstructured/partition/test_epub.py index f490c9382..46748fb74 100644 --- 
a/test_unstructured/partition/test_epub.py +++ b/test_unstructured/partition/test_epub.py @@ -14,14 +14,14 @@ def test_partition_epub_from_filename(): assert len(elements) > 0 assert isinstance(elements[0], Text) - assert elements[0].text.startswith("a shared culture") + assert elements[1].text.startswith("a shared culture") if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: assert {element.metadata.detection_origin for element in elements} == {"epub"} def test_partition_epub_from_filename_returns_table_in_elements(): elements = partition_epub(example_doc_path("winter-sports.epub")) - assert elements[10] == Table( + assert elements[12] == Table( "Contents. List of Illustrations (In certain versions of this etext [in certain\nbrowsers]" " clicking on the image will bring up a larger\nversion.) (etext transcriber's note)" ) @@ -32,7 +32,7 @@ def test_partition_epub_from_file(): elements = partition_epub(file=f) assert len(elements) > 0 - assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports") # -- .metadata.filename -------------------------------------------------------------------------- diff --git a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/1605956.html b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/1605956.html index 386016fb7..ff40909ea 100644 --- a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/1605956.html +++ b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/1605956.html @@ -13,89 +13,98 @@

The overview is the first page visitors will see when they visit your space, so it helps to include some information on what the space is about and what your team is working on.

-

+ +

Add a header image. This gives your overview visual appeal and makes it welcoming for visitors.

-

+

Explain what the space is for. Start by summarizing the purpose of the space. This could be your team's mission statement or a brief description of the kind of work you do.

-

+

Share team goals. Add links to your team's OKRs, project plans, and product roadmaps so visitors can quickly get a sense of your team's goals.

-

+

Tell people how to contact you. Share your timezone and links to Slack channels, email aliases, or other contact details your team uses so visitors can contact you with questions or feedback about your team's work.

-

+

Use shortcuts for easy access

-

+

Shortcuts are helpful for important pages that members of a space might need to get to often. These shortcuts are added and organized by the space administrator. Space admins can link to pages in the space, other related spaces, or relevant external web content as well as reorder the shortcuts as needed.

-

+ +

πŸ’­Start discussions with inline comments

-

+

Thoughtful responses can get lost and lose context as email replies pile up. And if you neglect to copy someone or want to add them later on, it's difficult for them to get up to speed. Inline comments allow anyone (or everyone) to huddle around an idea while referencing key information on the project page.

-

+

To leave an inline comment, highlight text on the page and the comment icon will appear.

-

+ +

Team members with permission to access the page can respond to any comment. Plus, when a comment thread comes to its natural conclusion, comments can be resolved and cleared away.

-

+

πŸ‘‹Loop in team members with @mentions

-

+

@mentions on Confluence function like @mentions on social media platforms like Twitter, Instagram, and Slack. Type the @ symbol on a Confluence page or in a comment, begin spelling a team member's first name, and a list will appear. Select the individual to ask a question or assign a task.

-

+ +

πŸ‘Endorse ideas with reactions

-

+

Use reactions when you want to support a comment or acknowledge you've seen one without clogging up the thread with another comment.

-

+

You can also use reactions on a page or blog post. The author of the content will be notified, and if enough team members react or add comments to the content, it'll be surfaced on Confluence home feed

-

+ +

Take your Confluence space to the next level

-

+

Extend the capabilities of your Confluence pages by adding extra functionality or including dynamic content.

-

+

To add functionality:

-

+

Type ' / ' to open the list of items available to use

-

+

Find the item to be inserted and select it

-

+

Select Insert

-

+ +

Useful elements for Team space

-

+

Introduce the team

-

+

Add user profiles to display a short summary of a given Confluence user's profile with their role, profile photo and contact details.

-

+ + +

Share news and announcements with your team

-

+

Display a stream of latest blog posts so your team can easily see what's been going on.

-

+

Display a list of important pages

-

+

Paste in page URLs to create smart links, or use the content report table to create a list of all the pages in the space.

+ diff --git a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/229477.html b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/229477.html index 6b5a8313e..372485d9c 100644 --- a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/229477.html +++ b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/MFS/229477.html @@ -46,85 +46,91 @@

Add team members to your space.

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+

Contact us

-

+

How can someone reach out to your team?

-
+
team@email.com
-

+

Tickets

-

+

Jira board

-

+

#channel

-

+

Important Pages

-

+

List them here

-

+ +

Onboarding FAQs

-

+

Add resources for new hires

-

+ +

Meeting notes

-

+

Add links to meeting notes

-

+ +

Team goals

-

+

List them here

-

+

Team news

-

+

Create a blog post to share team news. It will automatically appear here once it's published.

-

+

Blog stream

-

+

Create a blog post to share news and announcements with your team and company.

diff --git a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605859.html b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605859.html index 567ec6950..38cf056aa 100644 --- a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605859.html +++ b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605859.html @@ -46,85 +46,91 @@

Add team members to your space.

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+ +

Team member

-

+

Role

-

+

Responsibility

-

+

Contact us

-

+

How can someone reach out to your team?

-
+
team@email.com
-

+

Tickets

-

+

Jira board

-

+

#channel

-

+

Important Pages

-

+

List them here

-

+ +

Onboarding FAQs

-

+

Add resources for new hires

-

+ +

Meeting notes

-

+

Add links to meeting notes

-

+ +

Team goals

-

+

List them here

-

+

Team news

-

+

Create a blog post to share team news. It will automatically appear here once it's published.

-

+

Blog stream

-

+

Create a blog post to share news and announcements with your team and company.

diff --git a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605989.html b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605989.html index e54d3c049..0f75ae86e 100644 --- a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605989.html +++ b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1605989.html @@ -49,64 +49,65 @@

testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3

-

+ +

Testdoc3 List Item 1

-

+

Testdoc3 List Item 1 Nested Item A

-

+

Testdoc3 List Item 1 Nested Item B

-

+

Testdoc3 List Item 2

-

+

Testdoc3 List Item 3

-

+

Testdoc3 List Item 4

-

+

Testdoc3 List Item 5

-

+

This is the link for unstructured . io.

-
  • +
  • Testdoc3 Checklist Item 1
  • -
  • +
  • Testdoc3 Checklist Item 2 (checked)
  • -
  • +
  • Testdoc3 Checklist Item 3
  • -

    +

    πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ

    -

    +

    Testdoc3 bold text

    -

    +

    Testdoc3 italic text

    -

    +

    Testdoc3 Heading 1 Sized Text

    -

    +

    Testdoc3 Heading 2 Sized Text

    -

    +

    Testdoc3 Heading 3 Sized Text

    -

    +

    Testdoc3 Heading 4 Sized Text

    -

    +

    Testdoc3 Heading 5 Sized Text

    - +
    Testdoc3 Table: Column 1 Row 0 @@ -141,5 +142,6 @@
    + diff --git a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1802252.html b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1802252.html index f27107467..bd14a6902 100644 --- a/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1802252.html +++ b/test_unstructured_ingest/expected-structured-output-html/confluence-diff/testteamsp/1802252.html @@ -49,64 +49,65 @@

    testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2

    -

    + +

    Testdoc2 List Item 1

    -

    +

    Testdoc2 List Item 1 Nested Item A

    -

    +

    Testdoc2 List Item 1 Nested Item B

    -

    +

    Testdoc2 List Item 2

    -

    +

    Testdoc2 List Item 3

    -

    +

    Testdoc2 List Item 4

    -

    +

    Testdoc2 List Item 5

    -

    +

    This is the link for unstructured . io.

    -
  • +
  • Testdoc2 Checklist Item 1
  • -
  • +
  • Testdoc2 Checklist Item 2 (checked)
  • -
  • +
  • Testdoc2 Checklist Item 3
  • -

    +

    πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ πŸ˜ƒ

    -

    +

    Testdoc2 bold text

    -

    +

    Testdoc2 italic text

    -

    +

    Testdoc2 Heading 1 Sized Text

    -

    +

    Testdoc2 Heading 2 Sized Text

    -

    +

    Testdoc2 Heading 3 Sized Text

    -

    +

    Testdoc2 Heading 4 Sized Text

    -

    +

    Testdoc2 Heading 5 Sized Text

    - +
    Testdoc2 Table: Column 1 Row 0 @@ -141,5 +142,6 @@
    + diff --git a/test_unstructured_ingest/expected-structured-output-html/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html b/test_unstructured_ingest/expected-structured-output-html/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html index d29de0166..c3b2d193c 100644 --- a/test_unstructured_ingest/expected-structured-output-html/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html +++ b/test_unstructured_ingest/expected-structured-output-html/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.html @@ -164,19 +164,20 @@

    Heading 1 content

    -

    + +

    d3d87fc6-61cc-4bb5-89ed-e9dff0df1526

    -

    +

    Stuff todo

    -

    +

    more stuff todo

    -

    +

    More things to do

    -

    +

    Something to do

    diff --git a/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErPIAU.eml.html b/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErPIAU.eml.html index 5bde62f48..5c1f52487 100644 --- a/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErPIAU.eml.html +++ b/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErPIAU.eml.html @@ -10,5 +10,6 @@

    Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/

    + diff --git a/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErQIAU.eml.html b/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErQIAU.eml.html index 111127aeb..94fb01c41 100644 --- a/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErQIAU.eml.html +++ b/test_unstructured_ingest/expected-structured-output-html/salesforce/EmailMessage/02sHu00001efErQIAU.eml.html @@ -10,5 +10,6 @@

    Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/

    + diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json index 7ccd6c081..042101261 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/1605956.json @@ -42,7 +42,28 @@ "type": "NarrativeText" }, { - "element_id": "21e1683c1bc71c40ea20081368bcc7f6", + "element_id": "2051072f068db11d81f2bcbd031f8c19", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "156af6589ee1a114454df9aa55b88d85", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -69,7 +90,7 @@ "type": "NarrativeText" }, { - "element_id": "65f03aec0f3637db38c5a3741968eeff", + "element_id": "618dd7e3cee45b5b0f04847b33879336", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -96,7 +117,7 @@ "type": "NarrativeText" }, { - "element_id": "e2522f792c3c5ef32bf1ba342a282fdd", + "element_id": "ca6d9e5f81ae268b7bbf6b62dad3357b", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -133,7 +154,7 @@ "type": "NarrativeText" }, { - "element_id": "bd058a2d2c45c92a3178e327564e135a", + "element_id": "cf63812b68970732916946496b13b763", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -160,7 +181,7 @@ "type": "NarrativeText" }, { - "element_id": "eab79997042ec6e273d0a13383347a57", + "element_id": "82d520e252b220d5c4c6ce29ffb1ade1", "metadata": { 
"data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -181,7 +202,7 @@ "type": "Title" }, { - "element_id": "29cdfa9dda669b1dac60890795ab526c", + "element_id": "b2d427efb6bb6f37c4afd368cefab926", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -202,7 +223,28 @@ "type": "NarrativeText" }, { - "element_id": "3251fe353cdbb64ce5cf084aef00cd96", + "element_id": "d9f3cfd98a3c67adb56cfafae39d3e03", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "84ef673952608f3ba8bc4d2fa9deab59", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -223,7 +265,7 @@ "type": "Title" }, { - "element_id": "29a93ef334092c2a12daf86b1c1b61fb", + "element_id": "bcb788a54a545e7f1448f6e4dacb91eb", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -250,7 +292,7 @@ "type": "NarrativeText" }, { - "element_id": "15cc91b0ec273ab28ab202cd5e7836ea", + "element_id": "c9dd716e43dfb450e3ff4cf59a3b5c63", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -271,7 +313,28 @@ "type": "NarrativeText" }, { - "element_id": "c606d30a11f8686a33c4f5305ab878fa", + "element_id": "46647a4ff2f932d50ca02a1ef0ac51a2", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": 
"text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "3452f07fead697f48e719306657044a6", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -292,7 +355,7 @@ "type": "NarrativeText" }, { - "element_id": "9cec5c4cb40b1424590a7d2255ba5d98", + "element_id": "025ce3293479133863a7a64723611197", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -313,7 +376,7 @@ "type": "Title" }, { - "element_id": "158ce46e2f05121666d26652b44ce556", + "element_id": "0fa6faf7cc80d654c319b481e7c7ffce", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -340,7 +403,28 @@ "type": "NarrativeText" }, { - "element_id": "aedbcb95b475418adc9e82fb50e1832f", + "element_id": "df15c1a5963603656576632632e1dced", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "964954bfb165e4c1aa687b78fba71144", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -361,7 +445,7 @@ "type": "Title" }, { - "element_id": "9dcf5a605331e2e0db925a329a727df8", + "element_id": "fe5335fa2c3bc18a1cbb8425fe071e47", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -382,7 +466,7 @@ "type": "NarrativeText" }, { - "element_id": "a26e40b5555fb394e0844b7ae0118a90", + "element_id": "d336ac79f4cbd3245fad05bfbc4c8f2b", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -403,7 +487,28 @@ "type": "NarrativeText" }, { - "element_id": "04dfe464a23b5192ca7465fca96e8a56", + "element_id": 
"984da83593997e86b62223f8d1b03a62", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "9901914d311723f7f14e905d32ee94fd", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -424,7 +529,7 @@ "type": "Title" }, { - "element_id": "06b459a1ab6ee59cbf44705c24934f15", + "element_id": "30b4b4dc49d65a5a014b40312edbb424", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -445,7 +550,7 @@ "type": "NarrativeText" }, { - "element_id": "7d4a53bc8e11c662ba62212041b24cf6", + "element_id": "a4d482bff56873324e2f2578c381e971", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -472,7 +577,7 @@ "type": "NarrativeText" }, { - "element_id": "29eaf10632e9bd8a0f0c46ac3f6ff876", + "element_id": "f17948e62a99462cb4013796e97eea23", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -493,7 +598,7 @@ "type": "NarrativeText" }, { - "element_id": "885e34b9230d70d0c3257eef2d3f6a0f", + "element_id": "62804fd3619c5c942cf3944315db132c", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -514,7 +619,7 @@ "type": "NarrativeText" }, { - "element_id": "258ee604863fd54e308f2925d07ebd79", + "element_id": "80ba4f784cb65e206b17b76f79c55818", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -541,7 +646,28 @@ "type": "UncategorizedText" }, { - "element_id": "04a5e0e0b40cb961c84088dcc67b26b7", + "element_id": "7927a0fdb568097efde58fdd68ed7e0a", + "metadata": { + "data_source": { + "date_created": 
"2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "60a261f17ffc821a917909bfb88a6d70", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -562,7 +688,7 @@ "type": "Title" }, { - "element_id": "bd4f8d2535746efce21ce872c09ef973", + "element_id": "39d32e21527ef07823ab779970d88f26", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -589,7 +715,7 @@ "type": "UncategorizedText" }, { - "element_id": "433789f2b20ca6275f62a944390e3c1d", + "element_id": "fd0d57485d0925b681a03e270faeeb06", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -616,7 +742,49 @@ "type": "NarrativeText" }, { - "element_id": "959ffe89453ca67c279ed576df24e196", + "element_id": "b82b06b66608a8353fc7f99608bd8b08", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "32ce3055a4b209c2734306d8e7266c08", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + 
"filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "2bad3c29ae9bd81da3a1d4c52487b032", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -643,7 +811,7 @@ "type": "UncategorizedText" }, { - "element_id": "8b81b2db2cef191090cfa1d4204b8964", + "element_id": "aa92002440f8c5a41323b8f85d131665", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -670,7 +838,7 @@ "type": "NarrativeText" }, { - "element_id": "3fd46bb09e57e95f1211f475c45b575b", + "element_id": "b313e6521d8168c6c840f8113c0ebd27", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -697,7 +865,7 @@ "type": "NarrativeText" }, { - "element_id": "5cbfe913e369743f1f14830c0b6572ab", + "element_id": "c4bffd5805a6c7d1cb196dcd505f13d1", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:45.288000", @@ -722,5 +890,26 @@ }, "text": "Paste in page URLs to create smart links, or use the content report table to create a list of all the pages in the space.", "type": "NarrativeText" + }, + { + "element_id": "15e9a49d1413538015b1fd4d7dee1825", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:45.288000", + "date_modified": "2023-07-09T12:54:45.288000", + "record_locator": { + "page_id": "1605956", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json index 6708b0ed2..a592fa71c 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json +++ 
b/test_unstructured_ingest/expected-structured-output/confluence-diff/MFS/229477.json @@ -315,7 +315,28 @@ "type": "UncategorizedText" }, { - "element_id": "8e206800f74b037f87bc91ce09a66587", + "element_id": "11d63c2d51214128c8caebb58f2bf06d", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "3d68b97296629da6f56dbee7226fb9ea", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -336,7 +357,7 @@ "type": "Title" }, { - "element_id": "2c4cc93ed9393b0f05a3e564c436e13e", + "element_id": "b14012a7e1df00e14688673e6836af91", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -357,7 +378,7 @@ "type": "UncategorizedText" }, { - "element_id": "554c2527470d9fea2aaf8cefd8aa8ffc", + "element_id": "2ee3fe067727e804a8089f8c0131cd7e", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -378,7 +399,28 @@ "type": "UncategorizedText" }, { - "element_id": "feb3b3be79c77e3d661dc3fa522de26f", + "element_id": "e206acc35c25cd275875533feb308ecf", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "e9f3973e622aaacb42556e6f29d140c0", "metadata": { "data_source": { "date_created": 
"2023-06-30T17:25:25.504000", @@ -399,7 +441,7 @@ "type": "Title" }, { - "element_id": "5a73ff028549542468675768deee0430", + "element_id": "2b43cb7e0a29b1411d109e9a682940fa", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -420,7 +462,7 @@ "type": "UncategorizedText" }, { - "element_id": "94d211691238a7f3f74db151876c6734", + "element_id": "3560a31004a2e271125262ae3435cd80", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -441,7 +483,28 @@ "type": "UncategorizedText" }, { - "element_id": "198d8ad5606c445ba4dcafd19926c65e", + "element_id": "48a5d1f209c8025b1cfb1d882658743e", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "64c696a8ba912e8c86e3dacc55bcfd09", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -462,7 +525,7 @@ "type": "Title" }, { - "element_id": "776f1a1125f787afd3d193ede37edbf3", + "element_id": "60781a8a6086a335e6ef8efa6e767f74", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -483,7 +546,7 @@ "type": "UncategorizedText" }, { - "element_id": "7d9faf5ffc93c10998801ec69e82969d", + "element_id": "47137487152e9d98851e213658f3b212", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -504,7 +567,7 @@ "type": "UncategorizedText" }, { - "element_id": "46bdd16cf46259b25d67480f1467e0b0", + "element_id": "5189c62c2edeed476df22eaa2bb5af21", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -525,7 +588,7 @@ "type": "Title" }, { - "element_id": 
"80dadf7b66548e15b0b7f73c59ee50cf", + "element_id": "43e843feeaed82e03996b90693f9c8eb", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -552,7 +615,7 @@ "type": "NarrativeText" }, { - "element_id": "23168bef3f665803fb9ec74644a65674", + "element_id": "0bae84d0e5cdc716a1dce4f739b86469", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -579,7 +642,7 @@ "type": "EmailAddress" }, { - "element_id": "02510c1509479158e837ac5d13f84bf5", + "element_id": "4d103f0c3f7f3527c37f34a8c4e86782", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -600,7 +663,7 @@ "type": "UncategorizedText" }, { - "element_id": "c59943bccf5535ffd752fe52a2f6a184", + "element_id": "deda95e4491b693fdb7bb978868beefd", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -621,7 +684,7 @@ "type": "UncategorizedText" }, { - "element_id": "21d150625554235f8fe3270ed63d2921", + "element_id": "e35c7cd3ecffe9ca0e65935f3feebfbd", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -642,7 +705,7 @@ "type": "UncategorizedText" }, { - "element_id": "29c4e13f95e215957a8d697601c3d1cc", + "element_id": "f953d1e45bf1cf4cd4985b61255a41e3", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -663,7 +726,7 @@ "type": "Title" }, { - "element_id": "8bdacdf1a36489a491926616432b7b8e", + "element_id": "53c5427b05c4256bd7c7e03346e58b9f", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -690,7 +753,28 @@ "type": "NarrativeText" }, { - "element_id": "68accd9d0365712f54b96da661cce03d", + "element_id": "6e5310473567927ff094c33ba42ff201", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": 
"https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "a139fb30a2382364053eb57aa180550f", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -711,7 +795,7 @@ "type": "Title" }, { - "element_id": "35aa0d02a38ad72c0ca0534155dbdeb8", + "element_id": "eb784ba0d48bf9e06b53aed2ac3fbd72", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -738,7 +822,28 @@ "type": "UncategorizedText" }, { - "element_id": "ea538f1ebdd2ced67e8c86dcf50bc164", + "element_id": "768cfb8a51125da06add3109e7d155b3", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "67503783d98953e33cdc2846b90c21fd", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -759,7 +864,7 @@ "type": "Title" }, { - "element_id": "6f4ae84a8d8a1d9005384f35e2ce793c", + "element_id": "27194483431e4365b86572cbc73b9af5", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -786,7 +891,28 @@ "type": "NarrativeText" }, { - "element_id": "9616030a71ad0e0654b28e61578d0443", + "element_id": "c941c078ee573a2bbca654a7b5ce68f4", + "metadata": { + "data_source": { + "date_created": "2023-06-30T17:25:25.504000", + "date_modified": "2023-06-30T17:25:30.898000", + "record_locator": { + "page_id": "229477", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477", + 
"version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "885ad7169d419802971c64780c7a7968", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -807,7 +933,7 @@ "type": "Title" }, { - "element_id": "d81cb76df56721595c0495e4f5e6094f", + "element_id": "4d12c0c0f2d8211bc2b3eae35ac4f854", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -834,7 +960,7 @@ "type": "NarrativeText" }, { - "element_id": "46c3bd98dbea47cb63923597c929b932", + "element_id": "019ded9026166e1794b589358870fe60", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -855,7 +981,7 @@ "type": "Title" }, { - "element_id": "1558d5e9d97c1cbb5cbb5cb2b077f83d", + "element_id": "7cdd15b42c50cc95a64aa83149e72aec", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -882,7 +1008,7 @@ "type": "NarrativeText" }, { - "element_id": "c281ed85f2e1125c9aaf318fd5178d4d", + "element_id": "008813f1d7a4380879ff001294f8bc6e", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", @@ -903,7 +1029,7 @@ "type": "Title" }, { - "element_id": "4b401fd3bc190fce17f70000e0164772", + "element_id": "2a28d14ef4ba44c8f0098df26a520f23", "metadata": { "data_source": { "date_created": "2023-06-30T17:25:25.504000", diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json index 332585b7b..8e74f96ed 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605859.json @@ -315,7 +315,28 @@ "type": "UncategorizedText" }, { - "element_id": "93eecf0cb223bb9b38800c595a2c1ce2", + "element_id": "33831fbc138ef739d88d4f83b4cfc58d", + "metadata": { + "data_source": 
{ + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "240725efee18f416b470f886d83e54a3", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -336,7 +357,7 @@ "type": "Title" }, { - "element_id": "75ee4a303fc5ab8639c7bca973f29e30", + "element_id": "a8359a51dc7bc16fc9f2f412dfad01d7", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -357,7 +378,7 @@ "type": "UncategorizedText" }, { - "element_id": "22731d9c17747fc4708fd7f418e9dd57", + "element_id": "4d2982f8ec1f943ba5887ea5e1c41722", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -378,7 +399,28 @@ "type": "UncategorizedText" }, { - "element_id": "c4327bb8ec4ea8444a6307fcdf6928cd", + "element_id": "1709eac9e1289421c96b86fa773e85ba", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "8e408d997b6afdcc6dc7c5d2f60d51fe", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -399,7 +441,7 @@ "type": "Title" }, { - "element_id": "aa48062270f019242d68093284c4fa0c", + "element_id": "f86b21d5900d7c26053ce0d49624e22b", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -420,7 +462,7 @@ "type": 
"UncategorizedText" }, { - "element_id": "4bdb6fa86fd59b0729ecb9b6dbbf1ba7", + "element_id": "ad6b52393cba4295aa11d461df801ec9", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -441,7 +483,28 @@ "type": "UncategorizedText" }, { - "element_id": "07671349c39424db27fcf99634ed95d2", + "element_id": "3fa16ff3939638c6415d5d1367aa01be", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "92cda6e10ddc39a6274a39bd28d78fd6", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -462,7 +525,7 @@ "type": "Title" }, { - "element_id": "bb14e5c4bda33439f627d9d0484b603c", + "element_id": "c6bb501cb86fef4a7e6af33b44408860", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -483,7 +546,7 @@ "type": "UncategorizedText" }, { - "element_id": "a85a7425fe31f85a4aa6ae0a3d5c4251", + "element_id": "1e867147aebd2e2042c0b79216eb8ad6", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -504,7 +567,7 @@ "type": "UncategorizedText" }, { - "element_id": "c250d32242e3900d71e3dc6a4a6ac3c4", + "element_id": "72969103d9798a14b6937a5f17e95250", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -525,7 +588,7 @@ "type": "Title" }, { - "element_id": "2cba66c761cce97def3ee35ad7e841a1", + "element_id": "a1f62f9caaa9e0ab38abfecc9992beb6", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -552,7 +615,7 @@ "type": "NarrativeText" }, { - "element_id": "bb593264cda1392498158b2ce65053ac", + "element_id": 
"d6507473bd42ae2c5043ef9682f5b71f", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -579,7 +642,7 @@ "type": "EmailAddress" }, { - "element_id": "ad4aa408f6abd52bd1e2adf149fed96d", + "element_id": "d68042b1765da182a599d7f147d2abef", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -600,7 +663,7 @@ "type": "UncategorizedText" }, { - "element_id": "f98b3b59b55313381052f1cfa1194bc5", + "element_id": "717b067188e80741597eb37455bf4fbe", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -621,7 +684,7 @@ "type": "UncategorizedText" }, { - "element_id": "a9f4432dce00417cc8a4c304e424c28b", + "element_id": "16455e060585b3e0817764ca31c32151", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -642,7 +705,7 @@ "type": "UncategorizedText" }, { - "element_id": "3716407dd9d7c3bc756ab8ee46ea7770", + "element_id": "f773ae2bc874cb28cff580d0b63a627a", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -663,7 +726,7 @@ "type": "Title" }, { - "element_id": "6fda3a7478f59f5290ac529d13bbceaf", + "element_id": "8a7363b7d1eb2cb37430121d27168de0", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -690,7 +753,28 @@ "type": "NarrativeText" }, { - "element_id": "f4186b4e1cec5ef7009560d11cb74087", + "element_id": "030568cacd3b66ce8ee6c6c3c9be840f", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "fd9d745f22dffbb155b2e8022e2dc2e4", "metadata": { "data_source": { "date_created": 
"2023-07-09T12:54:40.304000", @@ -711,7 +795,7 @@ "type": "Title" }, { - "element_id": "a27a3099dea44c05dfea1e0e125abac5", + "element_id": "71d0ef13e2b308bf6c79c3153f3ed35f", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -738,7 +822,28 @@ "type": "UncategorizedText" }, { - "element_id": "44d083c5ce62947d874c568db0dbc01b", + "element_id": "7e882f807cf95f54e80ea3d7b75f6edd", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "16fda0efe288d0c8d1cf18b1037b5b0e", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -759,7 +864,7 @@ "type": "Title" }, { - "element_id": "23d9d3b7eb1b506a1031e99b28243136", + "element_id": "8bf5be7f0d4a4b5248347885f68f6b89", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -786,7 +891,28 @@ "type": "NarrativeText" }, { - "element_id": "5d12aca2ca2b8aba5c9dee48f1475f55", + "element_id": "56b696bc7b11d0f3e1165cb157426dcc", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:54:40.304000", + "date_modified": "2023-07-13T14:13:27.275000", + "record_locator": { + "page_id": "1605859", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859", + "version": "2" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "c6fe156426f03a42912623025777f8c8", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -807,7 +933,7 @@ "type": "Title" }, { - 
"element_id": "70182a5acbdac0041ee51b85dfca692f", + "element_id": "d8f7425068e3b4e6e99affa00d268060", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -834,7 +960,7 @@ "type": "NarrativeText" }, { - "element_id": "243fc77b8eebdbcf00a6a108a8159b69", + "element_id": "e4589df20d851e29530dbf5f97444eca", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -855,7 +981,7 @@ "type": "Title" }, { - "element_id": "3014e5236eb14590a7c13e83c36b20ce", + "element_id": "37a3e4a1755417a6944ff64115257147", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -882,7 +1008,7 @@ "type": "NarrativeText" }, { - "element_id": "800f984e0d3456624dce9630abfd873a", + "element_id": "e26ff7fd8e8e12c8aa704e6f97275fbf", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", @@ -903,7 +1029,7 @@ "type": "Title" }, { - "element_id": "800885acdda14ccb63621293f9a3aa2f", + "element_id": "18220fb2182492f64b3504513de4fbef", "metadata": { "data_source": { "date_created": "2023-07-09T12:54:40.304000", diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json index 995c9dc63..caceda2dd 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json +++ b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1605989.json @@ -308,7 +308,29 @@ "type": "UncategorizedText" }, { - "element_id": "d0b45e375f3a7207caacb7be289ebd62", + "element_id": "f8085d2948c73dfb968f7b221f3e8fab", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:56:40.842000", + "date_modified": "2023-07-09T12:57:59.173000", + "record_locator": { + "page_id": "1605989", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": 
"https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng", + "fra" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "48d494bb12fd182b0106bff99dd2e3be", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -330,7 +352,7 @@ "type": "UncategorizedText" }, { - "element_id": "351fc6ff4a9a491bf863ed7aa20fd5c5", + "element_id": "3f1b3ecb6515a47b94579cf7de892f09", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -352,7 +374,7 @@ "type": "UncategorizedText" }, { - "element_id": "6688bffe9c19dca7cb61ee039a6ffa10", + "element_id": "171423f703a966d2616837ed489f6975", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -374,7 +396,7 @@ "type": "UncategorizedText" }, { - "element_id": "e50d0b83f51c65bda0620ccec0368a41", + "element_id": "87daeeb71306ae76a90c0e6ccac0dd47", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -396,7 +418,7 @@ "type": "UncategorizedText" }, { - "element_id": "91697f192743d0583d02cb3e232d3c83", + "element_id": "c4d15cc61c5d6a3f2350f758b82e487f", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -418,7 +440,7 @@ "type": "UncategorizedText" }, { - "element_id": "7e1f204c284d5e878639feca87a022c4", + "element_id": "e069a6333ef83f6f250880a500439da3", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -440,7 +462,7 @@ "type": "UncategorizedText" }, { - "element_id": "24fc2762132dbbf33824a2c8575f6c14", + "element_id": "2f030590e85c72dd4a2fc739cc05affe", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -462,7 +484,7 @@ "type": "UncategorizedText" }, { - "element_id": "c8d91bf0f74cf2d7474b81fa319cc0e5", + "element_id": "2db738cf60bf0471df90b6141fc6a8e5", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -490,7 +512,7 @@ 
"type": "NarrativeText" }, { - "element_id": "ce274d7699a4270e902e3617c7cf6e36", + "element_id": "ab3005ca05b4f48396361646916154b4", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -512,7 +534,7 @@ "type": "ListItem" }, { - "element_id": "b54b171c49bbdb6f51308ff765b7f121", + "element_id": "ac70c0a823f0a1d56777036e77e77fd9", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -534,7 +556,7 @@ "type": "ListItem" }, { - "element_id": "712a4752864712c0ec58730edb76b2f1", + "element_id": "099fca1cec6f3eaa5f71ed9c2ed235e4", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -556,7 +578,7 @@ "type": "ListItem" }, { - "element_id": "f2e9daed509db420ecf36984f431900f", + "element_id": "85d4a299ad3ee61201530bf0030808b1", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -578,7 +600,7 @@ "type": "UncategorizedText" }, { - "element_id": "3cb5f4888419631affdc50af8f020348", + "element_id": "a1a4f27d3b3cc32777e25b3bb0766083", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -606,7 +628,7 @@ "type": "NarrativeText" }, { - "element_id": "9f5d86ea05eeb0bf570d9141a5b8994b", + "element_id": "976749a5d532d1f18195d61fe8c04be3", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -634,7 +656,7 @@ "type": "UncategorizedText" }, { - "element_id": "64ba474681b32c7dbc2a00fb9ec3e757", + "element_id": "079d83c4a7622c70baab0336e3128ec4", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -656,7 +678,7 @@ "type": "Title" }, { - "element_id": "2d4a5727cd260bb321af0f777e2e699f", + "element_id": "68e58e6fec19f4ec291fd5bcca3dadd8", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -678,7 +700,7 @@ "type": "Title" }, { - "element_id": "0a5dc104636145b04136d2eee7c4469b", + "element_id": "8dbaefc9dcf7af80f14871cdd2b0c1d6", "metadata": { "data_source": { "date_created": 
"2023-07-09T12:56:40.842000", @@ -700,7 +722,7 @@ "type": "Title" }, { - "element_id": "470c624d2702678b94685d916908136a", + "element_id": "2ca9400f81b794c7d595f9bcd719b18a", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -722,7 +744,7 @@ "type": "Title" }, { - "element_id": "bd8ca6285c9717384bd456685550304b", + "element_id": "191672694e36e804e20214e8c9bd2d44", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -744,7 +766,7 @@ "type": "Title" }, { - "element_id": "99613e61c98f3e61b28c98d096524077", + "element_id": "10d98c7ebc4b2cd6e5508aa9563cc788", "metadata": { "data_source": { "date_created": "2023-07-09T12:56:40.842000", @@ -765,5 +787,27 @@ }, "text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2", "type": "Table" + }, + { + "element_id": "45b142b9287e8ebadbc1dfb48ee5245a", + "metadata": { + "data_source": { + "date_created": "2023-07-09T12:56:40.842000", + "date_modified": "2023-07-09T12:57:59.173000", + "record_locator": { + "page_id": "1605989", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng", + "fra" + ] + }, + "text": "", + "type": "Image" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json index b244b7ae9..5476ea28a 100644 --- a/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json +++ 
b/test_unstructured_ingest/expected-structured-output/confluence-diff/testteamsp/1802252.json @@ -308,7 +308,29 @@ "type": "UncategorizedText" }, { - "element_id": "a931e049fc3bd99cf74ef09502a71938", + "element_id": "61525bb0c406b033be2849cb375e9fcd", + "metadata": { + "data_source": { + "date_created": "2023-07-11T17:01:39.240000", + "date_modified": "2023-07-11T17:01:47.340000", + "record_locator": { + "page_id": "1802252", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1802252", + "version": "1" + }, + "filetype": "text/html", + "languages": [ + "eng", + "fra" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "7e8cf6622ad36f28966178e194feaad8", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -330,7 +352,7 @@ "type": "UncategorizedText" }, { - "element_id": "59e566b7776eba69071658b586226bd0", + "element_id": "eeb9717b5c634b6bb4b4bb1b83101500", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -352,7 +374,7 @@ "type": "UncategorizedText" }, { - "element_id": "971bd18c2de3ef14a26ba1d8e4ef8668", + "element_id": "c5bc1b1b90fd002ca136bc3d76e3d482", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -374,7 +396,7 @@ "type": "UncategorizedText" }, { - "element_id": "c688b4f7d2e49c8d8d7c77d28ddf5ecc", + "element_id": "9ecb8d523cdd75be94294fa2ca440799", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -396,7 +418,7 @@ "type": "UncategorizedText" }, { - "element_id": "9ce074ac38046f414a5f16cd9c7308b3", + "element_id": "4909ea47e524d3b8fbac470c0663f589", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -418,7 +440,7 @@ "type": "UncategorizedText" }, { - "element_id": "a4fbf964d1efe50e1c1ee181b453d4d6", + "element_id": "ab916d1a2aa844a463b48a24637a8b2d", "metadata": { "data_source": { "date_created": 
"2023-07-11T17:01:39.240000", @@ -440,7 +462,7 @@ "type": "UncategorizedText" }, { - "element_id": "b7c108f30be7dfb550213536c197e563", + "element_id": "a9060bb59dfa0b37c1ec13e97228470a", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -462,7 +484,7 @@ "type": "UncategorizedText" }, { - "element_id": "9e78d28a1e5c130197f6a909ec74c987", + "element_id": "0dbff111f50ea5a58eee83c85a1c30b5", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -490,7 +512,7 @@ "type": "NarrativeText" }, { - "element_id": "d8ae65b075a2f46c394461d4e393f0d5", + "element_id": "ed95477f3bff4586983201b6387c875b", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -512,7 +534,7 @@ "type": "ListItem" }, { - "element_id": "7f3784563903fdf80ca26e027ca7376d", + "element_id": "260cee6fb6f1562e7b1cb0f7644ea64d", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -534,7 +556,7 @@ "type": "ListItem" }, { - "element_id": "81f723fb10893947353084829f8b5f68", + "element_id": "6820bf233e6604b7bd6750d0d2b62192", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -556,7 +578,7 @@ "type": "ListItem" }, { - "element_id": "a28747bf65c9c6ad4981e57ec35822a3", + "element_id": "8f13efbe5d5b289c6ce8eb00e2b5fae2", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -578,7 +600,7 @@ "type": "UncategorizedText" }, { - "element_id": "ec5dbc92af9cfee5f32dba0e9919b1f7", + "element_id": "ecc13666f56ab3fb01917335016cd9c3", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -606,7 +628,7 @@ "type": "NarrativeText" }, { - "element_id": "f09110aa418d33cbaccc7b380e0fe0c6", + "element_id": "93ec1210a0bf6e8b0c6c8504648e7489", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -634,7 +656,7 @@ "type": "UncategorizedText" }, { - "element_id": "fa11e4585afb53a4d046e095f08ac084", + "element_id": 
"f3c0e02138cb55302a075d5508843876", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -656,7 +678,7 @@ "type": "Title" }, { - "element_id": "2f06add07bf5f930085d334e1d1fdb6c", + "element_id": "5bc45f2fc513158f644f2c217cc9e54d", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -678,7 +700,7 @@ "type": "Title" }, { - "element_id": "0c493dc4e25a1447702be5bd7d8a156f", + "element_id": "81bebaf32dff5511a7856e553b526fa3", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -700,7 +722,7 @@ "type": "Title" }, { - "element_id": "dcf629a3cf73037815d0b85bf9878bd5", + "element_id": "833cc590b6815b09fd40d1d73a752420", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -722,7 +744,7 @@ "type": "Title" }, { - "element_id": "31d39e1ce259ec5bc37463b03c993697", + "element_id": "bff43fdce7563ccbd5cf3354090e8cf3", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -744,7 +766,7 @@ "type": "Title" }, { - "element_id": "8083af07d9148f975b439cdb91a216cf", + "element_id": "69b2cf7ade2f1034892b2b38b186fdaa", "metadata": { "data_source": { "date_created": "2023-07-11T17:01:39.240000", @@ -765,5 +787,27 @@ }, "text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2", "type": "Table" + }, + { + "element_id": "825a8cbb41eecc2f1b29d4b34cb05c2f", + "metadata": { + "data_source": { + "date_created": "2023-07-11T17:01:39.240000", + "date_modified": "2023-07-11T17:01:47.340000", + "record_locator": { + "page_id": "1802252", + "url": "https://unstructured-ingest-test.atlassian.net" + }, + "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1802252", + "version": "1" + }, + "filetype": "text/html", + 
"languages": [ + "eng", + "fra" + ] + }, + "text": "", + "type": "Image" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json index f6ce506dd..90ef86b66 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json +++ b/test_unstructured_ingest/expected-structured-output/notion/c47a4566-4c7a-488b-ac2a-1292ee507fcb.json @@ -538,7 +538,22 @@ "type": "NarrativeText" }, { - "element_id": "c087a92c7251ca836ff023d35cb0a1aa", + "element_id": "387c4d334f8e9650a56b3b444b2ad5f6", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-17T18:48:00.000Z" + }, + "filetype": "text/html", + "languages": [ + "eng" + ] + }, + "text": "", + "type": "Image" + }, + { + "element_id": "60d9f47b086264ea72277b741e3b2bdd", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -553,7 +568,7 @@ "type": "UncategorizedText" }, { - "element_id": "3126a68fa0a12481ca6dc64c16511a7e", + "element_id": "b39f61345657ccc5e201c20a6a90fad7", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -568,7 +583,7 @@ "type": "UncategorizedText" }, { - "element_id": "8cfa5b216c8d3f774f8e1def029681e6", + "element_id": "b95452fe8c6616a1ce1311457526c302", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -583,7 +598,7 @@ "type": "UncategorizedText" }, { - "element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26", + "element_id": "a7c3ee9360b2020e28aa31835ef5283c", "metadata": { "data_source": { "date_created": "2023-08-04T18:31:00.000Z", @@ -598,7 +613,7 @@ "type": "NarrativeText" }, { - "element_id": "570c50d8758c5639a1dfd0f238f609d5", + "element_id": "349f058fcce7e32bb68b620841f40c9e", "metadata": { "data_source": { "date_created": 
"2023-08-04T18:31:00.000Z", diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json index d2aafacb8..ce4d1e2fe 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json @@ -26,5 +26,33 @@ "date_modified": "1692628456.0" } } + }, + { + "type": "Image", + "element_id": "f714fa214dac2f441515c4f28370d279", + "text": "", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "message/rfc822", + "email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net", + "sent_from": [ + "devops+salesforce-connector@unstructured.io" + ], + "sent_to": [ + "jane_gray@uoa.edu" + ], + "subject": "Test of email 1", + "data_source": { + "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU", + "version": "1694691603.0", + "record_locator": { + "id": "02sHu00001efErPIAU" + }, + "date_created": "1692542056.0", + "date_modified": "1692628456.0" + } + } } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json index 44ea5eb7f..302b1469b 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json @@ -26,5 +26,33 @@ "date_modified": "1692542155.0" } } + }, + { + "type": "Image", + "element_id": "68870d055535f48c7439ce67092768f6", + "text": "", + "metadata": { + "languages": [ + "eng" + ], + "filetype": "message/rfc822", + 
"email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net", + "sent_from": [ + "devops+salesforce-connector@unstructured.io" + ], + "sent_to": [ + "sean@edge.com" + ], + "subject": "Test of Salesforce 2", + "data_source": { + "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU", + "version": "1694691603.0", + "record_locator": { + "id": "02sHu00001efErQIAU" + }, + "date_created": "1692542149.0", + "date_modified": "1692542155.0" + } + } } ] \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f4c062f36..d9aaf41e7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.25" # pragma: no cover +__version__ = "0.16.26-dev1" # pragma: no cover diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index 26f86693d..1a9b03c50 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -75,6 +75,7 @@ Other background from __future__ import annotations +import re from collections import defaultdict, deque from types import MappingProxyType from typing import Any, Iterable, Iterator, Mapping, NamedTuple, Sequence, cast @@ -89,6 +90,7 @@ from unstructured.documents.elements import ( Element, ElementMetadata, EmailAddress, + Image, ListItem, NarrativeText, Table, @@ -477,6 +479,34 @@ class Pre(BlockItem): return _PreElementAccumulator(self) +class ImageBlock(Flow): + """Custom element-class for `` elements.""" + + BASE64_IMAGE_REGEX = re.compile(r"^data:(image/[^;]+);base64,(.*)") + + def iter_elements(self) -> Iterator[Element]: + """Generate an Image element based on `src`, `data-src`, and `alt`.""" + img_src = self.get("data-src", "").strip() or self.get("src", "").strip() + img_alt = self.get("alt", "").strip() + + if not img_src: # Early exit if no image source + return + + mime_match = self.BASE64_IMAGE_REGEX.match(img_src) + 
img_mime_type = mime_match.group(1) if mime_match else None + img_base64 = mime_match.group(2) if mime_match else None + img_url = None if img_base64 else img_src + + yield Image( + text=img_alt, + metadata=ElementMetadata( + image_mime_type=img_mime_type, + image_base64=img_base64, + url=img_url, + ), + ) + + class TableBlock(Flow): """Custom element-class for `<table>` element.""" @@ -928,6 +958,8 @@ element_class_lookup.get_namespace(None).update( "ol": ListBlock, "ul": ListBlock, "li": ListItemBlock, + # -- image -- + "img": ImageBlock, # -- table -- "table": TableBlock, # -- annotated phrasing -- diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index 57dcc2b5c..5292344a9 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -10,7 +10,7 @@ import requests from lxml import etree from unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, ElementType from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.model import FileType from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date @@ -108,6 +108,8 @@ class HtmlPartitionerOptions: detection_origin: str | None, html_parser_version: Literal["v1", "v2"] = "v1", image_alt_mode: Optional[Literal["to_text"]] = "to_text", + extract_image_block_types: Optional[list[str]] = None, + extract_image_block_to_payload: bool = False, ): self._file_path = file_path self._file = file @@ -120,6 +122,8 @@ class HtmlPartitionerOptions: self._detection_origin = detection_origin self._html_parser_version = html_parser_version self._image_alt_mode = image_alt_mode + self._extract_image_block_types = extract_image_block_types + self._extract_image_block_to_payload = extract_image_block_to_payload @lazyproperty def detection_origin(self) -> str | None: @@ -183,6 +187,15 @@ class 
_HtmlPartitioner: def __init__(self, opts: HtmlPartitionerOptions): self._opts = opts + def _should_include_image_base64(self, element: Element) -> bool: + """Determines if an image_base64 element should be included in the output.""" + return ( + element.category == ElementType.IMAGE + and self._opts._extract_image_block_to_payload + and self._opts._extract_image_block_types is not None + and "Image" in self._opts._extract_image_block_types + ) + @classmethod def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]: """Partition HTML document provided by `opts` into document-elements.""" @@ -202,6 +215,10 @@ class _HtmlPartitioner: for e in elements_iter: e.metadata.last_modified = self._opts.last_modified e.metadata.detection_origin = self._opts.detection_origin + + # -- remove if not requested -- + if not self._should_include_image_base64(e): + e.metadata.image_base64 = None yield e @lazyproperty @@ -224,7 +241,7 @@ class _HtmlPartitioner: # -- remove a variety of HTML element types like