feat: include images when partitioning html (#3945)

Currently we [filter img
tags](2addb19473/unstructured/partition/html/partition.py (L226-L229))
before tags are converted to Elements by the html partitioner. More
importantly, we also don’t currently have a defined “block” / mapping to
support these. This adds those mappings and the logic to process them.

It also respects `extract_image_block_types` and
`extract_image_block_to_payload` (as we do with pdfs) to determine
whether base64 is included in the metadata.

The partitioned Image Elements set their text to the img tag’s alt text
if available.

The partitioned Image Elements include the [url in the
metadata](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/documents/elements.py#L209)
(rather than image_base64) if the img tag src is a url.

## Testing

Unit tests have been added for explicit coverage.
Existing integration tests and other unit test fixtures have been
updated to account for the `Image` elements now present.

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
This commit is contained in:
ryannikolaidis 2025-03-07 17:25:21 -08:00 committed by GitHub
parent 74b0647aa2
commit c0457c1cc3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 1014 additions and 264 deletions

View File

@ -1,3 +1,13 @@
## 0.16.26-dev1
### Enhancements
- **Add support for images in html partitioner** `<img>` tags will now be parsed as `Image` elements. When `extract_image_block_types` includes `Image` and `extract_image_block_to_payload=True`, the `image_base64` will be included for images that specify the base64 data (rather than a url) as the source.
### Features
### Fixes
## 0.16.25
### Enhancements

View File

@ -6,7 +6,7 @@ from __future__ import annotations
import io
import pathlib
from typing import Any
from typing import Any, Optional
import pytest
from lxml import etree
@ -24,6 +24,7 @@ from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import (
Address,
CompositeElement,
ElementType,
ListItem,
NarrativeText,
Table,
@ -296,6 +297,68 @@ def test_it_does_not_extract_text_in_style_tags():
assert element.text == "Lorem ipsum dolor"
# -- image parsing behaviors ---------------------------------------------------------------------
@pytest.mark.parametrize(
    ("extract_to_payload", "extract_types", "expect_base64"),
    [
        (True, ["Image"], True),
        (True, [], False),
        (True, None, False),
        (False, ["Image"], False),
    ],
)
def test_partition_html_base64_for_images(
    opts_args: dict[str, Any],
    extract_to_payload: bool,
    extract_types: Optional[list[str]],
    expect_base64: bool,
):
    """An `<img>` with a base64 data-URI source partitions to a single Image element.

    The element text is the alt text and the mime type is taken from the data URI. The
    base64 payload is expected in the metadata only for the parametrized cases where
    payload extraction is enabled and "Image" is among the requested block types.
    """
    encoded_png = (
        "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/"
        "w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
    )
    data_uri = f"data:image/png;base64,{encoded_png}"
    alt = "Base64 Image"
    markup = f"""
<div class="Page">
<img src="{data_uri}" alt="{alt}">
</div>
"""
    opts_args.update(
        text=markup,
        extract_image_block_to_payload=extract_to_payload,
        extract_image_block_types=extract_types,
    )
    opts = HtmlPartitionerOptions(**opts_args)

    # -- exactly one element is expected; unpacking fails loudly otherwise --
    [element] = _HtmlPartitioner.iter_elements(opts)

    assert element.category == ElementType.IMAGE
    assert element.text == alt
    assert element.metadata.image_mime_type == "image/png"
    if not expect_base64:
        assert element.metadata.image_base64 is None
        return
    assert element.metadata.image_base64 == encoded_png
def test_partition_html_includes_url_for_images():
    """An `<img>` whose `src` is a URL yields an Image element carrying that URL in metadata."""
    image_url = "https://example.com/image.png"
    alt = "URL Image"
    # language=HTML
    markup = f"""
<div class="Page">
<img src="{image_url}" alt="{alt}">
</div>
"""

    # -- exactly one element is expected; unpacking fails loudly otherwise --
    [image] = partition_html(text=markup)

    assert image.category == ElementType.IMAGE
    assert image.text == alt
    assert image.metadata.url == image_url
# -- table parsing behaviors ---------------------------------------------------------------------

View File

@ -210,7 +210,7 @@ def test_auto_partition_epub_from_filename():
elements = partition(example_doc_path("winter-sports.epub"), strategy=PartitionStrategy.HI_RES)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
def test_auto_partition_epub_from_file():
@ -218,7 +218,7 @@ def test_auto_partition_epub_from_file():
elements = partition(file=f, strategy=PartitionStrategy.HI_RES)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
# ================================================================================================
@ -430,7 +430,7 @@ def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
def test_partition_md_from_url_works_with_embedded_html():
url = "https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/README.md"
elements = partition(url=url, content_type="text/markdown", strategy=PartitionStrategy.HI_RES)
assert "unstructured" in elements[0].text
assert "unstructured" in elements[1].text
# ================================================================================================

View File

@ -14,14 +14,14 @@ def test_partition_epub_from_filename():
assert len(elements) > 0
assert isinstance(elements[0], Text)
assert elements[0].text.startswith("a shared culture")
assert elements[1].text.startswith("a shared culture")
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in elements} == {"epub"}
def test_partition_epub_from_filename_returns_table_in_elements():
elements = partition_epub(example_doc_path("winter-sports.epub"))
assert elements[10] == Table(
assert elements[12] == Table(
"Contents. List of Illustrations (In certain versions of this etext [in certain\nbrowsers]"
" clicking on the image will bring up a larger\nversion.) (etext transcriber's note)"
)
@ -32,7 +32,7 @@ def test_partition_epub_from_file():
elements = partition_epub(file=f)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
assert elements[2].text.startswith("The Project Gutenberg eBook of Winter Sports")
# -- .metadata.filename --------------------------------------------------------------------------

View File

@ -13,89 +13,98 @@
<p class="NarrativeText" id="d36113941235a14bdacafa399698ee71">
The overview is the first page visitors will see when they visit your space, so it helps to include some information on what the space is about and what your team is working on.
</p>
<p class="NarrativeText" id="21e1683c1bc71c40ea20081368bcc7f6">
<img alt="" class="Image" id="2051072f068db11d81f2bcbd031f8c19"/>
<p class="NarrativeText" id="156af6589ee1a114454df9aa55b88d85">
Add a header image. This gives your overview visual appeal and makes it welcoming for visitors.
</p>
<p class="NarrativeText" id="65f03aec0f3637db38c5a3741968eeff">
<p class="NarrativeText" id="618dd7e3cee45b5b0f04847b33879336">
Explain what the space is for. Start by summarizing the purpose of the space. This could be your team's mission statement or a brief description of the kind of work you do.
</p>
<p class="NarrativeText" id="e2522f792c3c5ef32bf1ba342a282fdd">
<p class="NarrativeText" id="ca6d9e5f81ae268b7bbf6b62dad3357b">
Share team goals. Add links to your team's OKRs, project plans, and product roadmaps so visitors can quickly get a sense of your team's goals.
</p>
<p class="NarrativeText" id="bd058a2d2c45c92a3178e327564e135a">
<p class="NarrativeText" id="cf63812b68970732916946496b13b763">
Tell people how to contact you. Share your timezone and links to Slack channels, email aliases, or other contact details your team uses so visitors can contact you with questions or feedback about your team's work.
</p>
<h1 class="Title" id="eab79997042ec6e273d0a13383347a57">
<h1 class="Title" id="82d520e252b220d5c4c6ce29ffb1ade1">
Use shortcuts for easy access
</h1>
<p class="NarrativeText" id="29cdfa9dda669b1dac60890795ab526c">
<p class="NarrativeText" id="b2d427efb6bb6f37c4afd368cefab926">
Shortcuts are helpful for important pages that members of a space might need to get to often. These shortcuts are added and organized by the space administrator. Space admins can link to pages in the space, other related spaces, or relevant external web content as well as reorder the shortcuts as needed.
</p>
<h1 class="Title" id="3251fe353cdbb64ce5cf084aef00cd96">
<img alt="" class="Image" id="d9f3cfd98a3c67adb56cfafae39d3e03"/>
<h1 class="Title" id="84ef673952608f3ba8bc4d2fa9deab59">
💭Start discussions with inline comments
</h1>
<p class="NarrativeText" id="29a93ef334092c2a12daf86b1c1b61fb">
<p class="NarrativeText" id="bcb788a54a545e7f1448f6e4dacb91eb">
Thoughtful responses can get lost and lose context as email replies pile up. And if you neglect to copy someone or want to add them later on, it's difficult for them to get up to speed. Inline comments allow anyone (or everyone) to huddle around an idea while referencing key information on the project page.
</p>
<p class="NarrativeText" id="15cc91b0ec273ab28ab202cd5e7836ea">
<p class="NarrativeText" id="c9dd716e43dfb450e3ff4cf59a3b5c63">
To leave an inline comment, highlight text on the page and the comment icon will appear.
</p>
<p class="NarrativeText" id="c606d30a11f8686a33c4f5305ab878fa">
<img alt="" class="Image" id="46647a4ff2f932d50ca02a1ef0ac51a2"/>
<p class="NarrativeText" id="3452f07fead697f48e719306657044a6">
Team members with permission to access the page can respond to any comment. Plus, when a comment thread comes to its natural conclusion, comments can be resolved and cleared away.
</p>
<h1 class="Title" id="9cec5c4cb40b1424590a7d2255ba5d98">
<h1 class="Title" id="025ce3293479133863a7a64723611197">
👋Loop in team members with @mentions
</h1>
<p class="NarrativeText" id="158ce46e2f05121666d26652b44ce556">
<p class="NarrativeText" id="0fa6faf7cc80d654c319b481e7c7ffce">
@mentions on Confluence function like @mentions on social media platforms like Twitter, Instagram, and Slack. Type the @ symbol on a Confluence page or in a comment, begin spelling a team member's first name, and a list will appear. Select the individual to ask a question or assign a task.
</p>
<h1 class="Title" id="aedbcb95b475418adc9e82fb50e1832f">
<img alt="" class="Image" id="df15c1a5963603656576632632e1dced"/>
<h1 class="Title" id="964954bfb165e4c1aa687b78fba71144">
👏Endorse ideas with reactions
</h1>
<p class="NarrativeText" id="9dcf5a605331e2e0db925a329a727df8">
<p class="NarrativeText" id="fe5335fa2c3bc18a1cbb8425fe071e47">
Use reactions when you want to support a comment or acknowledge you've seen one without clogging up the thread with another comment.
</p>
<p class="NarrativeText" id="a26e40b5555fb394e0844b7ae0118a90">
<p class="NarrativeText" id="d336ac79f4cbd3245fad05bfbc4c8f2b">
You can also use reactions on a page or blog post. The author of the content will be notified, and if enough team members react or add comments to the content, it'll be surfaced on Confluence home feed
</p>
<h1 class="Title" id="04dfe464a23b5192ca7465fca96e8a56">
<img alt="" class="Image" id="984da83593997e86b62223f8d1b03a62"/>
<h1 class="Title" id="9901914d311723f7f14e905d32ee94fd">
Take your Confluence space to the next level
</h1>
<p class="NarrativeText" id="06b459a1ab6ee59cbf44705c24934f15">
<p class="NarrativeText" id="30b4b4dc49d65a5a014b40312edbb424">
Extend the capabilities of your Confluence pages by adding extra functionality or including dynamic content.
</p>
<p class="NarrativeText" id="7d4a53bc8e11c662ba62212041b24cf6">
<p class="NarrativeText" id="a4d482bff56873324e2f2578c381e971">
To add functionality:
</p>
<p class="NarrativeText" id="29eaf10632e9bd8a0f0c46ac3f6ff876">
<p class="NarrativeText" id="f17948e62a99462cb4013796e97eea23">
Type ' / ' to open the list of items available to use
</p>
<p class="NarrativeText" id="885e34b9230d70d0c3257eef2d3f6a0f">
<p class="NarrativeText" id="62804fd3619c5c942cf3944315db132c">
Find the item to be inserted and select it
</p>
<p class="UncategorizedText" id="258ee604863fd54e308f2925d07ebd79">
<p class="UncategorizedText" id="80ba4f784cb65e206b17b76f79c55818">
Select Insert
</p>
<h1 class="Title" id="04a5e0e0b40cb961c84088dcc67b26b7">
<img alt="" class="Image" id="7927a0fdb568097efde58fdd68ed7e0a"/>
<h1 class="Title" id="60a261f17ffc821a917909bfb88a6d70">
Useful elements for Team space
</h1>
<p class="UncategorizedText" id="bd4f8d2535746efce21ce872c09ef973">
<p class="UncategorizedText" id="39d32e21527ef07823ab779970d88f26">
Introduce the team
</p>
<p class="NarrativeText" id="433789f2b20ca6275f62a944390e3c1d">
<p class="NarrativeText" id="fd0d57485d0925b681a03e270faeeb06">
Add user profiles to display a short summary of a given Confluence user's profile with their role, profile photo and contact details.
</p>
<p class="UncategorizedText" id="959ffe89453ca67c279ed576df24e196">
<img alt="" class="Image" id="b82b06b66608a8353fc7f99608bd8b08"/>
<img alt="" class="Image" id="32ce3055a4b209c2734306d8e7266c08"/>
<p class="UncategorizedText" id="2bad3c29ae9bd81da3a1d4c52487b032">
Share news and announcements with your team
</p>
<p class="NarrativeText" id="8b81b2db2cef191090cfa1d4204b8964">
<p class="NarrativeText" id="aa92002440f8c5a41323b8f85d131665">
Display a stream of latest blog posts so your team can easily see what's been going on.
</p>
<p class="NarrativeText" id="3fd46bb09e57e95f1211f475c45b575b">
<p class="NarrativeText" id="b313e6521d8168c6c840f8113c0ebd27">
Display a list of important pages
</p>
<p class="NarrativeText" id="5cbfe913e369743f1f14830c0b6572ab">
<p class="NarrativeText" id="c4bffd5805a6c7d1cb196dcd505f13d1">
Paste in page URLs to create smart links, or use the content report table to create a list of all the pages in the space.
</p>
<img alt="" class="Image" id="15e9a49d1413538015b1fd4d7dee1825"/>
</body>
</html>

View File

@ -46,85 +46,91 @@
<p class="UncategorizedText" id="9d2ea8da0d1c12bb3616cd3cb4e56128">
Add team members to your space.
</p>
<h1 class="Title" id="8e206800f74b037f87bc91ce09a66587">
<img alt="" class="Image" id="11d63c2d51214128c8caebb58f2bf06d"/>
<h1 class="Title" id="3d68b97296629da6f56dbee7226fb9ea">
Team member
</h1>
<p class="UncategorizedText" id="2c4cc93ed9393b0f05a3e564c436e13e">
<p class="UncategorizedText" id="b14012a7e1df00e14688673e6836af91">
Role
</p>
<p class="UncategorizedText" id="554c2527470d9fea2aaf8cefd8aa8ffc">
<p class="UncategorizedText" id="2ee3fe067727e804a8089f8c0131cd7e">
Responsibility
</p>
<h1 class="Title" id="feb3b3be79c77e3d661dc3fa522de26f">
<img alt="" class="Image" id="e206acc35c25cd275875533feb308ecf"/>
<h1 class="Title" id="e9f3973e622aaacb42556e6f29d140c0">
Team member
</h1>
<p class="UncategorizedText" id="5a73ff028549542468675768deee0430">
<p class="UncategorizedText" id="2b43cb7e0a29b1411d109e9a682940fa">
Role
</p>
<p class="UncategorizedText" id="94d211691238a7f3f74db151876c6734">
<p class="UncategorizedText" id="3560a31004a2e271125262ae3435cd80">
Responsibility
</p>
<h1 class="Title" id="198d8ad5606c445ba4dcafd19926c65e">
<img alt="" class="Image" id="48a5d1f209c8025b1cfb1d882658743e"/>
<h1 class="Title" id="64c696a8ba912e8c86e3dacc55bcfd09">
Team member
</h1>
<p class="UncategorizedText" id="776f1a1125f787afd3d193ede37edbf3">
<p class="UncategorizedText" id="60781a8a6086a335e6ef8efa6e767f74">
Role
</p>
<p class="UncategorizedText" id="7d9faf5ffc93c10998801ec69e82969d">
<p class="UncategorizedText" id="47137487152e9d98851e213658f3b212">
Responsibility
</p>
<h1 class="Title" id="46bdd16cf46259b25d67480f1467e0b0">
<h1 class="Title" id="5189c62c2edeed476df22eaa2bb5af21">
Contact us
</h1>
<p class="NarrativeText" id="80dadf7b66548e15b0b7f73c59ee50cf">
<p class="NarrativeText" id="43e843feeaed82e03996b90693f9c8eb">
How can someone reach out to your team?
</p>
<div class="EmailAddress" id="23168bef3f665803fb9ec74644a65674">
<div class="EmailAddress" id="0bae84d0e5cdc716a1dce4f739b86469">
team@email.com
</div>
<p class="UncategorizedText" id="02510c1509479158e837ac5d13f84bf5">
<p class="UncategorizedText" id="4d103f0c3f7f3527c37f34a8c4e86782">
Tickets
</p>
<p class="UncategorizedText" id="c59943bccf5535ffd752fe52a2f6a184">
<p class="UncategorizedText" id="deda95e4491b693fdb7bb978868beefd">
Jira board
</p>
<p class="UncategorizedText" id="21d150625554235f8fe3270ed63d2921">
<p class="UncategorizedText" id="e35c7cd3ecffe9ca0e65935f3feebfbd">
#channel
</p>
<h1 class="Title" id="29c4e13f95e215957a8d697601c3d1cc">
<h1 class="Title" id="f953d1e45bf1cf4cd4985b61255a41e3">
Important Pages
</h1>
<p class="NarrativeText" id="8bdacdf1a36489a491926616432b7b8e">
<p class="NarrativeText" id="53c5427b05c4256bd7c7e03346e58b9f">
List them here
</p>
<h1 class="Title" id="68accd9d0365712f54b96da661cce03d">
<img alt="" class="Image" id="6e5310473567927ff094c33ba42ff201"/>
<h1 class="Title" id="a139fb30a2382364053eb57aa180550f">
Onboarding FAQs
</h1>
<p class="UncategorizedText" id="35aa0d02a38ad72c0ca0534155dbdeb8">
<p class="UncategorizedText" id="eb784ba0d48bf9e06b53aed2ac3fbd72">
Add resources for new hires
</p>
<h1 class="Title" id="ea538f1ebdd2ced67e8c86dcf50bc164">
<img alt="" class="Image" id="768cfb8a51125da06add3109e7d155b3"/>
<h1 class="Title" id="67503783d98953e33cdc2846b90c21fd">
Meeting notes
</h1>
<p class="NarrativeText" id="6f4ae84a8d8a1d9005384f35e2ce793c">
<p class="NarrativeText" id="27194483431e4365b86572cbc73b9af5">
Add links to meeting notes
</p>
<h1 class="Title" id="9616030a71ad0e0654b28e61578d0443">
<img alt="" class="Image" id="c941c078ee573a2bbca654a7b5ce68f4"/>
<h1 class="Title" id="885ad7169d419802971c64780c7a7968">
Team goals
</h1>
<p class="NarrativeText" id="d81cb76df56721595c0495e4f5e6094f">
<p class="NarrativeText" id="4d12c0c0f2d8211bc2b3eae35ac4f854">
List them here
</p>
<h1 class="Title" id="46c3bd98dbea47cb63923597c929b932">
<h1 class="Title" id="019ded9026166e1794b589358870fe60">
Team news
</h1>
<p class="NarrativeText" id="1558d5e9d97c1cbb5cbb5cb2b077f83d">
<p class="NarrativeText" id="7cdd15b42c50cc95a64aa83149e72aec">
Create a blog post to share team news. It will automatically appear here once it's published.
</p>
<h1 class="Title" id="c281ed85f2e1125c9aaf318fd5178d4d">
<h1 class="Title" id="008813f1d7a4380879ff001294f8bc6e">
Blog stream
</h1>
<p class="NarrativeText" id="4b401fd3bc190fce17f70000e0164772">
<p class="NarrativeText" id="2a28d14ef4ba44c8f0098df26a520f23">
Create a blog post to share news and announcements with your team and company.
</p>
</body>

View File

@ -46,85 +46,91 @@
<p class="UncategorizedText" id="af3236ec30847a0d5e80d5c4c48d24b3">
Add team members to your space.
</p>
<h1 class="Title" id="93eecf0cb223bb9b38800c595a2c1ce2">
<img alt="" class="Image" id="33831fbc138ef739d88d4f83b4cfc58d"/>
<h1 class="Title" id="240725efee18f416b470f886d83e54a3">
Team member
</h1>
<p class="UncategorizedText" id="75ee4a303fc5ab8639c7bca973f29e30">
<p class="UncategorizedText" id="a8359a51dc7bc16fc9f2f412dfad01d7">
Role
</p>
<p class="UncategorizedText" id="22731d9c17747fc4708fd7f418e9dd57">
<p class="UncategorizedText" id="4d2982f8ec1f943ba5887ea5e1c41722">
Responsibility
</p>
<h1 class="Title" id="c4327bb8ec4ea8444a6307fcdf6928cd">
<img alt="" class="Image" id="1709eac9e1289421c96b86fa773e85ba"/>
<h1 class="Title" id="8e408d997b6afdcc6dc7c5d2f60d51fe">
Team member
</h1>
<p class="UncategorizedText" id="aa48062270f019242d68093284c4fa0c">
<p class="UncategorizedText" id="f86b21d5900d7c26053ce0d49624e22b">
Role
</p>
<p class="UncategorizedText" id="4bdb6fa86fd59b0729ecb9b6dbbf1ba7">
<p class="UncategorizedText" id="ad6b52393cba4295aa11d461df801ec9">
Responsibility
</p>
<h1 class="Title" id="07671349c39424db27fcf99634ed95d2">
<img alt="" class="Image" id="3fa16ff3939638c6415d5d1367aa01be"/>
<h1 class="Title" id="92cda6e10ddc39a6274a39bd28d78fd6">
Team member
</h1>
<p class="UncategorizedText" id="bb14e5c4bda33439f627d9d0484b603c">
<p class="UncategorizedText" id="c6bb501cb86fef4a7e6af33b44408860">
Role
</p>
<p class="UncategorizedText" id="a85a7425fe31f85a4aa6ae0a3d5c4251">
<p class="UncategorizedText" id="1e867147aebd2e2042c0b79216eb8ad6">
Responsibility
</p>
<h1 class="Title" id="c250d32242e3900d71e3dc6a4a6ac3c4">
<h1 class="Title" id="72969103d9798a14b6937a5f17e95250">
Contact us
</h1>
<p class="NarrativeText" id="2cba66c761cce97def3ee35ad7e841a1">
<p class="NarrativeText" id="a1f62f9caaa9e0ab38abfecc9992beb6">
How can someone reach out to your team?
</p>
<div class="EmailAddress" id="bb593264cda1392498158b2ce65053ac">
<div class="EmailAddress" id="d6507473bd42ae2c5043ef9682f5b71f">
team@email.com
</div>
<p class="UncategorizedText" id="ad4aa408f6abd52bd1e2adf149fed96d">
<p class="UncategorizedText" id="d68042b1765da182a599d7f147d2abef">
Tickets
</p>
<p class="UncategorizedText" id="f98b3b59b55313381052f1cfa1194bc5">
<p class="UncategorizedText" id="717b067188e80741597eb37455bf4fbe">
Jira board
</p>
<p class="UncategorizedText" id="a9f4432dce00417cc8a4c304e424c28b">
<p class="UncategorizedText" id="16455e060585b3e0817764ca31c32151">
#channel
</p>
<h1 class="Title" id="3716407dd9d7c3bc756ab8ee46ea7770">
<h1 class="Title" id="f773ae2bc874cb28cff580d0b63a627a">
Important Pages
</h1>
<p class="NarrativeText" id="6fda3a7478f59f5290ac529d13bbceaf">
<p class="NarrativeText" id="8a7363b7d1eb2cb37430121d27168de0">
List them here
</p>
<h1 class="Title" id="f4186b4e1cec5ef7009560d11cb74087">
<img alt="" class="Image" id="030568cacd3b66ce8ee6c6c3c9be840f"/>
<h1 class="Title" id="fd9d745f22dffbb155b2e8022e2dc2e4">
Onboarding FAQs
</h1>
<p class="UncategorizedText" id="a27a3099dea44c05dfea1e0e125abac5">
<p class="UncategorizedText" id="71d0ef13e2b308bf6c79c3153f3ed35f">
Add resources for new hires
</p>
<h1 class="Title" id="44d083c5ce62947d874c568db0dbc01b">
<img alt="" class="Image" id="7e882f807cf95f54e80ea3d7b75f6edd"/>
<h1 class="Title" id="16fda0efe288d0c8d1cf18b1037b5b0e">
Meeting notes
</h1>
<p class="NarrativeText" id="23d9d3b7eb1b506a1031e99b28243136">
<p class="NarrativeText" id="8bf5be7f0d4a4b5248347885f68f6b89">
Add links to meeting notes
</p>
<h1 class="Title" id="5d12aca2ca2b8aba5c9dee48f1475f55">
<img alt="" class="Image" id="56b696bc7b11d0f3e1165cb157426dcc"/>
<h1 class="Title" id="c6fe156426f03a42912623025777f8c8">
Team goals
</h1>
<p class="NarrativeText" id="70182a5acbdac0041ee51b85dfca692f">
<p class="NarrativeText" id="d8f7425068e3b4e6e99affa00d268060">
List them here
</p>
<h1 class="Title" id="243fc77b8eebdbcf00a6a108a8159b69">
<h1 class="Title" id="e4589df20d851e29530dbf5f97444eca">
Team news
</h1>
<p class="NarrativeText" id="3014e5236eb14590a7c13e83c36b20ce">
<p class="NarrativeText" id="37a3e4a1755417a6944ff64115257147">
Create a blog post to share team news. It will automatically appear here once it's published.
</p>
<h1 class="Title" id="800f984e0d3456624dce9630abfd873a">
<h1 class="Title" id="e26ff7fd8e8e12c8aa704e6f97275fbf">
Blog stream
</h1>
<p class="NarrativeText" id="800885acdda14ccb63621293f9a3aa2f">
<p class="NarrativeText" id="18220fb2182492f64b3504513de4fbef">
Create a blog post to share news and announcements with your team and company.
</p>
</body>

View File

@ -49,64 +49,65 @@
<p class="UncategorizedText" id="ca6e8673360d0f9a946786edc086f26e">
testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3
</p>
<p class="UncategorizedText" id="d0b45e375f3a7207caacb7be289ebd62">
<img alt="" class="Image" id="f8085d2948c73dfb968f7b221f3e8fab"/>
<p class="UncategorizedText" id="48d494bb12fd182b0106bff99dd2e3be">
Testdoc3 List Item 1
</p>
<p class="UncategorizedText" id="351fc6ff4a9a491bf863ed7aa20fd5c5">
<p class="UncategorizedText" id="3f1b3ecb6515a47b94579cf7de892f09">
Testdoc3 List Item 1 Nested Item A
</p>
<p class="UncategorizedText" id="6688bffe9c19dca7cb61ee039a6ffa10">
<p class="UncategorizedText" id="171423f703a966d2616837ed489f6975">
Testdoc3 List Item 1 Nested Item B
</p>
<p class="UncategorizedText" id="e50d0b83f51c65bda0620ccec0368a41">
<p class="UncategorizedText" id="87daeeb71306ae76a90c0e6ccac0dd47">
Testdoc3 List Item 2
</p>
<p class="UncategorizedText" id="91697f192743d0583d02cb3e232d3c83">
<p class="UncategorizedText" id="c4d15cc61c5d6a3f2350f758b82e487f">
Testdoc3 List Item 3
</p>
<p class="UncategorizedText" id="7e1f204c284d5e878639feca87a022c4">
<p class="UncategorizedText" id="e069a6333ef83f6f250880a500439da3">
Testdoc3 List Item 4
</p>
<p class="UncategorizedText" id="24fc2762132dbbf33824a2c8575f6c14">
<p class="UncategorizedText" id="2f030590e85c72dd4a2fc739cc05affe">
Testdoc3 List Item 5
</p>
<p class="NarrativeText" id="c8d91bf0f74cf2d7474b81fa319cc0e5">
<p class="NarrativeText" id="2db738cf60bf0471df90b6141fc6a8e5">
This is the link for unstructured . io.
</p>
<li class="ListItem" id="ce274d7699a4270e902e3617c7cf6e36">
<li class="ListItem" id="ab3005ca05b4f48396361646916154b4">
Testdoc3 Checklist Item 1
</li>
<li class="ListItem" id="b54b171c49bbdb6f51308ff765b7f121">
<li class="ListItem" id="ac70c0a823f0a1d56777036e77e77fd9">
Testdoc3 Checklist Item 2 (checked)
</li>
<li class="ListItem" id="712a4752864712c0ec58730edb76b2f1">
<li class="ListItem" id="099fca1cec6f3eaa5f71ed9c2ed235e4">
Testdoc3 Checklist Item 3
</li>
<p class="UncategorizedText" id="f2e9daed509db420ecf36984f431900f">
<p class="UncategorizedText" id="85d4a299ad3ee61201530bf0030808b1">
😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃
</p>
<p class="NarrativeText" id="3cb5f4888419631affdc50af8f020348">
<p class="NarrativeText" id="a1a4f27d3b3cc32777e25b3bb0766083">
Testdoc3 bold text
</p>
<p class="UncategorizedText" id="9f5d86ea05eeb0bf570d9141a5b8994b">
<p class="UncategorizedText" id="976749a5d532d1f18195d61fe8c04be3">
Testdoc3 italic text
</p>
<h1 class="Title" id="64ba474681b32c7dbc2a00fb9ec3e757">
<h1 class="Title" id="079d83c4a7622c70baab0336e3128ec4">
Testdoc3 Heading 1 Sized Text
</h1>
<h1 class="Title" id="2d4a5727cd260bb321af0f777e2e699f">
<h1 class="Title" id="68e58e6fec19f4ec291fd5bcca3dadd8">
Testdoc3 Heading 2 Sized Text
</h1>
<h1 class="Title" id="0a5dc104636145b04136d2eee7c4469b">
<h1 class="Title" id="8dbaefc9dcf7af80f14871cdd2b0c1d6">
Testdoc3 Heading 3 Sized Text
</h1>
<h1 class="Title" id="470c624d2702678b94685d916908136a">
<h1 class="Title" id="2ca9400f81b794c7d595f9bcd719b18a">
Testdoc3 Heading 4 Sized Text
</h1>
<h1 class="Title" id="bd8ca6285c9717384bd456685550304b">
<h1 class="Title" id="191672694e36e804e20214e8c9bd2d44">
Testdoc3 Heading 5 Sized Text
</h1>
<table class="Table" id="99613e61c98f3e61b28c98d096524077" style="border: 1px solid black; border-collapse: collapse;">
<table class="Table" id="10d98c7ebc4b2cd6e5508aa9563cc788" style="border: 1px solid black; border-collapse: collapse;">
<tr style="border: 1px solid black;">
<td style="border: 1px solid black;">
Testdoc3 Table: Column 1 Row 0
@ -141,5 +142,6 @@
</td>
</tr>
</table>
<img alt="" class="Image" id="45b142b9287e8ebadbc1dfb48ee5245a"/>
</body>
</html>

View File

@ -49,64 +49,65 @@
<p class="UncategorizedText" id="caab6974e98b9e03c78191c02591775e">
testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2
</p>
<p class="UncategorizedText" id="a931e049fc3bd99cf74ef09502a71938">
<img alt="" class="Image" id="61525bb0c406b033be2849cb375e9fcd"/>
<p class="UncategorizedText" id="7e8cf6622ad36f28966178e194feaad8">
Testdoc2 List Item 1
</p>
<p class="UncategorizedText" id="59e566b7776eba69071658b586226bd0">
<p class="UncategorizedText" id="eeb9717b5c634b6bb4b4bb1b83101500">
Testdoc2 List Item 1 Nested Item A
</p>
<p class="UncategorizedText" id="971bd18c2de3ef14a26ba1d8e4ef8668">
<p class="UncategorizedText" id="c5bc1b1b90fd002ca136bc3d76e3d482">
Testdoc2 List Item 1 Nested Item B
</p>
<p class="UncategorizedText" id="c688b4f7d2e49c8d8d7c77d28ddf5ecc">
<p class="UncategorizedText" id="9ecb8d523cdd75be94294fa2ca440799">
Testdoc2 List Item 2
</p>
<p class="UncategorizedText" id="9ce074ac38046f414a5f16cd9c7308b3">
<p class="UncategorizedText" id="4909ea47e524d3b8fbac470c0663f589">
Testdoc2 List Item 3
</p>
<p class="UncategorizedText" id="a4fbf964d1efe50e1c1ee181b453d4d6">
<p class="UncategorizedText" id="ab916d1a2aa844a463b48a24637a8b2d">
Testdoc2 List Item 4
</p>
<p class="UncategorizedText" id="b7c108f30be7dfb550213536c197e563">
<p class="UncategorizedText" id="a9060bb59dfa0b37c1ec13e97228470a">
Testdoc2 List Item 5
</p>
<p class="NarrativeText" id="9e78d28a1e5c130197f6a909ec74c987">
<p class="NarrativeText" id="0dbff111f50ea5a58eee83c85a1c30b5">
This is the link for unstructured . io.
</p>
<li class="ListItem" id="d8ae65b075a2f46c394461d4e393f0d5">
<li class="ListItem" id="ed95477f3bff4586983201b6387c875b">
Testdoc2 Checklist Item 1
</li>
<li class="ListItem" id="7f3784563903fdf80ca26e027ca7376d">
<li class="ListItem" id="260cee6fb6f1562e7b1cb0f7644ea64d">
Testdoc2 Checklist Item 2 (checked)
</li>
<li class="ListItem" id="81f723fb10893947353084829f8b5f68">
<li class="ListItem" id="6820bf233e6604b7bd6750d0d2b62192">
Testdoc2 Checklist Item 3
</li>
<p class="UncategorizedText" id="a28747bf65c9c6ad4981e57ec35822a3">
<p class="UncategorizedText" id="8f13efbe5d5b289c6ce8eb00e2b5fae2">
😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃
</p>
<p class="NarrativeText" id="ec5dbc92af9cfee5f32dba0e9919b1f7">
<p class="NarrativeText" id="ecc13666f56ab3fb01917335016cd9c3">
Testdoc2 bold text
</p>
<p class="UncategorizedText" id="f09110aa418d33cbaccc7b380e0fe0c6">
<p class="UncategorizedText" id="93ec1210a0bf6e8b0c6c8504648e7489">
Testdoc2 italic text
</p>
<h1 class="Title" id="fa11e4585afb53a4d046e095f08ac084">
<h1 class="Title" id="f3c0e02138cb55302a075d5508843876">
Testdoc2 Heading 1 Sized Text
</h1>
<h1 class="Title" id="2f06add07bf5f930085d334e1d1fdb6c">
<h1 class="Title" id="5bc45f2fc513158f644f2c217cc9e54d">
Testdoc2 Heading 2 Sized Text
</h1>
<h1 class="Title" id="0c493dc4e25a1447702be5bd7d8a156f">
<h1 class="Title" id="81bebaf32dff5511a7856e553b526fa3">
Testdoc2 Heading 3 Sized Text
</h1>
<h1 class="Title" id="dcf629a3cf73037815d0b85bf9878bd5">
<h1 class="Title" id="833cc590b6815b09fd40d1d73a752420">
Testdoc2 Heading 4 Sized Text
</h1>
<h1 class="Title" id="31d39e1ce259ec5bc37463b03c993697">
<h1 class="Title" id="bff43fdce7563ccbd5cf3354090e8cf3">
Testdoc2 Heading 5 Sized Text
</h1>
<table class="Table" id="8083af07d9148f975b439cdb91a216cf" style="border: 1px solid black; border-collapse: collapse;">
<table class="Table" id="69b2cf7ade2f1034892b2b38b186fdaa" style="border: 1px solid black; border-collapse: collapse;">
<tr style="border: 1px solid black;">
<td style="border: 1px solid black;">
Testdoc2 Table: Column 1 Row 0
@ -141,5 +142,6 @@
</td>
</tr>
</table>
<img alt="" class="Image" id="825a8cbb41eecc2f1b29d4b34cb05c2f"/>
</body>
</html>

View File

@ -164,19 +164,20 @@
<p class="NarrativeText" id="fa3e9d761730605036aaf854d9edd5b4">
Heading 1 content
</p>
<p class="UncategorizedText" id="c087a92c7251ca836ff023d35cb0a1aa">
<img alt="" class="Image" id="387c4d334f8e9650a56b3b444b2ad5f6"/>
<p class="UncategorizedText" id="60d9f47b086264ea72277b741e3b2bdd">
d3d87fc6-61cc-4bb5-89ed-e9dff0df1526
</p>
<p class="UncategorizedText" id="3126a68fa0a12481ca6dc64c16511a7e">
<p class="UncategorizedText" id="b39f61345657ccc5e201c20a6a90fad7">
Stuff todo
</p>
<p class="UncategorizedText" id="8cfa5b216c8d3f774f8e1def029681e6">
<p class="UncategorizedText" id="b95452fe8c6616a1ce1311457526c302">
more stuff todo
</p>
<p class="NarrativeText" id="b538abdbf0aff3f9f1ab11d79bb5bc26">
<p class="NarrativeText" id="a7c3ee9360b2020e28aa31835ef5283c">
More things to do
</p>
<p class="NarrativeText" id="570c50d8758c5639a1dfd0f238f609d5">
<p class="NarrativeText" id="349f058fcce7e32bb68b620841f40c9e">
Something to do
</p>
</body>

View File

@ -10,5 +10,6 @@
<p class="NarrativeText" id="4196fe41da19e8657761ecffcafd3d2f">
Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/
</p>
<img alt="" class="Image" id="f714fa214dac2f441515c4f28370d279"/>
</body>
</html>

View File

@ -10,5 +10,6 @@
<p class="NarrativeText" id="6f168cd430b41fc0d66a3691ef3caa0f">
Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/
</p>
<img alt="" class="Image" id="68870d055535f48c7439ce67092768f6"/>
</body>
</html>

View File

@ -42,7 +42,28 @@
"type": "NarrativeText"
},
{
"element_id": "21e1683c1bc71c40ea20081368bcc7f6",
"element_id": "2051072f068db11d81f2bcbd031f8c19",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "156af6589ee1a114454df9aa55b88d85",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -69,7 +90,7 @@
"type": "NarrativeText"
},
{
"element_id": "65f03aec0f3637db38c5a3741968eeff",
"element_id": "618dd7e3cee45b5b0f04847b33879336",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -96,7 +117,7 @@
"type": "NarrativeText"
},
{
"element_id": "e2522f792c3c5ef32bf1ba342a282fdd",
"element_id": "ca6d9e5f81ae268b7bbf6b62dad3357b",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -133,7 +154,7 @@
"type": "NarrativeText"
},
{
"element_id": "bd058a2d2c45c92a3178e327564e135a",
"element_id": "cf63812b68970732916946496b13b763",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -160,7 +181,7 @@
"type": "NarrativeText"
},
{
"element_id": "eab79997042ec6e273d0a13383347a57",
"element_id": "82d520e252b220d5c4c6ce29ffb1ade1",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -181,7 +202,7 @@
"type": "Title"
},
{
"element_id": "29cdfa9dda669b1dac60890795ab526c",
"element_id": "b2d427efb6bb6f37c4afd368cefab926",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -202,7 +223,28 @@
"type": "NarrativeText"
},
{
"element_id": "3251fe353cdbb64ce5cf084aef00cd96",
"element_id": "d9f3cfd98a3c67adb56cfafae39d3e03",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "84ef673952608f3ba8bc4d2fa9deab59",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -223,7 +265,7 @@
"type": "Title"
},
{
"element_id": "29a93ef334092c2a12daf86b1c1b61fb",
"element_id": "bcb788a54a545e7f1448f6e4dacb91eb",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -250,7 +292,7 @@
"type": "NarrativeText"
},
{
"element_id": "15cc91b0ec273ab28ab202cd5e7836ea",
"element_id": "c9dd716e43dfb450e3ff4cf59a3b5c63",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -271,7 +313,28 @@
"type": "NarrativeText"
},
{
"element_id": "c606d30a11f8686a33c4f5305ab878fa",
"element_id": "46647a4ff2f932d50ca02a1ef0ac51a2",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "3452f07fead697f48e719306657044a6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -292,7 +355,7 @@
"type": "NarrativeText"
},
{
"element_id": "9cec5c4cb40b1424590a7d2255ba5d98",
"element_id": "025ce3293479133863a7a64723611197",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -313,7 +376,7 @@
"type": "Title"
},
{
"element_id": "158ce46e2f05121666d26652b44ce556",
"element_id": "0fa6faf7cc80d654c319b481e7c7ffce",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -340,7 +403,28 @@
"type": "NarrativeText"
},
{
"element_id": "aedbcb95b475418adc9e82fb50e1832f",
"element_id": "df15c1a5963603656576632632e1dced",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "964954bfb165e4c1aa687b78fba71144",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -361,7 +445,7 @@
"type": "Title"
},
{
"element_id": "9dcf5a605331e2e0db925a329a727df8",
"element_id": "fe5335fa2c3bc18a1cbb8425fe071e47",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -382,7 +466,7 @@
"type": "NarrativeText"
},
{
"element_id": "a26e40b5555fb394e0844b7ae0118a90",
"element_id": "d336ac79f4cbd3245fad05bfbc4c8f2b",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -403,7 +487,28 @@
"type": "NarrativeText"
},
{
"element_id": "04dfe464a23b5192ca7465fca96e8a56",
"element_id": "984da83593997e86b62223f8d1b03a62",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "9901914d311723f7f14e905d32ee94fd",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -424,7 +529,7 @@
"type": "Title"
},
{
"element_id": "06b459a1ab6ee59cbf44705c24934f15",
"element_id": "30b4b4dc49d65a5a014b40312edbb424",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -445,7 +550,7 @@
"type": "NarrativeText"
},
{
"element_id": "7d4a53bc8e11c662ba62212041b24cf6",
"element_id": "a4d482bff56873324e2f2578c381e971",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -472,7 +577,7 @@
"type": "NarrativeText"
},
{
"element_id": "29eaf10632e9bd8a0f0c46ac3f6ff876",
"element_id": "f17948e62a99462cb4013796e97eea23",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -493,7 +598,7 @@
"type": "NarrativeText"
},
{
"element_id": "885e34b9230d70d0c3257eef2d3f6a0f",
"element_id": "62804fd3619c5c942cf3944315db132c",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -514,7 +619,7 @@
"type": "NarrativeText"
},
{
"element_id": "258ee604863fd54e308f2925d07ebd79",
"element_id": "80ba4f784cb65e206b17b76f79c55818",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -541,7 +646,28 @@
"type": "UncategorizedText"
},
{
"element_id": "04a5e0e0b40cb961c84088dcc67b26b7",
"element_id": "7927a0fdb568097efde58fdd68ed7e0a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "60a261f17ffc821a917909bfb88a6d70",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -562,7 +688,7 @@
"type": "Title"
},
{
"element_id": "bd4f8d2535746efce21ce872c09ef973",
"element_id": "39d32e21527ef07823ab779970d88f26",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -589,7 +715,7 @@
"type": "UncategorizedText"
},
{
"element_id": "433789f2b20ca6275f62a944390e3c1d",
"element_id": "fd0d57485d0925b681a03e270faeeb06",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -616,7 +742,49 @@
"type": "NarrativeText"
},
{
"element_id": "959ffe89453ca67c279ed576df24e196",
"element_id": "b82b06b66608a8353fc7f99608bd8b08",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "32ce3055a4b209c2734306d8e7266c08",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "2bad3c29ae9bd81da3a1d4c52487b032",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -643,7 +811,7 @@
"type": "UncategorizedText"
},
{
"element_id": "8b81b2db2cef191090cfa1d4204b8964",
"element_id": "aa92002440f8c5a41323b8f85d131665",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -670,7 +838,7 @@
"type": "NarrativeText"
},
{
"element_id": "3fd46bb09e57e95f1211f475c45b575b",
"element_id": "b313e6521d8168c6c840f8113c0ebd27",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -697,7 +865,7 @@
"type": "NarrativeText"
},
{
"element_id": "5cbfe913e369743f1f14830c0b6572ab",
"element_id": "c4bffd5805a6c7d1cb196dcd505f13d1",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
@ -722,5 +890,26 @@
},
"text": "Paste in page URLs to create smart links, or use the content report table to create a list of all the pages in the space.",
"type": "NarrativeText"
},
{
"element_id": "15e9a49d1413538015b1fd4d7dee1825",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:45.288000",
"date_modified": "2023-07-09T12:54:45.288000",
"record_locator": {
"page_id": "1605956",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605956",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
}
]

View File

@ -315,7 +315,28 @@
"type": "UncategorizedText"
},
{
"element_id": "8e206800f74b037f87bc91ce09a66587",
"element_id": "11d63c2d51214128c8caebb58f2bf06d",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "3d68b97296629da6f56dbee7226fb9ea",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -336,7 +357,7 @@
"type": "Title"
},
{
"element_id": "2c4cc93ed9393b0f05a3e564c436e13e",
"element_id": "b14012a7e1df00e14688673e6836af91",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -357,7 +378,7 @@
"type": "UncategorizedText"
},
{
"element_id": "554c2527470d9fea2aaf8cefd8aa8ffc",
"element_id": "2ee3fe067727e804a8089f8c0131cd7e",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -378,7 +399,28 @@
"type": "UncategorizedText"
},
{
"element_id": "feb3b3be79c77e3d661dc3fa522de26f",
"element_id": "e206acc35c25cd275875533feb308ecf",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "e9f3973e622aaacb42556e6f29d140c0",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -399,7 +441,7 @@
"type": "Title"
},
{
"element_id": "5a73ff028549542468675768deee0430",
"element_id": "2b43cb7e0a29b1411d109e9a682940fa",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -420,7 +462,7 @@
"type": "UncategorizedText"
},
{
"element_id": "94d211691238a7f3f74db151876c6734",
"element_id": "3560a31004a2e271125262ae3435cd80",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -441,7 +483,28 @@
"type": "UncategorizedText"
},
{
"element_id": "198d8ad5606c445ba4dcafd19926c65e",
"element_id": "48a5d1f209c8025b1cfb1d882658743e",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "64c696a8ba912e8c86e3dacc55bcfd09",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -462,7 +525,7 @@
"type": "Title"
},
{
"element_id": "776f1a1125f787afd3d193ede37edbf3",
"element_id": "60781a8a6086a335e6ef8efa6e767f74",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -483,7 +546,7 @@
"type": "UncategorizedText"
},
{
"element_id": "7d9faf5ffc93c10998801ec69e82969d",
"element_id": "47137487152e9d98851e213658f3b212",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -504,7 +567,7 @@
"type": "UncategorizedText"
},
{
"element_id": "46bdd16cf46259b25d67480f1467e0b0",
"element_id": "5189c62c2edeed476df22eaa2bb5af21",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -525,7 +588,7 @@
"type": "Title"
},
{
"element_id": "80dadf7b66548e15b0b7f73c59ee50cf",
"element_id": "43e843feeaed82e03996b90693f9c8eb",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -552,7 +615,7 @@
"type": "NarrativeText"
},
{
"element_id": "23168bef3f665803fb9ec74644a65674",
"element_id": "0bae84d0e5cdc716a1dce4f739b86469",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -579,7 +642,7 @@
"type": "EmailAddress"
},
{
"element_id": "02510c1509479158e837ac5d13f84bf5",
"element_id": "4d103f0c3f7f3527c37f34a8c4e86782",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -600,7 +663,7 @@
"type": "UncategorizedText"
},
{
"element_id": "c59943bccf5535ffd752fe52a2f6a184",
"element_id": "deda95e4491b693fdb7bb978868beefd",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -621,7 +684,7 @@
"type": "UncategorizedText"
},
{
"element_id": "21d150625554235f8fe3270ed63d2921",
"element_id": "e35c7cd3ecffe9ca0e65935f3feebfbd",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -642,7 +705,7 @@
"type": "UncategorizedText"
},
{
"element_id": "29c4e13f95e215957a8d697601c3d1cc",
"element_id": "f953d1e45bf1cf4cd4985b61255a41e3",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -663,7 +726,7 @@
"type": "Title"
},
{
"element_id": "8bdacdf1a36489a491926616432b7b8e",
"element_id": "53c5427b05c4256bd7c7e03346e58b9f",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -690,7 +753,28 @@
"type": "NarrativeText"
},
{
"element_id": "68accd9d0365712f54b96da661cce03d",
"element_id": "6e5310473567927ff094c33ba42ff201",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "a139fb30a2382364053eb57aa180550f",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -711,7 +795,7 @@
"type": "Title"
},
{
"element_id": "35aa0d02a38ad72c0ca0534155dbdeb8",
"element_id": "eb784ba0d48bf9e06b53aed2ac3fbd72",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -738,7 +822,28 @@
"type": "UncategorizedText"
},
{
"element_id": "ea538f1ebdd2ced67e8c86dcf50bc164",
"element_id": "768cfb8a51125da06add3109e7d155b3",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "67503783d98953e33cdc2846b90c21fd",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -759,7 +864,7 @@
"type": "Title"
},
{
"element_id": "6f4ae84a8d8a1d9005384f35e2ce793c",
"element_id": "27194483431e4365b86572cbc73b9af5",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -786,7 +891,28 @@
"type": "NarrativeText"
},
{
"element_id": "9616030a71ad0e0654b28e61578d0443",
"element_id": "c941c078ee573a2bbca654a7b5ce68f4",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
"date_modified": "2023-06-30T17:25:30.898000",
"record_locator": {
"page_id": "229477",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/229477",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "885ad7169d419802971c64780c7a7968",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -807,7 +933,7 @@
"type": "Title"
},
{
"element_id": "d81cb76df56721595c0495e4f5e6094f",
"element_id": "4d12c0c0f2d8211bc2b3eae35ac4f854",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -834,7 +960,7 @@
"type": "NarrativeText"
},
{
"element_id": "46c3bd98dbea47cb63923597c929b932",
"element_id": "019ded9026166e1794b589358870fe60",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -855,7 +981,7 @@
"type": "Title"
},
{
"element_id": "1558d5e9d97c1cbb5cbb5cb2b077f83d",
"element_id": "7cdd15b42c50cc95a64aa83149e72aec",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -882,7 +1008,7 @@
"type": "NarrativeText"
},
{
"element_id": "c281ed85f2e1125c9aaf318fd5178d4d",
"element_id": "008813f1d7a4380879ff001294f8bc6e",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",
@ -903,7 +1029,7 @@
"type": "Title"
},
{
"element_id": "4b401fd3bc190fce17f70000e0164772",
"element_id": "2a28d14ef4ba44c8f0098df26a520f23",
"metadata": {
"data_source": {
"date_created": "2023-06-30T17:25:25.504000",

View File

@ -315,7 +315,28 @@
"type": "UncategorizedText"
},
{
"element_id": "93eecf0cb223bb9b38800c595a2c1ce2",
"element_id": "33831fbc138ef739d88d4f83b4cfc58d",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "240725efee18f416b470f886d83e54a3",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -336,7 +357,7 @@
"type": "Title"
},
{
"element_id": "75ee4a303fc5ab8639c7bca973f29e30",
"element_id": "a8359a51dc7bc16fc9f2f412dfad01d7",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -357,7 +378,7 @@
"type": "UncategorizedText"
},
{
"element_id": "22731d9c17747fc4708fd7f418e9dd57",
"element_id": "4d2982f8ec1f943ba5887ea5e1c41722",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -378,7 +399,28 @@
"type": "UncategorizedText"
},
{
"element_id": "c4327bb8ec4ea8444a6307fcdf6928cd",
"element_id": "1709eac9e1289421c96b86fa773e85ba",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "8e408d997b6afdcc6dc7c5d2f60d51fe",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -399,7 +441,7 @@
"type": "Title"
},
{
"element_id": "aa48062270f019242d68093284c4fa0c",
"element_id": "f86b21d5900d7c26053ce0d49624e22b",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -420,7 +462,7 @@
"type": "UncategorizedText"
},
{
"element_id": "4bdb6fa86fd59b0729ecb9b6dbbf1ba7",
"element_id": "ad6b52393cba4295aa11d461df801ec9",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -441,7 +483,28 @@
"type": "UncategorizedText"
},
{
"element_id": "07671349c39424db27fcf99634ed95d2",
"element_id": "3fa16ff3939638c6415d5d1367aa01be",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "92cda6e10ddc39a6274a39bd28d78fd6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -462,7 +525,7 @@
"type": "Title"
},
{
"element_id": "bb14e5c4bda33439f627d9d0484b603c",
"element_id": "c6bb501cb86fef4a7e6af33b44408860",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -483,7 +546,7 @@
"type": "UncategorizedText"
},
{
"element_id": "a85a7425fe31f85a4aa6ae0a3d5c4251",
"element_id": "1e867147aebd2e2042c0b79216eb8ad6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -504,7 +567,7 @@
"type": "UncategorizedText"
},
{
"element_id": "c250d32242e3900d71e3dc6a4a6ac3c4",
"element_id": "72969103d9798a14b6937a5f17e95250",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -525,7 +588,7 @@
"type": "Title"
},
{
"element_id": "2cba66c761cce97def3ee35ad7e841a1",
"element_id": "a1f62f9caaa9e0ab38abfecc9992beb6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -552,7 +615,7 @@
"type": "NarrativeText"
},
{
"element_id": "bb593264cda1392498158b2ce65053ac",
"element_id": "d6507473bd42ae2c5043ef9682f5b71f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -579,7 +642,7 @@
"type": "EmailAddress"
},
{
"element_id": "ad4aa408f6abd52bd1e2adf149fed96d",
"element_id": "d68042b1765da182a599d7f147d2abef",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -600,7 +663,7 @@
"type": "UncategorizedText"
},
{
"element_id": "f98b3b59b55313381052f1cfa1194bc5",
"element_id": "717b067188e80741597eb37455bf4fbe",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -621,7 +684,7 @@
"type": "UncategorizedText"
},
{
"element_id": "a9f4432dce00417cc8a4c304e424c28b",
"element_id": "16455e060585b3e0817764ca31c32151",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -642,7 +705,7 @@
"type": "UncategorizedText"
},
{
"element_id": "3716407dd9d7c3bc756ab8ee46ea7770",
"element_id": "f773ae2bc874cb28cff580d0b63a627a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -663,7 +726,7 @@
"type": "Title"
},
{
"element_id": "6fda3a7478f59f5290ac529d13bbceaf",
"element_id": "8a7363b7d1eb2cb37430121d27168de0",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -690,7 +753,28 @@
"type": "NarrativeText"
},
{
"element_id": "f4186b4e1cec5ef7009560d11cb74087",
"element_id": "030568cacd3b66ce8ee6c6c3c9be840f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "fd9d745f22dffbb155b2e8022e2dc2e4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -711,7 +795,7 @@
"type": "Title"
},
{
"element_id": "a27a3099dea44c05dfea1e0e125abac5",
"element_id": "71d0ef13e2b308bf6c79c3153f3ed35f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -738,7 +822,28 @@
"type": "UncategorizedText"
},
{
"element_id": "44d083c5ce62947d874c568db0dbc01b",
"element_id": "7e882f807cf95f54e80ea3d7b75f6edd",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "16fda0efe288d0c8d1cf18b1037b5b0e",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -759,7 +864,7 @@
"type": "Title"
},
{
"element_id": "23d9d3b7eb1b506a1031e99b28243136",
"element_id": "8bf5be7f0d4a4b5248347885f68f6b89",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -786,7 +891,28 @@
"type": "NarrativeText"
},
{
"element_id": "5d12aca2ca2b8aba5c9dee48f1475f55",
"element_id": "56b696bc7b11d0f3e1165cb157426dcc",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
"date_modified": "2023-07-13T14:13:27.275000",
"record_locator": {
"page_id": "1605859",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605859",
"version": "2"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "c6fe156426f03a42912623025777f8c8",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -807,7 +933,7 @@
"type": "Title"
},
{
"element_id": "70182a5acbdac0041ee51b85dfca692f",
"element_id": "d8f7425068e3b4e6e99affa00d268060",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -834,7 +960,7 @@
"type": "NarrativeText"
},
{
"element_id": "243fc77b8eebdbcf00a6a108a8159b69",
"element_id": "e4589df20d851e29530dbf5f97444eca",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -855,7 +981,7 @@
"type": "Title"
},
{
"element_id": "3014e5236eb14590a7c13e83c36b20ce",
"element_id": "37a3e4a1755417a6944ff64115257147",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -882,7 +1008,7 @@
"type": "NarrativeText"
},
{
"element_id": "800f984e0d3456624dce9630abfd873a",
"element_id": "e26ff7fd8e8e12c8aa704e6f97275fbf",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",
@ -903,7 +1029,7 @@
"type": "Title"
},
{
"element_id": "800885acdda14ccb63621293f9a3aa2f",
"element_id": "18220fb2182492f64b3504513de4fbef",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:54:40.304000",

View File

@ -308,7 +308,29 @@
"type": "UncategorizedText"
},
{
"element_id": "d0b45e375f3a7207caacb7be289ebd62",
"element_id": "f8085d2948c73dfb968f7b221f3e8fab",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
"date_modified": "2023-07-09T12:57:59.173000",
"record_locator": {
"page_id": "1605989",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng",
"fra"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "48d494bb12fd182b0106bff99dd2e3be",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -330,7 +352,7 @@
"type": "UncategorizedText"
},
{
"element_id": "351fc6ff4a9a491bf863ed7aa20fd5c5",
"element_id": "3f1b3ecb6515a47b94579cf7de892f09",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -352,7 +374,7 @@
"type": "UncategorizedText"
},
{
"element_id": "6688bffe9c19dca7cb61ee039a6ffa10",
"element_id": "171423f703a966d2616837ed489f6975",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -374,7 +396,7 @@
"type": "UncategorizedText"
},
{
"element_id": "e50d0b83f51c65bda0620ccec0368a41",
"element_id": "87daeeb71306ae76a90c0e6ccac0dd47",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -396,7 +418,7 @@
"type": "UncategorizedText"
},
{
"element_id": "91697f192743d0583d02cb3e232d3c83",
"element_id": "c4d15cc61c5d6a3f2350f758b82e487f",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -418,7 +440,7 @@
"type": "UncategorizedText"
},
{
"element_id": "7e1f204c284d5e878639feca87a022c4",
"element_id": "e069a6333ef83f6f250880a500439da3",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -440,7 +462,7 @@
"type": "UncategorizedText"
},
{
"element_id": "24fc2762132dbbf33824a2c8575f6c14",
"element_id": "2f030590e85c72dd4a2fc739cc05affe",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -462,7 +484,7 @@
"type": "UncategorizedText"
},
{
"element_id": "c8d91bf0f74cf2d7474b81fa319cc0e5",
"element_id": "2db738cf60bf0471df90b6141fc6a8e5",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -490,7 +512,7 @@
"type": "NarrativeText"
},
{
"element_id": "ce274d7699a4270e902e3617c7cf6e36",
"element_id": "ab3005ca05b4f48396361646916154b4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -512,7 +534,7 @@
"type": "ListItem"
},
{
"element_id": "b54b171c49bbdb6f51308ff765b7f121",
"element_id": "ac70c0a823f0a1d56777036e77e77fd9",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -534,7 +556,7 @@
"type": "ListItem"
},
{
"element_id": "712a4752864712c0ec58730edb76b2f1",
"element_id": "099fca1cec6f3eaa5f71ed9c2ed235e4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -556,7 +578,7 @@
"type": "ListItem"
},
{
"element_id": "f2e9daed509db420ecf36984f431900f",
"element_id": "85d4a299ad3ee61201530bf0030808b1",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -578,7 +600,7 @@
"type": "UncategorizedText"
},
{
"element_id": "3cb5f4888419631affdc50af8f020348",
"element_id": "a1a4f27d3b3cc32777e25b3bb0766083",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -606,7 +628,7 @@
"type": "NarrativeText"
},
{
"element_id": "9f5d86ea05eeb0bf570d9141a5b8994b",
"element_id": "976749a5d532d1f18195d61fe8c04be3",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -634,7 +656,7 @@
"type": "UncategorizedText"
},
{
"element_id": "64ba474681b32c7dbc2a00fb9ec3e757",
"element_id": "079d83c4a7622c70baab0336e3128ec4",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -656,7 +678,7 @@
"type": "Title"
},
{
"element_id": "2d4a5727cd260bb321af0f777e2e699f",
"element_id": "68e58e6fec19f4ec291fd5bcca3dadd8",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -678,7 +700,7 @@
"type": "Title"
},
{
"element_id": "0a5dc104636145b04136d2eee7c4469b",
"element_id": "8dbaefc9dcf7af80f14871cdd2b0c1d6",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -700,7 +722,7 @@
"type": "Title"
},
{
"element_id": "470c624d2702678b94685d916908136a",
"element_id": "2ca9400f81b794c7d595f9bcd719b18a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -722,7 +744,7 @@
"type": "Title"
},
{
"element_id": "bd8ca6285c9717384bd456685550304b",
"element_id": "191672694e36e804e20214e8c9bd2d44",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -744,7 +766,7 @@
"type": "Title"
},
{
"element_id": "99613e61c98f3e61b28c98d096524077",
"element_id": "10d98c7ebc4b2cd6e5508aa9563cc788",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
@ -765,5 +787,27 @@
},
"text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2",
"type": "Table"
},
{
"element_id": "45b142b9287e8ebadbc1dfb48ee5245a",
"metadata": {
"data_source": {
"date_created": "2023-07-09T12:56:40.842000",
"date_modified": "2023-07-09T12:57:59.173000",
"record_locator": {
"page_id": "1605989",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng",
"fra"
]
},
"text": "",
"type": "Image"
}
]

View File

@ -308,7 +308,29 @@
"type": "UncategorizedText"
},
{
"element_id": "a931e049fc3bd99cf74ef09502a71938",
"element_id": "61525bb0c406b033be2849cb375e9fcd",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
"date_modified": "2023-07-11T17:01:47.340000",
"record_locator": {
"page_id": "1802252",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1802252",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng",
"fra"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "7e8cf6622ad36f28966178e194feaad8",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -330,7 +352,7 @@
"type": "UncategorizedText"
},
{
"element_id": "59e566b7776eba69071658b586226bd0",
"element_id": "eeb9717b5c634b6bb4b4bb1b83101500",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -352,7 +374,7 @@
"type": "UncategorizedText"
},
{
"element_id": "971bd18c2de3ef14a26ba1d8e4ef8668",
"element_id": "c5bc1b1b90fd002ca136bc3d76e3d482",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -374,7 +396,7 @@
"type": "UncategorizedText"
},
{
"element_id": "c688b4f7d2e49c8d8d7c77d28ddf5ecc",
"element_id": "9ecb8d523cdd75be94294fa2ca440799",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -396,7 +418,7 @@
"type": "UncategorizedText"
},
{
"element_id": "9ce074ac38046f414a5f16cd9c7308b3",
"element_id": "4909ea47e524d3b8fbac470c0663f589",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -418,7 +440,7 @@
"type": "UncategorizedText"
},
{
"element_id": "a4fbf964d1efe50e1c1ee181b453d4d6",
"element_id": "ab916d1a2aa844a463b48a24637a8b2d",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -440,7 +462,7 @@
"type": "UncategorizedText"
},
{
"element_id": "b7c108f30be7dfb550213536c197e563",
"element_id": "a9060bb59dfa0b37c1ec13e97228470a",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -462,7 +484,7 @@
"type": "UncategorizedText"
},
{
"element_id": "9e78d28a1e5c130197f6a909ec74c987",
"element_id": "0dbff111f50ea5a58eee83c85a1c30b5",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -490,7 +512,7 @@
"type": "NarrativeText"
},
{
"element_id": "d8ae65b075a2f46c394461d4e393f0d5",
"element_id": "ed95477f3bff4586983201b6387c875b",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -512,7 +534,7 @@
"type": "ListItem"
},
{
"element_id": "7f3784563903fdf80ca26e027ca7376d",
"element_id": "260cee6fb6f1562e7b1cb0f7644ea64d",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -534,7 +556,7 @@
"type": "ListItem"
},
{
"element_id": "81f723fb10893947353084829f8b5f68",
"element_id": "6820bf233e6604b7bd6750d0d2b62192",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -556,7 +578,7 @@
"type": "ListItem"
},
{
"element_id": "a28747bf65c9c6ad4981e57ec35822a3",
"element_id": "8f13efbe5d5b289c6ce8eb00e2b5fae2",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -578,7 +600,7 @@
"type": "UncategorizedText"
},
{
"element_id": "ec5dbc92af9cfee5f32dba0e9919b1f7",
"element_id": "ecc13666f56ab3fb01917335016cd9c3",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -606,7 +628,7 @@
"type": "NarrativeText"
},
{
"element_id": "f09110aa418d33cbaccc7b380e0fe0c6",
"element_id": "93ec1210a0bf6e8b0c6c8504648e7489",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -634,7 +656,7 @@
"type": "UncategorizedText"
},
{
"element_id": "fa11e4585afb53a4d046e095f08ac084",
"element_id": "f3c0e02138cb55302a075d5508843876",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -656,7 +678,7 @@
"type": "Title"
},
{
"element_id": "2f06add07bf5f930085d334e1d1fdb6c",
"element_id": "5bc45f2fc513158f644f2c217cc9e54d",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -678,7 +700,7 @@
"type": "Title"
},
{
"element_id": "0c493dc4e25a1447702be5bd7d8a156f",
"element_id": "81bebaf32dff5511a7856e553b526fa3",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -700,7 +722,7 @@
"type": "Title"
},
{
"element_id": "dcf629a3cf73037815d0b85bf9878bd5",
"element_id": "833cc590b6815b09fd40d1d73a752420",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -722,7 +744,7 @@
"type": "Title"
},
{
"element_id": "31d39e1ce259ec5bc37463b03c993697",
"element_id": "bff43fdce7563ccbd5cf3354090e8cf3",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -744,7 +766,7 @@
"type": "Title"
},
{
"element_id": "8083af07d9148f975b439cdb91a216cf",
"element_id": "69b2cf7ade2f1034892b2b38b186fdaa",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
@ -765,5 +787,27 @@
},
"text": "Testdoc2 Table: Column 1 Row 0 Testdoc2 Table: Column 2 Row 0 Testdoc2 Table: Column 3 Row 0 Testdoc2 Table: Column 1 Row 1 Testdoc2 Table: Column 2 Row 1 Testdoc2 Table: Column 3 Row 1 Testdoc2 Table: Column 1 Row 2 Testdoc2 Table: Column 2 Row 2 Testdoc2 Table: Column 3 Row 2",
"type": "Table"
},
{
"element_id": "825a8cbb41eecc2f1b29d4b34cb05c2f",
"metadata": {
"data_source": {
"date_created": "2023-07-11T17:01:39.240000",
"date_modified": "2023-07-11T17:01:47.340000",
"record_locator": {
"page_id": "1802252",
"url": "https://unstructured-ingest-test.atlassian.net"
},
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1802252",
"version": "1"
},
"filetype": "text/html",
"languages": [
"eng",
"fra"
]
},
"text": "",
"type": "Image"
}
]

View File

@ -538,7 +538,22 @@
"type": "NarrativeText"
},
{
"element_id": "c087a92c7251ca836ff023d35cb0a1aa",
"element_id": "387c4d334f8e9650a56b3b444b2ad5f6",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
"date_modified": "2023-08-17T18:48:00.000Z"
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "",
"type": "Image"
},
{
"element_id": "60d9f47b086264ea72277b741e3b2bdd",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
@ -553,7 +568,7 @@
"type": "UncategorizedText"
},
{
"element_id": "3126a68fa0a12481ca6dc64c16511a7e",
"element_id": "b39f61345657ccc5e201c20a6a90fad7",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
@ -568,7 +583,7 @@
"type": "UncategorizedText"
},
{
"element_id": "8cfa5b216c8d3f774f8e1def029681e6",
"element_id": "b95452fe8c6616a1ce1311457526c302",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
@ -583,7 +598,7 @@
"type": "UncategorizedText"
},
{
"element_id": "b538abdbf0aff3f9f1ab11d79bb5bc26",
"element_id": "a7c3ee9360b2020e28aa31835ef5283c",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",
@ -598,7 +613,7 @@
"type": "NarrativeText"
},
{
"element_id": "570c50d8758c5639a1dfd0f238f609d5",
"element_id": "349f058fcce7e32bb68b620841f40c9e",
"metadata": {
"data_source": {
"date_created": "2023-08-04T18:31:00.000Z",

View File

@ -26,5 +26,33 @@
"date_modified": "1692628456.0"
}
}
},
{
"type": "Image",
"element_id": "f714fa214dac2f441515c4f28370d279",
"text": "",
"metadata": {
"languages": [
"eng"
],
"filetype": "message/rfc822",
"email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net",
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"jane_gray@uoa.edu"
],
"subject": "Test of email 1",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErPIAU"
},
"date_created": "1692542056.0",
"date_modified": "1692628456.0"
}
}
}
]

View File

@ -26,5 +26,33 @@
"date_modified": "1692542155.0"
}
}
},
{
"type": "Image",
"element_id": "68870d055535f48c7439ce67092768f6",
"text": "",
"metadata": {
"languages": [
"eng"
],
"filetype": "message/rfc822",
"email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net",
"sent_from": [
"devops+salesforce-connector@unstructured.io"
],
"sent_to": [
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",
"record_locator": {
"id": "02sHu00001efErQIAU"
},
"date_created": "1692542149.0",
"date_modified": "1692542155.0"
}
}
}
]

View File

@ -1 +1 @@
__version__ = "0.16.25" # pragma: no cover
__version__ = "0.16.26-dev1" # pragma: no cover

View File

@ -75,6 +75,7 @@ Other background
from __future__ import annotations
import re
from collections import defaultdict, deque
from types import MappingProxyType
from typing import Any, Iterable, Iterator, Mapping, NamedTuple, Sequence, cast
@ -89,6 +90,7 @@ from unstructured.documents.elements import (
Element,
ElementMetadata,
EmailAddress,
Image,
ListItem,
NarrativeText,
Table,
@ -477,6 +479,34 @@ class Pre(BlockItem):
return _PreElementAccumulator(self)
class ImageBlock(Flow):
    """Custom element-class for `<img>` elements.

    Generates at most one `Image` element per `<img>` tag. The image source is carried in the
    element metadata either as a URL or, when the source is a base64 `data:` URI, as the MIME-type
    and base64 payload.
    """

    # -- matches a base64 image `data:` URI: group 1 is the MIME-type, group 2 the payload --
    BASE64_IMAGE_REGEX = re.compile(r"^data:(image/[^;]+);base64,(.*)")

    def iter_elements(self) -> Iterator[Element]:
        """Generate an Image element based on `src`, `data-src`, and `alt`.

        The source is taken from `data-src` when that attribute is non-blank, otherwise from
        `src`. No element is generated when both are missing or blank. The stripped `alt`
        attribute (the empty string when absent) becomes the element text.
        """
        img_src = self.get("data-src", "").strip() or self.get("src", "").strip()
        if not img_src:  # -- no image source, nothing to emit --
            return

        img_alt = self.get("alt", "").strip()

        data_uri_match = self.BASE64_IMAGE_REGEX.match(img_src)
        if data_uri_match:
            # -- Decide on the match itself rather than on the truthiness of the payload: a data
            # -- URI with an empty payload is still not a URL and must not be reported as one.
            img_mime_type, img_base64 = data_uri_match.groups()
            img_url = None
        else:
            img_mime_type = None
            img_base64 = None
            img_url = img_src

        yield Image(
            text=img_alt,
            metadata=ElementMetadata(
                image_mime_type=img_mime_type,
                image_base64=img_base64,
                url=img_url,
            ),
        )
class TableBlock(Flow):
"""Custom element-class for `<table>` element."""
@ -928,6 +958,8 @@ element_class_lookup.get_namespace(None).update(
"ol": ListBlock,
"ul": ListBlock,
"li": ListItemBlock,
# -- image --
"img": ImageBlock,
# -- table --
"table": TableBlock,
# -- annotated phrasing --

View File

@ -10,7 +10,7 @@ import requests
from lxml import etree
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.documents.elements import Element, ElementType
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
@ -108,6 +108,8 @@ class HtmlPartitionerOptions:
detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
extract_image_block_types: Optional[list[str]] = None,
extract_image_block_to_payload: bool = False,
):
self._file_path = file_path
self._file = file
@ -120,6 +122,8 @@ class HtmlPartitionerOptions:
self._detection_origin = detection_origin
self._html_parser_version = html_parser_version
self._image_alt_mode = image_alt_mode
self._extract_image_block_types = extract_image_block_types
self._extract_image_block_to_payload = extract_image_block_to_payload
@lazyproperty
def detection_origin(self) -> str | None:
@ -183,6 +187,15 @@ class _HtmlPartitioner:
def __init__(self, opts: HtmlPartitionerOptions):
    """Store the partitioning options `opts` on this instance as `self._opts`."""
    self._opts = opts
def _should_include_image_base64(self, element: Element) -> bool:
    """True when `element` is an image whose `image_base64` metadata was requested.

    All of the following must hold: the element category is `ElementType.IMAGE`, the
    `extract_image_block_to_payload` option is set, and the `extract_image_block_types` option
    is a collection that contains "Image".
    """
    # -- only image elements are candidates --
    if element.category != ElementType.IMAGE:
        return False

    opts = self._opts
    # -- caller must have opted in to receiving the payload --
    if not opts._extract_image_block_to_payload:
        return False

    requested_block_types = opts._extract_image_block_types
    return requested_block_types is not None and "Image" in requested_block_types
@classmethod
def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]:
"""Partition HTML document provided by `opts` into document-elements."""
@ -202,6 +215,10 @@ class _HtmlPartitioner:
for e in elements_iter:
e.metadata.last_modified = self._opts.last_modified
e.metadata.detection_origin = self._opts.detection_origin
# -- remove <image_base64> if not requested --
if not self._should_include_image_base64(e):
e.metadata.image_base64 = None
yield e
@lazyproperty
@ -224,7 +241,7 @@ class _HtmlPartitioner:
# -- remove a variety of HTML element types like <script> and <style> that we prefer not
# -- to encounter while parsing.
etree.strip_elements(
root, ["del", "img", "link", "meta", "noscript", "script", "style"], with_tail=False
root, ["del", "link", "meta", "noscript", "script", "style"], with_tail=False
)
# -- remove <header> and <footer> tags if the caller doesn't want their contents --