Matt Robinson f4ddf53590
feat: track emphasized text in partition_html (#1034)
* Feat/965 track emphasized text html (#1021)

* feat: add functionality to track emphasized text (<strong>, <em>, <span>, <b>, <i> tags) in HTML

* feat: add `include_tail_text` parameter to `_construct_text`

* test: add test case for `_get_emphasized_texts_from_tag`

* test: add `emphasized_texts` to metadata

* chore: update changelog & version

* fix tests

* fix lint errors

* chore: update changelog

* chore: small comment updates

* feat: update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag

* chore: update changelog

* Update ingest test fixtures (#1026)

Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>

* ingest-test-fixtures-update

* Update ingest test fixtures (#1035)

Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-08-03 16:24:25 +00:00

492 lines
12 KiB
JSON

[
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "Title",
"element_id": "e7f29a76338c0f4d24e30d18e6f336ee",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2 testtext2"
},
{
"type": "ListItem",
"element_id": "83cd16582f7c6143822c0954f7f00350",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 List Item 1Testdoc2 List Item 1 Nested Item ATestdoc2 List Item 1 Nested Item B"
},
{
"type": "ListItem",
"element_id": "4e97e8a96031986042c3bec526dd273f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 List Item 2"
},
{
"type": "ListItem",
"element_id": "629e4dee8b4acc5b39782b4d012ab83c",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 List Item 3"
},
{
"type": "ListItem",
"element_id": "d177691364d515d6eaa385205a64664c",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 List Item 4"
},
{
"type": "ListItem",
"element_id": "f7a3306959cba883aca00f29ca138b6a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 List Item 5"
},
{
"type": "NarrativeText",
"element_id": "3d15d9222ffb2770ceede5b6532e842a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "This is the link for unstructured . io.",
"url": "https://www.unstructured.io/"
}
]
},
"text": "This is the link for unstructured . io."
},
{
"type": "ListItem",
"element_id": "fcb6283714c2b7640835964291e59ebd",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 1",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 1"
},
{
"type": "ListItem",
"element_id": "899f082b917d5e6e380f15705db5923a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 2 (checked)",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 2 (checked)"
},
{
"type": "ListItem",
"element_id": "07005b9f1774ceabf978b2cc8afb7183",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Checklist Item 3",
"tag": "span"
}
]
},
"text": "Testdoc2 Checklist Item 3"
},
{
"type": "UncategorizedText",
"element_id": "86e0e9ce51725074a3dc41ed3f10c13c",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃"
},
{
"type": "NarrativeText",
"element_id": "72fa9ecafbf0df76a3b307485ce4c98b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 bold text",
"tag": "strong"
}
]
},
"text": "Testdoc2 bold text"
},
{
"type": "Title",
"element_id": "552ed5322965c5e1e8b235b0373f2470",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 italic text",
"tag": "em"
}
]
},
"text": "Testdoc2 italic text"
},
{
"type": "Title",
"element_id": "6e11265369ab068c620fc2a2b7b858f5",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 Heading 1 Sized Text"
},
{
"type": "Title",
"element_id": "4da0847af03e9440530539d6da679e85",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 Heading 2 Sized Text"
},
{
"type": "Title",
"element_id": "2003a49a54c45aae0cd7640f47ea3f41",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 Heading 3 Sized Text"
},
{
"type": "Title",
"element_id": "ab09ddfa08278cd006e9ea2201c8989d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 Heading 4 Sized Text"
},
{
"type": "Title",
"element_id": "4ad754b028bec050fbff5ce2f50c60d1",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Testdoc2 Heading 5 Sized Text"
},
{
"type": "Title",
"element_id": "7aa138ab1f6ef154504c3d8ade2fd1a0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 0"
},
{
"type": "Title",
"element_id": "b40b0fee79c609772c958caa07bd47a8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 0"
},
{
"type": "Title",
"element_id": "cc59bb6025ceae34c2b9c9d7cdbfbcf9",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 0",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 0"
},
{
"type": "Title",
"element_id": "3cb373750d4e46b4bbc980dd0d74321e",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 1"
},
{
"type": "Title",
"element_id": "219a8d1fc742fb75b2481a0a75c77a3b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 1"
},
{
"type": "Title",
"element_id": "07a1ad32c97f3669f88014ee5942f616",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 1",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 1"
},
{
"type": "Title",
"element_id": "17228bddb06b739951fab2ab04c09ea8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 1 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 1 Row 2"
},
{
"type": "Title",
"element_id": "4ad7ae00fff8c8a3f903864d037cf86e",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 2 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 2 Row 2"
},
{
"type": "Title",
"element_id": "f2701095922247ecafbbd3fe31d585bf",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Testdoc2 Table: Column 3 Row 2",
"tag": "strong"
}
]
},
"text": "Testdoc2 Table: Column 3 Row 2"
}
]