Matt Robinson f4ddf53590
feat: track emphasized text in partition_html (#1034)
* Feat/965 track emphasized text html (#1021)

* feat: add functionality to track emphasized text (<strong>, <em>, <span>, <b>, <i> tags) in HTML

* feat: add `include_tail_text` parameter to `_construct_text`

* test: add test case for `_get_emphasized_texts_from_tag`

* test: add `emphasized_texts` to metadata

* chore: update changelog & version

* fix tests

* fix lint errors

* chore: update changelog

* chore: small comment updates

* feat: update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag

* chore: update changelog

* Update ingest test fixtures (#1026)

Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>

* ingest-test-fixtures-update

* Update ingest test fixtures (#1035)

Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-08-03 16:24:25 +00:00

456 lines
9.3 KiB
JSON
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

[
{
"type": "Title",
"element_id": "9fe4c68ec20dda7c6b1d3f760e5e6af6",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Driver",
"tag": "strong"
}
]
},
"text": "Driver"
},
{
"type": "Title",
"element_id": "3ebb5648c8bcb2934590555c69356e27",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Approver",
"tag": "strong"
}
]
},
"text": "Approver"
},
{
"type": "Title",
"element_id": "350ad433c42fe8cecdb38439f33947ea",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Contributors",
"tag": "strong"
}
]
},
"text": "Contributors"
},
{
"type": "Title",
"element_id": "31a717c19407f215d8bcd329fc82e646",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Informed",
"tag": "strong"
}
]
},
"text": "Informed"
},
{
"type": "Title",
"element_id": "3b20adc3b2ce1c15ea6880c3151baabe",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Objective",
"tag": "strong"
}
]
},
"text": "Objective"
},
{
"type": "Title",
"element_id": "e1cb6d30fa3f17ee1e50b2bcf1967374",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Due date",
"tag": "strong"
}
]
},
"text": "Due date"
},
{
"type": "Title",
"element_id": "80f5b18f225fca5e493dc48e4e60e8c7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Key outcomes",
"tag": "strong"
}
]
},
"text": "Key outcomes"
},
{
"type": "Title",
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Status",
"tag": "strong"
}
]
},
"text": "Status"
},
{
"type": "Title",
"element_id": "a54416fced47600988250cacdb064691",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "NOT STARTED",
"tag": "span"
},
{
"text": "IN PROGRESS",
"tag": "span"
},
{
"text": "COMPLETE",
"tag": "span"
}
]
},
"text": "NOT STARTED / IN PROGRESS / COMPLETE"
},
{
"type": "Title",
"element_id": "4e2022d4483a407d85060675f64fbe17",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "\\uD83E\\uDD14 Problem Statement"
},
{
"type": "Title",
"element_id": "81163675915a75217e4116686fdca412",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "🎯 Scope"
},
{
"type": "NarrativeText",
"element_id": "0e5c4ed000097332e1e1b29a96fefd56",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Must have:",
"tag": "strong"
}
]
},
"text": "Must have:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "NarrativeText",
"element_id": "d29e06627b1fec1ecf65bce63fc5fda5",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Nice to have:",
"tag": "strong"
}
]
},
"text": "Nice to have:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "7f999c0456e4e85cc028aa6ed90455d4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Not in scope:",
"tag": "strong"
}
]
},
"text": "Not in scope:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "e8b61a28d07e977379b42df455a1cde4",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "\\uD83D\\uDDD3 Timeline"
},
{
"type": "Title",
"element_id": "5043f71fbc70e35c0be413d4135be99f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Lane 1"
},
{
"type": "Title",
"element_id": "d5a2e177c588bf0c4f914baa4fae85b6",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Lane 2"
},
{
"type": "Title",
"element_id": "c98ba1acbd22a15ddddfc244cbd8a2db",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Feature 1"
},
{
"type": "Title",
"element_id": "e04620c8b3b611b3fefecef89baa63a9",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Feature 2"
},
{
"type": "Title",
"element_id": "82e522a86692cc50ee5c020c8e6ce6a0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Feature 3"
},
{
"type": "Title",
"element_id": "822f7c45ea725c535970aab819a8ff10",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Feature 4"
},
{
"type": "Title",
"element_id": "6e0f6eca4ff17d3377c1c3e8e1f73457",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "iOS app"
},
{
"type": "Title",
"element_id": "0b60fe04b3c5c3c76371b6eca8b19c8e",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Android app"
},
{
"type": "Title",
"element_id": "e1cc184f345d146586fb12527c4fa696",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "\\uD83D\\uDEA9 Milestones and deadlines"
},
{
"type": "Title",
"element_id": "9e86248cf2351e388065b80307b7ac00",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Milestone",
"tag": "strong"
}
]
},
"text": "Milestone"
},
{
"type": "Title",
"element_id": "4b1b8aa3608a26da451ae0630d75b60a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Owner",
"tag": "strong"
}
]
},
"text": "Owner"
},
{
"type": "Title",
"element_id": "6fcb38ddc858fc8592e4f693d04a2ed1",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Deadline",
"tag": "strong"
}
]
},
"text": "Deadline"
},
{
"type": "Title",
"element_id": "920e413c7d411b61ef3e8c63b1cb6ad0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Status",
"tag": "strong"
}
]
},
"text": "Status"
},
{
"type": "Title",
"element_id": "890c9b6d8d69ca1de5fd7a8b83fe78ff",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "\\uD83D\\uDD17 Reference materials"
}
]