Matt Robinson f4ddf53590
feat: track emphasized text in partition_html (#1034)
* Feat/965 track emphasized text html (#1021)

* feat: add functionality to track emphasized text (<strong>, <em>, <span>, <b>, <i> tags) in HTML

* feat: add `include_tail_text` parameter to `_construct_text`

* test: add test case for `_get_emphasized_texts_from_tag`

* test: add `emphasized_texts` to metadata

* chore: update changelog & version

* fix tests

* fix lint errors

* chore: update changelog

* chore: small comment updates

* feat: update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag

* chore: update changelog

* Update ingest test fixtures (#1026)

Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>

* ingest-test-fixtures-update

* Update ingest test fixtures (#1035)

Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>

---------

Co-authored-by: Christine Straub <christinemstraub@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-08-03 16:24:25 +00:00

172 lines
3.4 KiB
JSON

[
{
"type": "Title",
"element_id": "307afee17dac4c598e361c095338decd",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Copy and paste this section for each week."
},
{
"type": "Title",
"element_id": "b980a145c5e8c9e233a0643366ba520a",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Win",
"tag": "strong"
}
]
},
"text": "Win"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "aecc044c7725a6555114285dc28fe2d1",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Needs input",
"tag": "strong"
}
]
},
"text": "Needs input"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "9d3cab2b5efed4eaef42a707dbc813da",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Focus",
"tag": "strong"
}
]
},
"text": "Focus"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Notes",
"tag": "strong"
}
]
},
"text": "Notes"
},
{
"type": "Title",
"element_id": "98e38cd6c5f88330322de759657563f9",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "Important Links",
"tag": "strong"
}
]
},
"text": "Important Links"
}
]