mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-23 09:00:40 +00:00

* Feat/965 track emphasized text html (#1021) * feat: add functionality to track emphasized text (<strong>, <em>, <span>, <b>, <i> tags) in HTML * feat: add `include_tail_text` parameter to `_construct_text` * test: add test case for `_get_emphasized_texts_from_tag` * test: add `emphasized_texts` to metadata * chore: update changelog & version * fix tests * fix lint errors * chore: update changelog * chore: small comment updates * feat: update `XMLDocument._read_xml` to create `<p>` tag element for the text enclosed in the `<pre>` tag * chore: update changelog * Update ingest test fixtures (#1026) Co-authored-by: christinestraub <christinestraub@users.noreply.github.com> --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com> Co-authored-by: Matt Robinson <mrobinson@unstructured.io> * ingest-test-fixtures-update * Update ingest test fixtures (#1035) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> --------- Co-authored-by: Christine Straub <christinemstraub@gmail.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
176 lines
3.6 KiB
JSON
176 lines
3.6 KiB
JSON
[
|
||
{
|
||
"type": "Title",
|
||
"element_id": "35054d4d1455c734e83a868656b4ad16",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Date"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "0126c1353ddd7c8dfdb29f252a64a344",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDC65 Participants"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "fa64ff027cbc0c6929bc75d3c78c94c3",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83E\\uDD45 Goals"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "537ea1b14dcba1742bdbd4a5fbfb488c",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDE3 Discussion topics"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "33b93476cf597a3330653b66a658983d",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1,
|
||
"emphasized_texts": [
|
||
{
|
||
"text": "Time",
|
||
"tag": "strong"
|
||
}
|
||
]
|
||
},
|
||
"text": "Time"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "652bcc3a478428893cc505ae19f847b4",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1,
|
||
"emphasized_texts": [
|
||
{
|
||
"text": "Item",
|
||
"tag": "strong"
|
||
}
|
||
]
|
||
},
|
||
"text": "Item"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "9ef077a1231ea3b71df182b87db1cb7f",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1,
|
||
"emphasized_texts": [
|
||
{
|
||
"text": "Presenter",
|
||
"tag": "strong"
|
||
}
|
||
]
|
||
},
|
||
"text": "Presenter"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "8a7525b1492fb84833f5c4a69b30f4bf",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1,
|
||
"emphasized_texts": [
|
||
{
|
||
"text": "Notes",
|
||
"tag": "strong"
|
||
}
|
||
]
|
||
},
|
||
"text": "Notes"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "✅ Action items"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "addb0aa08f77b69fa754ba55c6600c8a",
|
||
"metadata": {
|
||
"data_source": {},
|
||
"filetype": "text/html",
|
||
"page_number": 1
|
||
},
|
||
"text": "⤴ Decisions"
|
||
}
|
||
] |