Feat/1060 update metadata fields (#1099)

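Closes GitHub issue #1060. * Replace the `links` metadata field (a list of `{"text", "url"}` dicts) with two parallel lists, `link_urls` and `link_texts` * Replace the `emphasized_texts` metadata field (a list of `{"text", "tag"}` dicts) with two parallel lists, `emphasized_text_contents` and `emphasized_text_tags`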
2025-12-26 14:45:31 +00:00 · 2023-08-15 21:33:06 -07:00 · 2023-08-15 21:33:06 -07:00 · 0e887cc36b
commit 0e887cc36b
parent fe5048a834
21 changed files with 536 additions and 633 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,7 +1,9 @@
-## 0.9.4-dev0
+## 0.10.0

 ### Enhancements

+* Update the `links` and `emphasized_texts` metadata fields
+
 ### Features

 ### Fixes
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -16,6 +16,7 @@ from unstructured.documents.elements import (
 )
 from unstructured.partition.doc import partition_doc
 from unstructured.partition.docx import (
+    _extract_contents_and_tags,
    _get_emphasized_texts_from_paragraph,
    _get_emphasized_texts_from_table,
    partition_docx,
@ -63,6 +64,26 @@ def expected_elements():
    ]


+@pytest.fixture()
+def expected_emphasized_texts():
+    return [
+        {"text": "bold", "tag": "b"},
+        {"text": "italic", "tag": "i"},
+        {"text": "bold-italic", "tag": "b"},
+        {"text": "bold-italic", "tag": "i"},
+    ]
+
+
+@pytest.fixture()
+def expected_emphasized_text_contents():
+    return ["bold", "italic", "bold-italic", "bold-italic"]
+
+
+@pytest.fixture()
+def expected_emphasized_text_tags():
+    return ["b", "i", "b", "i"]
+
+
 def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir):
    filename = os.path.join(tmpdir.dirname, "mock_document.docx")
    mock_document.save(filename)
@ -293,19 +314,14 @@ def test_partition_docx_from_file_without_metadata_date(


 def test_get_emphasized_texts_from_paragraph(
+    expected_emphasized_texts,
    filename="example-docs/fake-doc-emphasized-text.docx",
 ):
-    expected = [
-        {"text": "bold", "tag": "b"},
-        {"text": "italic", "tag": "i"},
-        {"text": "bold-italic", "tag": "b"},
-        {"text": "bold-italic", "tag": "i"},
-    ]
    document = docx.Document(filename)
    paragraph = document.paragraphs[1]
    emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
    assert paragraph.text == "I am a bold italic bold-italic text."
-    assert emphasized_texts == expected
+    assert emphasized_texts == expected_emphasized_texts

    paragraph = document.paragraphs[2]
    emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
@ -319,18 +335,29 @@ def test_get_emphasized_texts_from_paragraph(


 def test_get_emphasized_texts_from_table(
+    expected_emphasized_texts,
    filename="example-docs/fake-doc-emphasized-text.docx",
 ):
-    expected = [
-        {"text": "bold", "tag": "b"},
-        {"text": "italic", "tag": "i"},
-        {"text": "bold-italic", "tag": "b"},
-        {"text": "bold-italic", "tag": "i"},
-    ]
    document = docx.Document(filename)
    table = document.tables[0]
    emphasized_texts = _get_emphasized_texts_from_table(table)
-    assert emphasized_texts == expected
+    assert emphasized_texts == expected_emphasized_texts
+
+
+def test_extract_contents_and_tags(
+    expected_emphasized_texts,
+    expected_emphasized_text_contents,
+    expected_emphasized_text_tags,
+):
+    emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
+        expected_emphasized_texts,
+    )
+    assert emphasized_text_contents == expected_emphasized_text_contents
+    assert emphasized_text_tags == expected_emphasized_text_tags
+
+    emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([])
+    assert emphasized_text_contents is None
+    assert emphasized_text_tags is None


@pytest.mark.parametrize(
@ -340,24 +367,22 @@ def test_get_emphasized_texts_from_table(
        ("fake-doc-emphasized-text.doc", partition_doc),
    ],
 )
-def test_partition_docx_grabs_emphasized_texts(filename, partition_func):
+def test_partition_docx_grabs_emphasized_texts(
+    filename,
+    partition_func,
+    expected_emphasized_text_contents,
+    expected_emphasized_text_tags,
+):
    elements = partition_func(filename=f"example-docs/{filename}")

    assert isinstance(elements[0], Table)
-    assert elements[0].metadata.emphasized_texts == [
-        {"text": "bold", "tag": "b"},
-        {"text": "italic", "tag": "i"},
-        {"text": "bold-italic", "tag": "b"},
-        {"text": "bold-italic", "tag": "i"},
-    ]
+    assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
+    assert elements[0].metadata.emphasized_text_tags == expected_emphasized_text_tags

    assert elements[1] == NarrativeText("I am a bold italic bold-italic text.")
-    assert elements[1].metadata.emphasized_texts == [
-        {"text": "bold", "tag": "b"},
-        {"text": "italic", "tag": "i"},
-        {"text": "bold-italic", "tag": "b"},
-        {"text": "bold-italic", "tag": "i"},
-    ]
+    assert elements[1].metadata.emphasized_text_contents == expected_emphasized_text_contents
+    assert elements[1].metadata.emphasized_text_tags == expected_emphasized_text_tags

    assert elements[2] == NarrativeText("I am a normal text.")
-    assert elements[2].metadata.emphasized_texts is None
+    assert elements[2].metadata.emphasized_text_contents is None
+    assert elements[2].metadata.emphasized_text_tags is None
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@ -455,34 +455,24 @@ def test_partition_html_grabs_links():
    elements = partition_html(text=html_text)

    assert elements[0] == NarrativeText("Hello there I am a very important link!")
-    assert elements[0].metadata.links == [
-        {
-            "text": "very important link!",
-            "url": "/link",
-        },
-    ]
+    assert elements[0].metadata.link_urls == ["/link"]
+    assert elements[0].metadata.link_texts == ["very important link!"]

    assert elements[1] == NarrativeText("Here is a list of my favorite things")
-    assert elements[1].metadata.links is None
+    assert elements[1].metadata.link_urls is None
+    assert elements[1].metadata.link_texts is None

    assert elements[2] == ListItem("Parrots")
-    assert elements[2].metadata.links == [
-        {
-            "text": "Parrots",
-            "url": "https://en.wikipedia.org/wiki/Parrot",
-        },
-    ]
+    assert elements[2].metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
+    assert elements[2].metadata.link_texts == ["Parrots"]

    assert elements[3] == ListItem("Dogs")
-    assert elements[3].metadata.links is None
+    assert elements[3].metadata.link_urls is None
+    assert elements[3].metadata.link_texts is None

    assert elements[4] == Title("A lone link!")
-    assert elements[4].metadata.links == [
-        {
-            "text": "A lone link!",
-            "url": "/loner",
-        },
-    ]
+    assert elements[4].metadata.link_urls == ["/loner"]
+    assert elements[4].metadata.link_texts == ["A lone link!"]


 def test_partition_html_from_filename_with_skip_headers_and_footers(
@ -570,26 +560,25 @@ def test_partition_html_grabs_emphasized_texts():
    elements = partition_html(text=html_text)

    assert elements[0] == NarrativeText("Hello there I am a very important text!")
-    assert elements[0].metadata.emphasized_texts == [
-        {"text": "important", "tag": "strong"},
-    ]
+    assert elements[0].metadata.emphasized_text_contents == ["important"]
+    assert elements[0].metadata.emphasized_text_tags == ["strong"]

    assert elements[1] == NarrativeText("Here is a list of my favorite things")
-    assert elements[1].metadata.emphasized_texts == [
-        {"text": "list", "tag": "span"},
-        {"text": "my favorite things", "tag": "b"},
-        {"text": "favorite", "tag": "i"},
+    assert elements[1].metadata.emphasized_text_contents == [
+        "list",
+        "my favorite things",
+        "favorite",
    ]
+    assert elements[1].metadata.emphasized_text_tags == ["span", "b", "i"]

    assert elements[2] == ListItem("Parrots")
-    assert elements[2].metadata.emphasized_texts == [
-        {"text": "Parrots", "tag": "em"},
-    ]
+    assert elements[2].metadata.emphasized_text_contents == ["Parrots"]
+    assert elements[2].metadata.emphasized_text_tags == ["em"]

    assert elements[3] == ListItem("Dogs")
-    assert elements[3].metadata.emphasized_texts is None
+    assert elements[3].metadata.emphasized_text_contents is None
+    assert elements[3].metadata.emphasized_text_tags is None

    assert elements[4] == Title("A lone span text!")
-    assert elements[4].metadata.emphasized_texts == [
-        {"text": "A lone span text!", "tag": "span"},
-    ]
+    assert elements[4].metadata.emphasized_text_contents == ["A lone span text!"]
+    assert elements[4].metadata.emphasized_text_tags == ["span"]
--- a/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
+++ b/test_unstructured_ingest/expected-structured-output/Sharepoint/Shared
@ -15,21 +15,19 @@
      "filename": "ideas-page.html",
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json
@ -36,11 +36,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Weather.gov",
-          "url": "https://www.weather.gov"
-        }
+      "link_urls": [
+        "https://www.weather.gov"
+      ],
+      "link_texts": [
+        "Weather.gov"
      ]
    },
    "text": "Weather.gov >"
@ -52,11 +52,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "News Around NOAA",
-          "url": "https://www.weather.gov/news"
-        }
+      "link_urls": [
+        "https://www.weather.gov/news"
+      ],
+      "link_texts": [
+        "News Around NOAA"
      ]
    },
    "text": "News Around NOAA > Are You Weather-Ready for the Spring?"
@ -68,91 +68,51 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Weather Safety",
-          "url": "http://www.weather.gov/safetycampaign"
-        },
-        {
-          "text": "Air Quality",
-          "url": "https://www.weather.gov/safety/airquality"
-        },
-        {
-          "text": "Beach Hazards",
-          "url": "https://www.weather.gov/safety/beachhazards"
-        },
-        {
-          "text": "Cold",
-          "url": "https://www.weather.gov/safety/cold"
-        },
-        {
-          "text": "Cold Water",
-          "url": "https://www.weather.gov/safety/coldwater"
-        },
-        {
-          "text": "Drought",
-          "url": "https://www.weather.gov/safety/drought"
-        },
-        {
-          "text": "Floods",
-          "url": "https://www.weather.gov/safety/flood"
-        },
-        {
-          "text": "Fog",
-          "url": "https://www.weather.gov/safety/fog"
-        },
-        {
-          "text": "Heat",
-          "url": "https://www.weather.gov/safety/heat"
-        },
-        {
-          "text": " Hurricanes",
-          "url": "https://www.weather.gov/safety/hurricane"
-        },
-        {
-          "text": " Lightning Safety",
-          "url": "https://www.weather.gov/safety/lightning"
-        },
-        {
-          "text": "Rip Currents",
-          "url": "https://www.weather.gov/safety/ripcurrent"
-        },
-        {
-          "text": "Safe Boating",
-          "url": "https://www.weather.gov/safety/safeboating"
-        },
-        {
-          "text": "Space Weather",
-          "url": "https://www.weather.gov/safety/space"
-        },
-        {
-          "text": "Sun (Ultraviolet Radiation)",
-          "url": "https://www.weather.gov/safety/heat-uv"
-        },
-        {
-          "text": " Thunderstorms & Tornadoes",
-          "url": "https://www.weather.gov/safety/thunderstorm"
-        },
-        {
-          "text": "Tornado",
-          "url": "https://www.weather.gov/safety/tornado"
-        },
-        {
-          "text": "Tsunami",
-          "url": "https://www.weather.gov/safety/tsunami"
-        },
-        {
-          "text": "Wildfire",
-          "url": "https://www.weather.gov/safety/wildfire"
-        },
-        {
-          "text": "Wind",
-          "url": "https://www.weather.gov/safety/wind"
-        },
-        {
-          "text": "Winter",
-          "url": "https://www.weather.gov/safety/winter "
-        }
+      "link_urls": [
+        "http://www.weather.gov/safetycampaign",
+        "https://www.weather.gov/safety/airquality",
+        "https://www.weather.gov/safety/beachhazards",
+        "https://www.weather.gov/safety/cold",
+        "https://www.weather.gov/safety/coldwater",
+        "https://www.weather.gov/safety/drought",
+        "https://www.weather.gov/safety/flood",
+        "https://www.weather.gov/safety/fog",
+        "https://www.weather.gov/safety/heat",
+        "https://www.weather.gov/safety/hurricane",
+        "https://www.weather.gov/safety/lightning",
+        "https://www.weather.gov/safety/ripcurrent",
+        "https://www.weather.gov/safety/safeboating",
+        "https://www.weather.gov/safety/space",
+        "https://www.weather.gov/safety/heat-uv",
+        "https://www.weather.gov/safety/thunderstorm",
+        "https://www.weather.gov/safety/tornado",
+        "https://www.weather.gov/safety/tsunami",
+        "https://www.weather.gov/safety/wildfire",
+        "https://www.weather.gov/safety/wind",
+        "https://www.weather.gov/safety/winter "
+      ],
+      "link_texts": [
+        "Weather Safety",
+        "Air Quality",
+        "Beach Hazards",
+        "Cold",
+        "Cold Water",
+        "Drought",
+        "Floods",
+        "Fog",
+        "Heat",
+        " Hurricanes",
+        " Lightning Safety",
+        "Rip Currents",
+        "Safe Boating",
+        "Space Weather",
+        "Sun (Ultraviolet Radiation)",
+        " Thunderstorms & Tornadoes",
+        "Tornado",
+        "Tsunami",
+        "Wildfire",
+        "Wind",
+        "Winter"
      ]
    },
    "text": "Weather Safety                                                                                                        Air Quality                                                                            Beach Hazards                                                                            Cold                                                                            Cold Water                                                                            Drought                                                                            Floods                                                                            Fog                                                                            Heat                                                                             Hurricanes                                                                             Lightning Safety                                                                            Rip Currents                                                                            Safe Boating                                                                            Space Weather                                                                            Sun (Ultraviolet Radiation)                                                                             Thunderstorms & Tornadoes                                                                            Tornado                                                                            Tsunami                                                                            Wildfire                                                                            Wind                                                                            Winter"
@ -164,35 +124,23 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Safety Campaigns",
-          "url": "https://www.weather.gov/safetycampaign"
-        },
-        {
-          "text": "Seasonal Safety Campaigns",
-          "url": "https://www.weather.gov/safetycampaign"
-        },
-        {
-          "text": "#SafePlaceSelfie",
-          "url": "https://www.weather.gov/wrn/safeplaceselfie"
-        },
-        {
-          "text": "Deaf & Hard of Hearing",
-          "url": "https://www.weather.gov/wrn/dhh-safety"
-        },
-        {
-          "text": "Intellectual Disabilities",
-          "url": "https://www.weather.gov/wrn/intellectualdisabilities"
-        },
-        {
-          "text": "Spanish-language Content",
-          "url": "https://www.weather.gov/wrn/fall2020-espanol-sm"
-        },
-        {
-          "text": "The Great Outdoors",
-          "url": "https://www.noaa.gov/explainers/great-outdoors-weather-safety"
-        }
+      "link_urls": [
+        "https://www.weather.gov/safetycampaign",
+        "https://www.weather.gov/safetycampaign",
+        "https://www.weather.gov/wrn/safeplaceselfie",
+        "https://www.weather.gov/wrn/dhh-safety",
+        "https://www.weather.gov/wrn/intellectualdisabilities",
+        "https://www.weather.gov/wrn/fall2020-espanol-sm",
+        "https://www.noaa.gov/explainers/great-outdoors-weather-safety"
+      ],
+      "link_texts": [
+        "Safety Campaigns",
+        "Seasonal Safety Campaigns",
+        "#SafePlaceSelfie",
+        "Deaf & Hard of Hearing",
+        "Intellectual Disabilities",
+        "Spanish-language Content",
+        "The Great Outdoors"
      ]
    },
    "text": "Safety Campaigns                                                                                                        Seasonal Safety Campaigns                                                                            #SafePlaceSelfie                                                                            Deaf & Hard of Hearing                                                                            Intellectual Disabilities                                                                            Spanish-language Content                                                                            The Great Outdoors"
@ -204,59 +152,35 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Ambassador",
-          "url": "https://www.weather.gov/wrn/ambassadors"
-        },
-        {
-          "text": "About WRN Ambassadors",
-          "url": "https://www.weather.gov/wrn/ambassadors"
-        },
-        {
-          "text": "Become an Ambassador",
-          "url": "https://www.weather.gov/wrn/amb-tou"
-        },
-        {
-          "text": "Ambassadors of Excellence",
-          "url": "https://www.weather.gov/wrn/ambassador_recognition"
-        },
-        {
-          "text": "People of WRN",
-          "url": "https://www.weather.gov/people/"
-        },
-        {
-          "text": " FAQS",
-          "url": "https://www.weather.gov/wrn/amb-faqs"
-        },
-        {
-          "text": "Tell Your Success Story",
-          "url": "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform"
-        },
-        {
-          "text": " Success Stories",
-          "url": " https://www.weather.gov/wrn/success-stories"
-        },
-        {
-          "text": "Tri-fold",
-          "url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf"
-        },
-        {
-          "text": "Aviation",
-          "url": "https://www.weather.gov/wrn/aviation"
-        },
-        {
-          "text": " Current Ambassadors",
-          "url": " http://www.weather.gov/wrn/current-ambassadors"
-        },
-        {
-          "text": "Brochure",
-          "url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
-        },
-        {
-          "text": "En Español",
-          "url": "https://www.weather.gov/wrn/en-espanol"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/ambassadors",
+        "https://www.weather.gov/wrn/ambassadors",
+        "https://www.weather.gov/wrn/amb-tou",
+        "https://www.weather.gov/wrn/ambassador_recognition",
+        "https://www.weather.gov/people/",
+        "https://www.weather.gov/wrn/amb-faqs",
+        "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform",
+        " https://www.weather.gov/wrn/success-stories",
+        "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf",
+        "https://www.weather.gov/wrn/aviation",
+        " http://www.weather.gov/wrn/current-ambassadors",
+        "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
+        "https://www.weather.gov/wrn/en-espanol"
+      ],
+      "link_texts": [
+        "Ambassador",
+        "About WRN Ambassadors",
+        "Become an Ambassador",
+        "Ambassadors of Excellence",
+        "People of WRN",
+        " FAQS",
+        "Tell Your Success Story",
+        " Success Stories",
+        "Tri-fold",
+        "Aviation",
+        " Current Ambassadors",
+        "Brochure",
+        "En Español"
      ]
    },
    "text": "Ambassador                                                                                                        About WRN Ambassadors                                                                            Become an Ambassador                                                                            Ambassadors of Excellence                                                                            People of WRN                                                                             FAQS                                                                            Tell Your Success Story                                                                             Success Stories                                                                            Tri-fold                                                                            Aviation                                                                             Current Ambassadors                                                                            Brochure                                                                            En Español"
@ -268,51 +192,31 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Education",
-          "url": "http://www.weather.gov/owlie/"
-        },
-        {
-          "text": "NWS Education Home",
-          "url": "http://www.weather.gov/owlie/"
-        },
-        {
-          "text": "Be A Force Of Nature",
-          "url": "https://www.weather.gov/wrn/force"
-        },
-        {
-          "text": "WRN Kids Flyer",
-          "url": " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf"
-        },
-        {
-          "text": "Wireless Emergency Alerts",
-          "url": "https://www.weather.gov/wrn/wea"
-        },
-        {
-          "text": "NOAA Weather Radio",
-          "url": "http://www.nws.noaa.gov/nwr/"
-        },
-        {
-          "text": "Mobile Weather",
-          "url": "https://www.weather.gov/wrn/mobile-phone"
-        },
-        {
-          "text": "Brochures",
-          "url": "http://www.weather.gov/owlie/publication_brochures"
-        },
-        {
-          "text": "Hourly Weather Forecast",
-          "url": "https://www.weather.gov/wrn/hourly-weather-graph"
-        },
-        {
-          "text": "Citizen Science",
-          "url": "http://www.weather.gov/media/wrn/citizen_science_page.pdf"
-        },
-        {
-          "text": "Intellectual Disabilities",
-          "url": "https://www.weather.gov/wrn/intellectualdisabilities"
-        }
+      "link_urls": [
+        "http://www.weather.gov/owlie/",
+        "http://www.weather.gov/owlie/",
+        "https://www.weather.gov/wrn/force",
+        " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf",
+        "https://www.weather.gov/wrn/wea",
+        "http://www.nws.noaa.gov/nwr/",
+        "https://www.weather.gov/wrn/mobile-phone",
+        "http://www.weather.gov/owlie/publication_brochures",
+        "https://www.weather.gov/wrn/hourly-weather-graph",
+        "http://www.weather.gov/media/wrn/citizen_science_page.pdf",
+        "https://www.weather.gov/wrn/intellectualdisabilities"
+      ],
+      "link_texts": [
+        "Education",
+        "NWS Education Home",
+        "Be A Force Of Nature",
+        "WRN Kids Flyer",
+        "Wireless Emergency Alerts",
+        "NOAA Weather Radio",
+        "Mobile Weather",
+        "Brochures",
+        "Hourly Weather Forecast",
+        "Citizen Science",
+        "Intellectual Disabilities"
      ]
    },
    "text": "Education                                                                                                        NWS Education Home                                                                            Be A Force Of Nature                                                                            WRN Kids Flyer                                                                            Wireless Emergency Alerts                                                                            NOAA Weather Radio                                                                            Mobile Weather                                                                            Brochures                                                                            Hourly Weather Forecast                                                                            Citizen Science                                                                            Intellectual Disabilities"
@ -324,47 +228,29 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Collaboration",
-          "url": "https://www.weather.gov/wrn/collaborate"
-        },
-        {
-          "text": "Get Involved ",
-          "url": "https://www.weather.gov/wrn/get-involved"
-        },
-        {
-          "text": "Social Media",
-          "url": "http://www.weather.gov/socialmedia"
-        },
-        {
-          "text": "WRN Ambassadors ",
-          "url": "https://www.weather.gov/wrn/ambassadors"
-        },
-        {
-          "text": "Enterprise Resources",
-          "url": "https://www.weather.gov/enterprise/"
-        },
-        {
-          "text": "StormReady",
-          "url": "http://www.weather.gov/stormready/"
-        },
-        {
-          "text": "TsunamiReady",
-          "url": "https://www.weather.gov/tsunamiready/"
-        },
-        {
-          "text": "NWSChat (core partners only)",
-          "url": "https://nwschat.weather.gov/"
-        },
-        {
-          "text": "InteractiveNWS (iNWS) (core partners only)",
-          "url": "https://inws.ncep.noaa.gov/"
-        },
-        {
-          "text": "SKYWARN",
-          "url": "https://www.weather.gov/SKYWARN"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/collaborate",
+        "https://www.weather.gov/wrn/get-involved",
+        "http://www.weather.gov/socialmedia",
+        "https://www.weather.gov/wrn/ambassadors",
+        "https://www.weather.gov/enterprise/",
+        "http://www.weather.gov/stormready/",
+        "https://www.weather.gov/tsunamiready/",
+        "https://nwschat.weather.gov/",
+        "https://inws.ncep.noaa.gov/",
+        "https://www.weather.gov/SKYWARN"
+      ],
+      "link_texts": [
+        "Collaboration",
+        "Get Involved ",
+        "Social Media",
+        "WRN Ambassadors ",
+        "Enterprise Resources",
+        "StormReady",
+        "TsunamiReady",
+        "NWSChat (core partners only)",
+        "InteractiveNWS (iNWS) (core partners only)",
+        "SKYWARN"
      ]
    },
    "text": "Collaboration                                                                                                        Get Involved                                                                             Social Media                                                                            WRN Ambassadors                                                                             Enterprise Resources                                                                            StormReady                                                                            TsunamiReady                                                                            NWSChat (core partners only)                                                                            InteractiveNWS (iNWS) (core partners only)                                                                            SKYWARN"
@ -376,27 +262,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": " News & Events",
-          "url": "http://www.weather.gov/news/"
-        },
-        {
-          "text": "Latest News",
-          "url": " http://www.weather.gov/news/"
-        },
-        {
-          "text": "Calendar",
-          "url": "https://www.weather.gov/wrn/calendar"
-        },
-        {
-          "text": "Meetings & Workshops",
-          "url": " https://www.weather.gov/wrn/workshops"
-        },
-        {
-          "text": "NWS Aware Newsletter",
-          "url": "https://www.weather.gov/publications/aware"
-        }
+      "link_urls": [
+        "http://www.weather.gov/news/",
+        " http://www.weather.gov/news/",
+        "https://www.weather.gov/wrn/calendar",
+        " https://www.weather.gov/wrn/workshops",
+        "https://www.weather.gov/publications/aware"
+      ],
+      "link_texts": [
+        " News & Events",
+        "Latest News",
+        "Calendar",
+        "Meetings & Workshops",
+        "NWS Aware Newsletter"
      ]
    },
    "text": "News & Events                                                                                                        Latest News                                                                            Calendar                                                                            Meetings & Workshops                                                                            NWS Aware Newsletter"
@ -408,11 +286,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "International",
-          "url": "https://www.weather.gov/wrn/wrns"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/wrns"
+      ],
+      "link_texts": [
+        "International"
      ]
    },
    "text": "International"
@ -424,51 +302,31 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "About",
-          "url": "https://www.weather.gov/wrn/about"
-        },
-        {
-          "text": "Contact Us",
-          "url": " https://www.weather.gov/wrn/contact"
-        },
-        {
-          "text": " What is WRN?",
-          "url": "https://www.weather.gov/wrn/about"
-        },
-        {
-          "text": " WRN FAQ",
-          "url": "https://www.weather.gov/wrn/faqs"
-        },
-        {
-          "text": "WRN Brochure",
-          "url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
-        },
-        {
-          "text": "Hazard Simplification",
-          "url": "https://www.weather.gov/hazardsimplification/"
-        },
-        {
-          "text": "IDSS Brochure",
-          "url": "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf"
-        },
-        {
-          "text": "Roadmap",
-          "url": "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf"
-        },
-        {
-          "text": "Strategic Plan",
-          "url": "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf"
-        },
-        {
-          "text": "WRN International",
-          "url": " https://www.weather.gov/wrn/international"
-        },
-        {
-          "text": "Social Science",
-          "url": "https://vlab.noaa.gov/web/nws-social-science"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/about",
+        " https://www.weather.gov/wrn/contact",
+        "https://www.weather.gov/wrn/about",
+        "https://www.weather.gov/wrn/faqs",
+        "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
+        "https://www.weather.gov/hazardsimplification/",
+        "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf",
+        "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf",
+        "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf",
+        " https://www.weather.gov/wrn/international",
+        "https://vlab.noaa.gov/web/nws-social-science"
+      ],
+      "link_texts": [
+        "About",
+        "Contact Us",
+        " What is WRN?",
+        " WRN FAQ",
+        "WRN Brochure",
+        "Hazard Simplification",
+        "IDSS Brochure",
+        "Roadmap",
+        "Strategic Plan",
+        "WRN International",
+        "Social Science"
      ]
    },
    "text": "About                                                                                                        Contact Us                                                                             What is WRN?                                                                             WRN FAQ                                                                            WRN Brochure                                                                            Hazard Simplification                                                                            IDSS Brochure                                                                            Roadmap                                                                            Strategic Plan                                                                            WRN International                                                                            Social Science"
@ -500,11 +358,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "emphasized_texts": [
-        {
-          "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.",
-          "tag": "strong"
-        }
+      "emphasized_text_contents": [
+        "First, take steps to better prepare for the seasonal hazards weather can throw at you."
+      ],
+      "emphasized_text_tags": [
+        "strong"
      ]
    },
    "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
@ -516,17 +374,17 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Spring Safety website",
-          "url": "https://www.weather.gov/wrn/spring-safety"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/spring-safety"
      ],
-      "emphasized_texts": [
-        {
-          "text": "Second, encourage others to become Weather-Ready as well.",
-          "tag": "strong"
-        }
+      "link_texts": [
+        "Spring Safety website"
+      ],
+      "emphasized_text_contents": [
+        "Second, encourage others to become Weather-Ready as well."
+      ],
+      "emphasized_text_tags": [
+        "strong"
      ]
    },
    "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic."
@ -538,11 +396,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "infographics",
-          "url": "https://www.weather.gov/wrn/spring-infographics"
-        }
+      "link_urls": [
+        "https://www.weather.gov/wrn/spring-infographics"
+      ],
+      "link_texts": [
+        "infographics"
      ]
    },
    "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available."
@ -564,23 +422,17 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "US Dept of Commerce",
-          "url": "http://www.commerce.gov"
-        },
-        {
-          "text": "National Oceanic and Atmospheric Administration",
-          "url": "http://www.noaa.gov"
-        },
-        {
-          "text": "National Weather Service",
-          "url": "https://www.weather.gov"
-        },
-        {
-          "text": "Comments? Questions? Please Contact Us.",
-          "url": "https://www.weather.gov/news/contact"
-        }
+      "link_urls": [
+        "http://www.commerce.gov",
+        "http://www.noaa.gov",
+        "https://www.weather.gov",
+        "https://www.weather.gov/news/contact"
+      ],
+      "link_texts": [
+        "US Dept of Commerce",
+        "National Oceanic and Atmospheric Administration",
+        "National Weather Service",
+        "Comments? Questions? Please Contact Us."
      ]
    },
    "text": "US Dept of Commerce\n                        National Oceanic and Atmospheric Administration\n                        National Weather Service\n                        News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us."
@ -592,11 +444,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Disclaimer",
-          "url": "https://www.weather.gov/disclaimer"
-        }
+      "link_urls": [
+        "https://www.weather.gov/disclaimer"
+      ],
+      "link_texts": [
+        "Disclaimer"
      ]
    },
    "text": "Disclaimer"
@ -608,11 +460,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Information Quality",
-          "url": "http://www.cio.noaa.gov/services_programs/info_quality.html"
-        }
+      "link_urls": [
+        "http://www.cio.noaa.gov/services_programs/info_quality.html"
+      ],
+      "link_texts": [
+        "Information Quality"
      ]
    },
    "text": "Information Quality"
@ -624,11 +476,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Help",
-          "url": "https://www.weather.gov/help"
-        }
+      "link_urls": [
+        "https://www.weather.gov/help"
+      ],
+      "link_texts": [
+        "Help"
      ]
    },
    "text": "Help"
@ -640,11 +492,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Glossary",
-          "url": "http://www.weather.gov/glossary"
-        }
+      "link_urls": [
+        "http://www.weather.gov/glossary"
+      ],
+      "link_texts": [
+        "Glossary"
      ]
    },
    "text": "Glossary"
@ -656,11 +508,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Privacy Policy",
-          "url": "https://www.weather.gov/privacy"
-        }
+      "link_urls": [
+        "https://www.weather.gov/privacy"
+      ],
+      "link_texts": [
+        "Privacy Policy"
      ]
    },
    "text": "Privacy Policy"
@ -672,11 +524,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Freedom of Information Act (FOIA)",
-          "url": "https://www.noaa.gov/foia-freedom-of-information-act"
-        }
+      "link_urls": [
+        "https://www.noaa.gov/foia-freedom-of-information-act"
+      ],
+      "link_texts": [
+        "Freedom of Information Act (FOIA)"
      ]
    },
    "text": "Freedom of Information Act (FOIA)"
@ -688,11 +540,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "About Us",
-          "url": "https://www.weather.gov/about"
-        }
+      "link_urls": [
+        "https://www.weather.gov/about"
+      ],
+      "link_texts": [
+        "About Us"
      ]
    },
    "text": "About Us"
@ -704,11 +556,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Career Opportunities",
-          "url": "https://www.weather.gov/careers"
-        }
+      "link_urls": [
+        "https://www.weather.gov/careers"
+      ],
+      "link_texts": [
+        "Career Opportunities"
      ]
    },
    "text": "Career Opportunities"
--- a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json
@ -16,11 +16,11 @@
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      "page_number": 1,
-      "emphasized_texts": [
-        {
-          "text": "CHAPTER 1",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "CHAPTER 1"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "CHAPTER 1"
@ -32,11 +32,11 @@
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      "page_number": 1,
-      "emphasized_texts": [
-        {
-          "text": "INTRODUCTION",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "INTRODUCTION"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "INTRODUCTION"
--- a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/box/nested-2/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json
@ -16,11 +16,11 @@
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      "page_number": 1,
-      "emphasized_texts": [
-        {
-          "text": "CHAPTER 1",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "CHAPTER 1"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "CHAPTER 1"
@ -32,11 +32,11 @@
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      "page_number": 1,
-      "emphasized_texts": [
-        {
-          "text": "INTRODUCTION",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "INTRODUCTION"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "INTRODUCTION"
--- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json
@ -6,21 +6,19 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": null,
-          "url": "index.html"
-        },
-        {
-          "text": null,
-          "url": "https://twitter.com/stef/status/1617222428727586816"
-        }
+      "link_urls": [
+        "index.html",
+        "https://twitter.com/stef/status/1617222428727586816"
      ],
-      "emphasized_texts": [
-        {
-          "text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)",
-          "tag": "i"
-        }
+      "link_texts": [
+        null,
+        null
+      ],
+      "emphasized_text_contents": [
+        "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)"
+      ],
+      "emphasized_text_tags": [
+        "i"
      ]
    },
    "text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from.  The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
--- a/test_unstructured_ingest/expected-structured-output/github/test.html.json
+++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json
@ -16,11 +16,11 @@
      "data_source": {},
      "filetype": "text/html",
      "page_number": 1,
-      "links": [
-        {
-          "text": "Github Project Page",
-          "url": "http://github.com/dcneiner/Downloadify"
-        }
+      "link_urls": [
+        "http://github.com/dcneiner/Downloadify"
+      ],
+      "link_texts": [
+        "Github Project Page"
      ]
    },
    "text": "More info available at the Github Project Page"
--- a/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json
+++ b/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json
@ -5,11 +5,11 @@
    "metadata": {
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-      "emphasized_texts": [
-        {
-          "text": "Title",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "Title"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "Title"
@ -20,11 +20,11 @@
    "metadata": {
      "data_source": {},
      "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-      "emphasized_texts": [
-        {
-          "text": "This is a good reason to continue",
-          "tag": "b"
-        }
+      "emphasized_text_contents": [
+        "This is a good reason to continue"
+      ],
+      "emphasized_text_tags": [
+        "b"
      ]
    },
    "text": "This is a good reason to continue"
--- a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json
@ -14,11 +14,11 @@
        "devops@unstructuredio.onmicrosoft.com"
      ],
      "subject": "subfolder1_1",
-      "emphasized_texts": [
-        {
-          "text": "this is a message for the subfolder1_1",
-          "tag": "span"
-        }
+      "emphasized_text_contents": [
+        "this is a message for the subfolder1_1"
+      ],
+      "emphasized_text_tags": [
+        "span"
      ]
    },
    "text": "this is a message for the subfolder1_1"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.9.4-dev0"  # pragma: no cover
+__version__ = "0.10.0"  # pragma: no cover
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -143,7 +143,8 @@ class ElementMetadata:

    # Webpage specific metadata fields
    url: Optional[str] = None
-    links: Optional[List[Link]] = None
+    link_urls: Optional[List[str]] = None
+    link_texts: Optional[List[str]] = None

    # E-mail specific metadata fields
    sent_from: Optional[List[str]] = None
@ -157,7 +158,8 @@ class ElementMetadata:
    header_footer_type: Optional[str] = None

    # Formatting metadata fields
-    emphasized_texts: Optional[List[dict]] = None
+    emphasized_text_contents: Optional[List[str]] = None
+    emphasized_text_tags: Optional[List[str]] = None

    # Text format metadata fields
    text_as_html: Optional[str] = None
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -165,11 +165,23 @@ def _add_element_metadata(
        else None
    )
    links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
+    link_urls = [link.get("url") for link in links] if links else None
+    link_texts = [link.get("text") for link in links] if links else None
    emphasized_texts = (
        element.emphasized_texts
        if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
        else None
    )
+    emphasized_text_contents = (
+        [emphasized_text.get("text") for emphasized_text in emphasized_texts]
+        if emphasized_texts
+        else None
+    )
+    emphasized_text_tags = (
+        [emphasized_text.get("tag") for emphasized_text in emphasized_texts]
+        if emphasized_texts
+        else None
+    )
    metadata = ElementMetadata(
        coordinates=coordinates_metadata,
        filename=filename,
@ -177,8 +189,10 @@ def _add_element_metadata(
        page_number=page_number,
        url=url,
        text_as_html=text_as_html,
-        links=links,
-        emphasized_texts=emphasized_texts,
+        link_urls=link_urls,
+        link_texts=link_texts,
+        emphasized_text_contents=emphasized_text_contents,
+        emphasized_text_tags=emphasized_text_tags,
        section=section,
    )
    element.metadata = metadata.merge(element.metadata)
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -171,6 +171,9 @@ def partition_docx(
        if element_item.tag.endswith("tbl"):
            table = document.tables[table_index]
            emphasized_texts = _get_emphasized_texts_from_table(table)
+            emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
+                emphasized_texts,
+            )
            html_table = convert_ms_office_table_to_text(table, as_html=True)
            text_table = convert_ms_office_table_to_text(table, as_html=False)
            element = Table(text_table)
@ -180,7 +183,8 @@ def partition_docx(
                    filename=metadata_filename,
                    page_number=page_number,
                    last_modified=metadata_last_modified or last_modification_date,
-                    emphasized_texts=emphasized_texts if emphasized_texts else None,
+                    emphasized_text_contents=emphasized_text_contents,
+                    emphasized_text_tags=emphasized_text_tags,
                )
                elements.append(element)
            table_index += 1
@ -189,13 +193,17 @@ def partition_docx(
                is_list = True
            paragraph = docx.text.paragraph.Paragraph(element_item, document)
            emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
+            emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
+                emphasized_texts,
+            )
            para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
            if para_element is not None:
                para_element.metadata = ElementMetadata(
                    filename=metadata_filename,
                    page_number=page_number,
                    last_modified=metadata_last_modified or last_modification_date,
-                    emphasized_texts=emphasized_texts if emphasized_texts else None,
+                    emphasized_text_contents=emphasized_text_contents,
+                    emphasized_text_tags=emphasized_text_tags,
                )
                elements.append(para_element)
            is_list = False
@ -398,3 +406,30 @@ def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]:
                _emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
                emphasized_texts += _emphasized_texts
    return emphasized_texts
+
+
+def _extract_contents_and_tags(
+    emphasized_texts: List[dict],
+) -> Tuple[Optional[List[str]], Optional[List[str]]]:
+    """
+    Extract the text contents and tags from a list of dictionaries containing emphasized texts.
+
+    Args:
+    - emphasized_texts (List[dict]): A list containing dictionaries with keys "text" and "tag".
+
+    Returns:
+    - Tuple[Optional[List[str]], Optional[List[str]]]: Two parallel lists, text contents and
+      tags, in input order; both are None when emphasized_texts is empty (falsy).
+    """
+    emphasized_text_contents = (
+        [emphasized_text["text"] for emphasized_text in emphasized_texts]
+        if emphasized_texts
+        else None
+    )
+    emphasized_text_tags = (
+        [emphasized_text["tag"] for emphasized_text in emphasized_texts]
+        if emphasized_texts
+        else None
+    )
+
+    return emphasized_text_contents, emphasized_text_tags