Feat/1060 update metadata fields (#1099)

Closes GitHub issue #1060.

* update the metadata field links
* update the metadata field emphasized_texts
This commit is contained in:
Christine Straub 2023-08-15 21:33:06 -07:00 committed by GitHub
parent fe5048a834
commit 0e887cc36b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 536 additions and 633 deletions

View File

@ -1,7 +1,9 @@
## 0.9.4-dev0
## 0.10.0
### Enhancements
* Update the `links` and `emphasized_texts` metadata fields
### Features
### Fixes

View File

@ -16,6 +16,7 @@ from unstructured.documents.elements import (
)
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import (
_extract_contents_and_tags,
_get_emphasized_texts_from_paragraph,
_get_emphasized_texts_from_table,
partition_docx,
@ -63,6 +64,26 @@ def expected_elements():
]
@pytest.fixture()
def expected_emphasized_texts():
    """Return the emphasized-text records expected from the sample document.

    Each record is a dict with a "text" and a "tag" key. The "bold-italic"
    text appears twice, once with tag "b" and once with tag "i".
    """
    text_tag_pairs = (
        ("bold", "b"),
        ("italic", "i"),
        ("bold-italic", "b"),
        ("bold-italic", "i"),
    )
    return [{"text": text, "tag": tag} for text, tag in text_tag_pairs]
@pytest.fixture()
def expected_emphasized_text_contents():
    """Return the expected emphasized text strings, in order.

    "bold-italic" is listed twice because it is paired with two tags.
    """
    contents = [
        "bold",
        "italic",
        "bold-italic",
        "bold-italic",
    ]
    return contents
@pytest.fixture()
def expected_emphasized_text_tags():
    """Return the expected emphasis tags, index-aligned with the contents list."""
    tags = [
        "b",
        "i",
        "b",
        "i",
    ]
    return tags
def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
@ -293,19 +314,14 @@ def test_partition_docx_from_file_without_metadata_date(
def test_get_emphasized_texts_from_paragraph(
expected_emphasized_texts,
filename="example-docs/fake-doc-emphasized-text.docx",
):
expected = [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
document = docx.Document(filename)
paragraph = document.paragraphs[1]
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
assert paragraph.text == "I am a bold italic bold-italic text."
assert emphasized_texts == expected
assert emphasized_texts == expected_emphasized_texts
paragraph = document.paragraphs[2]
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
@ -319,18 +335,29 @@ def test_get_emphasized_texts_from_paragraph(
def test_get_emphasized_texts_from_table(
expected_emphasized_texts,
filename="example-docs/fake-doc-emphasized-text.docx",
):
expected = [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
document = docx.Document(filename)
table = document.tables[0]
emphasized_texts = _get_emphasized_texts_from_table(table)
assert emphasized_texts == expected
assert emphasized_texts == expected_emphasized_texts
def test_extract_contents_and_tags(
    expected_emphasized_texts,
    expected_emphasized_text_contents,
    expected_emphasized_text_tags,
):
    """Check that `_extract_contents_and_tags` splits records into two lists.

    The helper is given the list of {"text", "tag"} records and must return
    the texts and the tags as two separate sequences. For an empty input it
    must return None for both.
    """
    contents, tags = _extract_contents_and_tags(expected_emphasized_texts)
    assert contents == expected_emphasized_text_contents
    assert tags == expected_emphasized_text_tags

    # Empty input: both results are expected to be None, not empty lists.
    empty_contents, empty_tags = _extract_contents_and_tags([])
    assert empty_contents is None
    assert empty_tags is None
@pytest.mark.parametrize(
@ -340,24 +367,22 @@ def test_get_emphasized_texts_from_table(
("fake-doc-emphasized-text.doc", partition_doc),
],
)
def test_partition_docx_grabs_emphasized_texts(filename, partition_func):
def test_partition_docx_grabs_emphasized_texts(
filename,
partition_func,
expected_emphasized_text_contents,
expected_emphasized_text_tags,
):
elements = partition_func(filename=f"example-docs/{filename}")
assert isinstance(elements[0], Table)
assert elements[0].metadata.emphasized_texts == [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
assert elements[0].metadata.emphasized_text_tags == expected_emphasized_text_tags
assert elements[1] == NarrativeText("I am a bold italic bold-italic text.")
assert elements[1].metadata.emphasized_texts == [
{"text": "bold", "tag": "b"},
{"text": "italic", "tag": "i"},
{"text": "bold-italic", "tag": "b"},
{"text": "bold-italic", "tag": "i"},
]
assert elements[1].metadata.emphasized_text_contents == expected_emphasized_text_contents
assert elements[1].metadata.emphasized_text_tags == expected_emphasized_text_tags
assert elements[2] == NarrativeText("I am a normal text.")
assert elements[2].metadata.emphasized_texts is None
assert elements[2].metadata.emphasized_text_contents is None
assert elements[2].metadata.emphasized_text_tags is None

View File

@ -455,34 +455,24 @@ def test_partition_html_grabs_links():
elements = partition_html(text=html_text)
assert elements[0] == NarrativeText("Hello there I am a very important link!")
assert elements[0].metadata.links == [
{
"text": "very important link!",
"url": "/link",
},
]
assert elements[0].metadata.link_urls == ["/link"]
assert elements[0].metadata.link_texts == ["very important link!"]
assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.links is None
assert elements[1].metadata.link_urls is None
assert elements[1].metadata.link_texts is None
assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.links == [
{
"text": "Parrots",
"url": "https://en.wikipedia.org/wiki/Parrot",
},
]
assert elements[2].metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
assert elements[2].metadata.link_texts == ["Parrots"]
assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.links is None
assert elements[3].metadata.link_urls is None
assert elements[3].metadata.link_texts is None
assert elements[4] == Title("A lone link!")
assert elements[4].metadata.links == [
{
"text": "A lone link!",
"url": "/loner",
},
]
assert elements[4].metadata.link_urls == ["/loner"]
assert elements[4].metadata.link_texts == ["A lone link!"]
def test_partition_html_from_filename_with_skip_headers_and_footers(
@ -570,26 +560,25 @@ def test_partition_html_grabs_emphasized_texts():
elements = partition_html(text=html_text)
assert elements[0] == NarrativeText("Hello there I am a very important text!")
assert elements[0].metadata.emphasized_texts == [
{"text": "important", "tag": "strong"},
]
assert elements[0].metadata.emphasized_text_contents == ["important"]
assert elements[0].metadata.emphasized_text_tags == ["strong"]
assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.emphasized_texts == [
{"text": "list", "tag": "span"},
{"text": "my favorite things", "tag": "b"},
{"text": "favorite", "tag": "i"},
assert elements[1].metadata.emphasized_text_contents == [
"list",
"my favorite things",
"favorite",
]
assert elements[1].metadata.emphasized_text_tags == ["span", "b", "i"]
assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.emphasized_texts == [
{"text": "Parrots", "tag": "em"},
]
assert elements[2].metadata.emphasized_text_contents == ["Parrots"]
assert elements[2].metadata.emphasized_text_tags == ["em"]
assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.emphasized_texts is None
assert elements[3].metadata.emphasized_text_contents is None
assert elements[3].metadata.emphasized_text_tags is None
assert elements[4] == Title("A lone span text!")
assert elements[4].metadata.emphasized_texts == [
{"text": "A lone span text!", "tag": "span"},
]
assert elements[4].metadata.emphasized_text_contents == ["A lone span text!"]
assert elements[4].metadata.emphasized_text_tags == ["span"]

View File

@ -15,21 +15,19 @@
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -36,11 +36,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Weather.gov",
"url": "https://www.weather.gov"
}
"link_urls": [
"https://www.weather.gov"
],
"link_texts": [
"Weather.gov"
]
},
"text": "Weather.gov >"
@ -52,11 +52,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "News Around NOAA",
"url": "https://www.weather.gov/news"
}
"link_urls": [
"https://www.weather.gov/news"
],
"link_texts": [
"News Around NOAA"
]
},
"text": "News Around NOAA > Are You Weather-Ready for the Spring?"
@ -68,91 +68,51 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Weather Safety",
"url": "http://www.weather.gov/safetycampaign"
},
{
"text": "Air Quality",
"url": "https://www.weather.gov/safety/airquality"
},
{
"text": "Beach Hazards",
"url": "https://www.weather.gov/safety/beachhazards"
},
{
"text": "Cold",
"url": "https://www.weather.gov/safety/cold"
},
{
"text": "Cold Water",
"url": "https://www.weather.gov/safety/coldwater"
},
{
"text": "Drought",
"url": "https://www.weather.gov/safety/drought"
},
{
"text": "Floods",
"url": "https://www.weather.gov/safety/flood"
},
{
"text": "Fog",
"url": "https://www.weather.gov/safety/fog"
},
{
"text": "Heat",
"url": "https://www.weather.gov/safety/heat"
},
{
"text": " Hurricanes",
"url": "https://www.weather.gov/safety/hurricane"
},
{
"text": " Lightning Safety",
"url": "https://www.weather.gov/safety/lightning"
},
{
"text": "Rip Currents",
"url": "https://www.weather.gov/safety/ripcurrent"
},
{
"text": "Safe Boating",
"url": "https://www.weather.gov/safety/safeboating"
},
{
"text": "Space Weather",
"url": "https://www.weather.gov/safety/space"
},
{
"text": "Sun (Ultraviolet Radiation)",
"url": "https://www.weather.gov/safety/heat-uv"
},
{
"text": " Thunderstorms & Tornadoes",
"url": "https://www.weather.gov/safety/thunderstorm"
},
{
"text": "Tornado",
"url": "https://www.weather.gov/safety/tornado"
},
{
"text": "Tsunami",
"url": "https://www.weather.gov/safety/tsunami"
},
{
"text": "Wildfire",
"url": "https://www.weather.gov/safety/wildfire"
},
{
"text": "Wind",
"url": "https://www.weather.gov/safety/wind"
},
{
"text": "Winter",
"url": "https://www.weather.gov/safety/winter "
}
"link_urls": [
"http://www.weather.gov/safetycampaign",
"https://www.weather.gov/safety/airquality",
"https://www.weather.gov/safety/beachhazards",
"https://www.weather.gov/safety/cold",
"https://www.weather.gov/safety/coldwater",
"https://www.weather.gov/safety/drought",
"https://www.weather.gov/safety/flood",
"https://www.weather.gov/safety/fog",
"https://www.weather.gov/safety/heat",
"https://www.weather.gov/safety/hurricane",
"https://www.weather.gov/safety/lightning",
"https://www.weather.gov/safety/ripcurrent",
"https://www.weather.gov/safety/safeboating",
"https://www.weather.gov/safety/space",
"https://www.weather.gov/safety/heat-uv",
"https://www.weather.gov/safety/thunderstorm",
"https://www.weather.gov/safety/tornado",
"https://www.weather.gov/safety/tsunami",
"https://www.weather.gov/safety/wildfire",
"https://www.weather.gov/safety/wind",
"https://www.weather.gov/safety/winter "
],
"link_texts": [
"Weather Safety",
"Air Quality",
"Beach Hazards",
"Cold",
"Cold Water",
"Drought",
"Floods",
"Fog",
"Heat",
" Hurricanes",
" Lightning Safety",
"Rip Currents",
"Safe Boating",
"Space Weather",
"Sun (Ultraviolet Radiation)",
" Thunderstorms & Tornadoes",
"Tornado",
"Tsunami",
"Wildfire",
"Wind",
"Winter"
]
},
"text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter"
@ -164,35 +124,23 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Safety Campaigns",
"url": "https://www.weather.gov/safetycampaign"
},
{
"text": "Seasonal Safety Campaigns",
"url": "https://www.weather.gov/safetycampaign"
},
{
"text": "#SafePlaceSelfie",
"url": "https://www.weather.gov/wrn/safeplaceselfie"
},
{
"text": "Deaf & Hard of Hearing",
"url": "https://www.weather.gov/wrn/dhh-safety"
},
{
"text": "Intellectual Disabilities",
"url": "https://www.weather.gov/wrn/intellectualdisabilities"
},
{
"text": "Spanish-language Content",
"url": "https://www.weather.gov/wrn/fall2020-espanol-sm"
},
{
"text": "The Great Outdoors",
"url": "https://www.noaa.gov/explainers/great-outdoors-weather-safety"
}
"link_urls": [
"https://www.weather.gov/safetycampaign",
"https://www.weather.gov/safetycampaign",
"https://www.weather.gov/wrn/safeplaceselfie",
"https://www.weather.gov/wrn/dhh-safety",
"https://www.weather.gov/wrn/intellectualdisabilities",
"https://www.weather.gov/wrn/fall2020-espanol-sm",
"https://www.noaa.gov/explainers/great-outdoors-weather-safety"
],
"link_texts": [
"Safety Campaigns",
"Seasonal Safety Campaigns",
"#SafePlaceSelfie",
"Deaf & Hard of Hearing",
"Intellectual Disabilities",
"Spanish-language Content",
"The Great Outdoors"
]
},
"text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors"
@ -204,59 +152,35 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Ambassador",
"url": "https://www.weather.gov/wrn/ambassadors"
},
{
"text": "About WRN Ambassadors",
"url": "https://www.weather.gov/wrn/ambassadors"
},
{
"text": "Become an Ambassador",
"url": "https://www.weather.gov/wrn/amb-tou"
},
{
"text": "Ambassadors of Excellence",
"url": "https://www.weather.gov/wrn/ambassador_recognition"
},
{
"text": "People of WRN",
"url": "https://www.weather.gov/people/"
},
{
"text": " FAQS",
"url": "https://www.weather.gov/wrn/amb-faqs"
},
{
"text": "Tell Your Success Story",
"url": "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform"
},
{
"text": " Success Stories",
"url": " https://www.weather.gov/wrn/success-stories"
},
{
"text": "Tri-fold",
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf"
},
{
"text": "Aviation",
"url": "https://www.weather.gov/wrn/aviation"
},
{
"text": " Current Ambassadors",
"url": " http://www.weather.gov/wrn/current-ambassadors"
},
{
"text": "Brochure",
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
},
{
"text": "En Español",
"url": "https://www.weather.gov/wrn/en-espanol"
}
"link_urls": [
"https://www.weather.gov/wrn/ambassadors",
"https://www.weather.gov/wrn/ambassadors",
"https://www.weather.gov/wrn/amb-tou",
"https://www.weather.gov/wrn/ambassador_recognition",
"https://www.weather.gov/people/",
"https://www.weather.gov/wrn/amb-faqs",
"https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform",
" https://www.weather.gov/wrn/success-stories",
"http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf",
"https://www.weather.gov/wrn/aviation",
" http://www.weather.gov/wrn/current-ambassadors",
"http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
"https://www.weather.gov/wrn/en-espanol"
],
"link_texts": [
"Ambassador",
"About WRN Ambassadors",
"Become an Ambassador",
"Ambassadors of Excellence",
"People of WRN",
" FAQS",
"Tell Your Success Story",
" Success Stories",
"Tri-fold",
"Aviation",
" Current Ambassadors",
"Brochure",
"En Español"
]
},
"text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español"
@ -268,51 +192,31 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Education",
"url": "http://www.weather.gov/owlie/"
},
{
"text": "NWS Education Home",
"url": "http://www.weather.gov/owlie/"
},
{
"text": "Be A Force Of Nature",
"url": "https://www.weather.gov/wrn/force"
},
{
"text": "WRN Kids Flyer",
"url": " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf"
},
{
"text": "Wireless Emergency Alerts",
"url": "https://www.weather.gov/wrn/wea"
},
{
"text": "NOAA Weather Radio",
"url": "http://www.nws.noaa.gov/nwr/"
},
{
"text": "Mobile Weather",
"url": "https://www.weather.gov/wrn/mobile-phone"
},
{
"text": "Brochures",
"url": "http://www.weather.gov/owlie/publication_brochures"
},
{
"text": "Hourly Weather Forecast",
"url": "https://www.weather.gov/wrn/hourly-weather-graph"
},
{
"text": "Citizen Science",
"url": "http://www.weather.gov/media/wrn/citizen_science_page.pdf"
},
{
"text": "Intellectual Disabilities",
"url": "https://www.weather.gov/wrn/intellectualdisabilities"
}
"link_urls": [
"http://www.weather.gov/owlie/",
"http://www.weather.gov/owlie/",
"https://www.weather.gov/wrn/force",
" http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf",
"https://www.weather.gov/wrn/wea",
"http://www.nws.noaa.gov/nwr/",
"https://www.weather.gov/wrn/mobile-phone",
"http://www.weather.gov/owlie/publication_brochures",
"https://www.weather.gov/wrn/hourly-weather-graph",
"http://www.weather.gov/media/wrn/citizen_science_page.pdf",
"https://www.weather.gov/wrn/intellectualdisabilities"
],
"link_texts": [
"Education",
"NWS Education Home",
"Be A Force Of Nature",
"WRN Kids Flyer",
"Wireless Emergency Alerts",
"NOAA Weather Radio",
"Mobile Weather",
"Brochures",
"Hourly Weather Forecast",
"Citizen Science",
"Intellectual Disabilities"
]
},
"text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities"
@ -324,47 +228,29 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Collaboration",
"url": "https://www.weather.gov/wrn/collaborate"
},
{
"text": "Get Involved ",
"url": "https://www.weather.gov/wrn/get-involved"
},
{
"text": "Social Media",
"url": "http://www.weather.gov/socialmedia"
},
{
"text": "WRN Ambassadors ",
"url": "https://www.weather.gov/wrn/ambassadors"
},
{
"text": "Enterprise Resources",
"url": "https://www.weather.gov/enterprise/"
},
{
"text": "StormReady",
"url": "http://www.weather.gov/stormready/"
},
{
"text": "TsunamiReady",
"url": "https://www.weather.gov/tsunamiready/"
},
{
"text": "NWSChat (core partners only)",
"url": "https://nwschat.weather.gov/"
},
{
"text": "InteractiveNWS (iNWS) (core partners only)",
"url": "https://inws.ncep.noaa.gov/"
},
{
"text": "SKYWARN",
"url": "https://www.weather.gov/SKYWARN"
}
"link_urls": [
"https://www.weather.gov/wrn/collaborate",
"https://www.weather.gov/wrn/get-involved",
"http://www.weather.gov/socialmedia",
"https://www.weather.gov/wrn/ambassadors",
"https://www.weather.gov/enterprise/",
"http://www.weather.gov/stormready/",
"https://www.weather.gov/tsunamiready/",
"https://nwschat.weather.gov/",
"https://inws.ncep.noaa.gov/",
"https://www.weather.gov/SKYWARN"
],
"link_texts": [
"Collaboration",
"Get Involved ",
"Social Media",
"WRN Ambassadors ",
"Enterprise Resources",
"StormReady",
"TsunamiReady",
"NWSChat (core partners only)",
"InteractiveNWS (iNWS) (core partners only)",
"SKYWARN"
]
},
"text": "Collaboration Get Involved Social Media WRN Ambassadors Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only) SKYWARN"
@ -376,27 +262,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": " News & Events",
"url": "http://www.weather.gov/news/"
},
{
"text": "Latest News",
"url": " http://www.weather.gov/news/"
},
{
"text": "Calendar",
"url": "https://www.weather.gov/wrn/calendar"
},
{
"text": "Meetings & Workshops",
"url": " https://www.weather.gov/wrn/workshops"
},
{
"text": "NWS Aware Newsletter",
"url": "https://www.weather.gov/publications/aware"
}
"link_urls": [
"http://www.weather.gov/news/",
" http://www.weather.gov/news/",
"https://www.weather.gov/wrn/calendar",
" https://www.weather.gov/wrn/workshops",
"https://www.weather.gov/publications/aware"
],
"link_texts": [
" News & Events",
"Latest News",
"Calendar",
"Meetings & Workshops",
"NWS Aware Newsletter"
]
},
"text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter"
@ -408,11 +286,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "International",
"url": "https://www.weather.gov/wrn/wrns"
}
"link_urls": [
"https://www.weather.gov/wrn/wrns"
],
"link_texts": [
"International"
]
},
"text": "International"
@ -424,51 +302,31 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "About",
"url": "https://www.weather.gov/wrn/about"
},
{
"text": "Contact Us",
"url": " https://www.weather.gov/wrn/contact"
},
{
"text": " What is WRN?",
"url": "https://www.weather.gov/wrn/about"
},
{
"text": " WRN FAQ",
"url": "https://www.weather.gov/wrn/faqs"
},
{
"text": "WRN Brochure",
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
},
{
"text": "Hazard Simplification",
"url": "https://www.weather.gov/hazardsimplification/"
},
{
"text": "IDSS Brochure",
"url": "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf"
},
{
"text": "Roadmap",
"url": "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf"
},
{
"text": "Strategic Plan",
"url": "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf"
},
{
"text": "WRN International",
"url": " https://www.weather.gov/wrn/international"
},
{
"text": "Social Science",
"url": "https://vlab.noaa.gov/web/nws-social-science"
}
"link_urls": [
"https://www.weather.gov/wrn/about",
" https://www.weather.gov/wrn/contact",
"https://www.weather.gov/wrn/about",
"https://www.weather.gov/wrn/faqs",
"http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
"https://www.weather.gov/hazardsimplification/",
"https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf",
"http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf",
"https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf",
" https://www.weather.gov/wrn/international",
"https://vlab.noaa.gov/web/nws-social-science"
],
"link_texts": [
"About",
"Contact Us",
" What is WRN?",
" WRN FAQ",
"WRN Brochure",
"Hazard Simplification",
"IDSS Brochure",
"Roadmap",
"Strategic Plan",
"WRN International",
"Social Science"
]
},
"text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science"
@ -500,11 +358,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"emphasized_texts": [
{
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.",
"tag": "strong"
}
"emphasized_text_contents": [
"First, take steps to better prepare for the seasonal hazards weather can throw at you."
],
"emphasized_text_tags": [
"strong"
]
},
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
@ -516,17 +374,17 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Spring Safety website",
"url": "https://www.weather.gov/wrn/spring-safety"
}
"link_urls": [
"https://www.weather.gov/wrn/spring-safety"
],
"emphasized_texts": [
{
"text": "Second, encourage others to become Weather-Ready as well.",
"tag": "strong"
}
"link_texts": [
"Spring Safety website"
],
"emphasized_text_contents": [
"Second, encourage others to become Weather-Ready as well."
],
"emphasized_text_tags": [
"strong"
]
},
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic."
@ -538,11 +396,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "infographics",
"url": "https://www.weather.gov/wrn/spring-infographics"
}
"link_urls": [
"https://www.weather.gov/wrn/spring-infographics"
],
"link_texts": [
"infographics"
]
},
"text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in springs moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available."
@ -564,23 +422,17 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "US Dept of Commerce",
"url": "http://www.commerce.gov"
},
{
"text": "National Oceanic and Atmospheric Administration",
"url": "http://www.noaa.gov"
},
{
"text": "National Weather Service",
"url": "https://www.weather.gov"
},
{
"text": "Comments? Questions? Please Contact Us.",
"url": "https://www.weather.gov/news/contact"
}
"link_urls": [
"http://www.commerce.gov",
"http://www.noaa.gov",
"https://www.weather.gov",
"https://www.weather.gov/news/contact"
],
"link_texts": [
"US Dept of Commerce",
"National Oceanic and Atmospheric Administration",
"National Weather Service",
"Comments? Questions? Please Contact Us."
]
},
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us."
@ -592,11 +444,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Disclaimer",
"url": "https://www.weather.gov/disclaimer"
}
"link_urls": [
"https://www.weather.gov/disclaimer"
],
"link_texts": [
"Disclaimer"
]
},
"text": "Disclaimer"
@ -608,11 +460,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Information Quality",
"url": "http://www.cio.noaa.gov/services_programs/info_quality.html"
}
"link_urls": [
"http://www.cio.noaa.gov/services_programs/info_quality.html"
],
"link_texts": [
"Information Quality"
]
},
"text": "Information Quality"
@ -624,11 +476,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Help",
"url": "https://www.weather.gov/help"
}
"link_urls": [
"https://www.weather.gov/help"
],
"link_texts": [
"Help"
]
},
"text": "Help"
@ -640,11 +492,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Glossary",
"url": "http://www.weather.gov/glossary"
}
"link_urls": [
"http://www.weather.gov/glossary"
],
"link_texts": [
"Glossary"
]
},
"text": "Glossary"
@ -656,11 +508,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Privacy Policy",
"url": "https://www.weather.gov/privacy"
}
"link_urls": [
"https://www.weather.gov/privacy"
],
"link_texts": [
"Privacy Policy"
]
},
"text": "Privacy Policy"
@ -672,11 +524,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Freedom of Information Act (FOIA)",
"url": "https://www.noaa.gov/foia-freedom-of-information-act"
}
"link_urls": [
"https://www.noaa.gov/foia-freedom-of-information-act"
],
"link_texts": [
"Freedom of Information Act (FOIA)"
]
},
"text": "Freedom of Information Act (FOIA)"
@ -688,11 +540,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "About Us",
"url": "https://www.weather.gov/about"
}
"link_urls": [
"https://www.weather.gov/about"
],
"link_texts": [
"About Us"
]
},
"text": "About Us"
@ -704,11 +556,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Career Opportunities",
"url": "https://www.weather.gov/careers"
}
"link_urls": [
"https://www.weather.gov/careers"
],
"link_texts": [
"Career Opportunities"
]
},
"text": "Career Opportunities"

View File

@ -16,11 +16,11 @@
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"page_number": 1,
"emphasized_texts": [
{
"text": "CHAPTER 1",
"tag": "b"
}
"emphasized_text_contents": [
"CHAPTER 1"
],
"emphasized_text_tags": [
"b"
]
},
"text": "CHAPTER 1"
@ -32,11 +32,11 @@
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"page_number": 1,
"emphasized_texts": [
{
"text": "INTRODUCTION",
"tag": "b"
}
"emphasized_text_contents": [
"INTRODUCTION"
],
"emphasized_text_tags": [
"b"
]
},
"text": "INTRODUCTION"

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -16,11 +16,11 @@
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"page_number": 1,
"emphasized_texts": [
{
"text": "CHAPTER 1",
"tag": "b"
}
"emphasized_text_contents": [
"CHAPTER 1"
],
"emphasized_text_tags": [
"b"
]
},
"text": "CHAPTER 1"
@ -32,11 +32,11 @@
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"page_number": 1,
"emphasized_texts": [
{
"text": "INTRODUCTION",
"tag": "b"
}
"emphasized_text_contents": [
"INTRODUCTION"
],
"emphasized_text_tags": [
"b"
]
},
"text": "INTRODUCTION"

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -6,21 +6,19 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": null,
"url": "index.html"
},
{
"text": null,
"url": "https://twitter.com/stef/status/1617222428727586816"
}
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"emphasized_texts": [
{
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
"tag": "i"
}
"link_texts": [
null,
null
],
"emphasized_text_contents": [
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
],
"emphasized_text_tags": [
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."

View File

@ -16,11 +16,11 @@
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"links": [
{
"text": "Github Project Page",
"url": "http://github.com/dcneiner/Downloadify"
}
"link_urls": [
"http://github.com/dcneiner/Downloadify"
],
"link_texts": [
"Github Project Page"
]
},
"text": "More info available at the Github Project Page"

View File

@ -5,11 +5,11 @@
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"emphasized_texts": [
{
"text": "Title",
"tag": "b"
}
"emphasized_text_contents": [
"Title"
],
"emphasized_text_tags": [
"b"
]
},
"text": "Title"
@ -20,11 +20,11 @@
"metadata": {
"data_source": {},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"emphasized_texts": [
{
"text": "This is a good reason to continue",
"tag": "b"
}
"emphasized_text_contents": [
"This is a good reason to continue"
],
"emphasized_text_tags": [
"b"
]
},
"text": "This is a good reason to continue"

View File

@ -14,11 +14,11 @@
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "subfolder1_1",
"emphasized_texts": [
{
"text": "this is a message for the subfolder1_1",
"tag": "span"
}
"emphasized_text_contents": [
"this is a message for the subfolder1_1"
],
"emphasized_text_tags": [
"span"
]
},
"text": "this is a message for the subfolder1_1"

View File

@ -1 +1 @@
__version__ = "0.9.4-dev0" # pragma: no cover
__version__ = "0.10.0" # pragma: no cover

View File

@ -143,7 +143,8 @@ class ElementMetadata:
# Webpage specific metadata fields
url: Optional[str] = None
links: Optional[List[Link]] = None
link_urls: Optional[List[str]] = None
link_texts: Optional[List[str]] = None
# E-mail specific metadata fields
sent_from: Optional[List[str]] = None
@ -157,7 +158,8 @@ class ElementMetadata:
header_footer_type: Optional[str] = None
# Formatting metadata fields
emphasized_texts: Optional[List[dict]] = None
emphasized_text_contents: Optional[List[str]] = None
emphasized_text_tags: Optional[List[str]] = None
# Text format metadata fields
text_as_html: Optional[str] = None

View File

@ -165,11 +165,23 @@ def _add_element_metadata(
else None
)
links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
link_urls = [link.get("url") for link in links] if links else None
link_texts = [link.get("text") for link in links] if links else None
emphasized_texts = (
element.emphasized_texts
if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
else None
)
emphasized_text_contents = (
[emphasized_text.get("text") for emphasized_text in emphasized_texts]
if emphasized_texts
else None
)
emphasized_text_tags = (
[emphasized_text.get("tag") for emphasized_text in emphasized_texts]
if emphasized_texts
else None
)
metadata = ElementMetadata(
coordinates=coordinates_metadata,
filename=filename,
@ -177,8 +189,10 @@ def _add_element_metadata(
page_number=page_number,
url=url,
text_as_html=text_as_html,
links=links,
emphasized_texts=emphasized_texts,
link_urls=link_urls,
link_texts=link_texts,
emphasized_text_contents=emphasized_text_contents,
emphasized_text_tags=emphasized_text_tags,
section=section,
)
element.metadata = metadata.merge(element.metadata)

View File

@ -171,6 +171,9 @@ def partition_docx(
if element_item.tag.endswith("tbl"):
table = document.tables[table_index]
emphasized_texts = _get_emphasized_texts_from_table(table)
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
emphasized_texts,
)
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
element = Table(text_table)
@ -180,7 +183,8 @@ def partition_docx(
filename=metadata_filename,
page_number=page_number,
last_modified=metadata_last_modified or last_modification_date,
emphasized_texts=emphasized_texts if emphasized_texts else None,
emphasized_text_contents=emphasized_text_contents,
emphasized_text_tags=emphasized_text_tags,
)
elements.append(element)
table_index += 1
@ -189,13 +193,17 @@ def partition_docx(
is_list = True
paragraph = docx.text.paragraph.Paragraph(element_item, document)
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
emphasized_texts,
)
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
if para_element is not None:
para_element.metadata = ElementMetadata(
filename=metadata_filename,
page_number=page_number,
last_modified=metadata_last_modified or last_modification_date,
emphasized_texts=emphasized_texts if emphasized_texts else None,
emphasized_text_contents=emphasized_text_contents,
emphasized_text_tags=emphasized_text_tags,
)
elements.append(para_element)
is_list = False
@ -398,3 +406,30 @@ def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]:
_emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
emphasized_texts += _emphasized_texts
return emphasized_texts
def _extract_contents_and_tags(
emphasized_texts: List[dict],
) -> Tuple[Optional[List[str]], Optional[List[str]]]:
"""
Extract the text contents and tags from a list of dictionaries containing emphasized texts.
Args:
- emphasized_texts (List[dict]): A list containing dictionaries with keys "text" and "tag".
Returns:
- Tuple[List[str], List[str]]: A tuple containing two lists -
one for text contents and one for tags extracted from the input.
"""
emphasized_text_contents = (
[emphasized_text["text"] for emphasized_text in emphasized_texts]
if emphasized_texts
else None
)
emphasized_text_tags = (
[emphasized_text["tag"] for emphasized_text in emphasized_texts]
if emphasized_texts
else None
)
return emphasized_text_contents, emphasized_text_tags