mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 14:45:31 +00:00
Feat/1060 update metadata fields (#1099)
Closes Github Issue #1060. * update the metadata field links * update the metadata field emphasized_texts
This commit is contained in:
parent
fe5048a834
commit
0e887cc36b
@ -1,7 +1,9 @@
|
||||
## 0.9.4-dev0
|
||||
## 0.10.0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Update the `links` and `emphasized_texts` metadata fields
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -16,6 +16,7 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.partition.doc import partition_doc
|
||||
from unstructured.partition.docx import (
|
||||
_extract_contents_and_tags,
|
||||
_get_emphasized_texts_from_paragraph,
|
||||
_get_emphasized_texts_from_table,
|
||||
partition_docx,
|
||||
@ -63,6 +64,26 @@ def expected_elements():
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_emphasized_texts():
|
||||
return [
|
||||
{"text": "bold", "tag": "b"},
|
||||
{"text": "italic", "tag": "i"},
|
||||
{"text": "bold-italic", "tag": "b"},
|
||||
{"text": "bold-italic", "tag": "i"},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_emphasized_text_contents():
|
||||
return ["bold", "italic", "bold-italic", "bold-italic"]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def expected_emphasized_text_tags():
|
||||
return ["b", "i", "b", "i"]
|
||||
|
||||
|
||||
def test_partition_docx_from_filename(mock_document, expected_elements, tmpdir):
|
||||
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
|
||||
mock_document.save(filename)
|
||||
@ -293,19 +314,14 @@ def test_partition_docx_from_file_without_metadata_date(
|
||||
|
||||
|
||||
def test_get_emphasized_texts_from_paragraph(
|
||||
expected_emphasized_texts,
|
||||
filename="example-docs/fake-doc-emphasized-text.docx",
|
||||
):
|
||||
expected = [
|
||||
{"text": "bold", "tag": "b"},
|
||||
{"text": "italic", "tag": "i"},
|
||||
{"text": "bold-italic", "tag": "b"},
|
||||
{"text": "bold-italic", "tag": "i"},
|
||||
]
|
||||
document = docx.Document(filename)
|
||||
paragraph = document.paragraphs[1]
|
||||
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
|
||||
assert paragraph.text == "I am a bold italic bold-italic text."
|
||||
assert emphasized_texts == expected
|
||||
assert emphasized_texts == expected_emphasized_texts
|
||||
|
||||
paragraph = document.paragraphs[2]
|
||||
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
|
||||
@ -319,18 +335,29 @@ def test_get_emphasized_texts_from_paragraph(
|
||||
|
||||
|
||||
def test_get_emphasized_texts_from_table(
|
||||
expected_emphasized_texts,
|
||||
filename="example-docs/fake-doc-emphasized-text.docx",
|
||||
):
|
||||
expected = [
|
||||
{"text": "bold", "tag": "b"},
|
||||
{"text": "italic", "tag": "i"},
|
||||
{"text": "bold-italic", "tag": "b"},
|
||||
{"text": "bold-italic", "tag": "i"},
|
||||
]
|
||||
document = docx.Document(filename)
|
||||
table = document.tables[0]
|
||||
emphasized_texts = _get_emphasized_texts_from_table(table)
|
||||
assert emphasized_texts == expected
|
||||
assert emphasized_texts == expected_emphasized_texts
|
||||
|
||||
|
||||
def test_extract_contents_and_tags(
|
||||
expected_emphasized_texts,
|
||||
expected_emphasized_text_contents,
|
||||
expected_emphasized_text_tags,
|
||||
):
|
||||
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
|
||||
expected_emphasized_texts,
|
||||
)
|
||||
assert emphasized_text_contents == expected_emphasized_text_contents
|
||||
assert emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags([])
|
||||
assert emphasized_text_contents is None
|
||||
assert emphasized_text_tags is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -340,24 +367,22 @@ def test_get_emphasized_texts_from_table(
|
||||
("fake-doc-emphasized-text.doc", partition_doc),
|
||||
],
|
||||
)
|
||||
def test_partition_docx_grabs_emphasized_texts(filename, partition_func):
|
||||
def test_partition_docx_grabs_emphasized_texts(
|
||||
filename,
|
||||
partition_func,
|
||||
expected_emphasized_text_contents,
|
||||
expected_emphasized_text_tags,
|
||||
):
|
||||
elements = partition_func(filename=f"example-docs/{filename}")
|
||||
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].metadata.emphasized_texts == [
|
||||
{"text": "bold", "tag": "b"},
|
||||
{"text": "italic", "tag": "i"},
|
||||
{"text": "bold-italic", "tag": "b"},
|
||||
{"text": "bold-italic", "tag": "i"},
|
||||
]
|
||||
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
|
||||
assert elements[0].metadata.emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
assert elements[1] == NarrativeText("I am a bold italic bold-italic text.")
|
||||
assert elements[1].metadata.emphasized_texts == [
|
||||
{"text": "bold", "tag": "b"},
|
||||
{"text": "italic", "tag": "i"},
|
||||
{"text": "bold-italic", "tag": "b"},
|
||||
{"text": "bold-italic", "tag": "i"},
|
||||
]
|
||||
assert elements[1].metadata.emphasized_text_contents == expected_emphasized_text_contents
|
||||
assert elements[1].metadata.emphasized_text_tags == expected_emphasized_text_tags
|
||||
|
||||
assert elements[2] == NarrativeText("I am a normal text.")
|
||||
assert elements[2].metadata.emphasized_texts is None
|
||||
assert elements[2].metadata.emphasized_text_contents is None
|
||||
assert elements[2].metadata.emphasized_text_tags is None
|
||||
|
||||
@ -455,34 +455,24 @@ def test_partition_html_grabs_links():
|
||||
elements = partition_html(text=html_text)
|
||||
|
||||
assert elements[0] == NarrativeText("Hello there I am a very important link!")
|
||||
assert elements[0].metadata.links == [
|
||||
{
|
||||
"text": "very important link!",
|
||||
"url": "/link",
|
||||
},
|
||||
]
|
||||
assert elements[0].metadata.link_urls == ["/link"]
|
||||
assert elements[0].metadata.link_texts == ["very important link!"]
|
||||
|
||||
assert elements[1] == NarrativeText("Here is a list of my favorite things")
|
||||
assert elements[1].metadata.links is None
|
||||
assert elements[1].metadata.link_urls is None
|
||||
assert elements[1].metadata.link_texts is None
|
||||
|
||||
assert elements[2] == ListItem("Parrots")
|
||||
assert elements[2].metadata.links == [
|
||||
{
|
||||
"text": "Parrots",
|
||||
"url": "https://en.wikipedia.org/wiki/Parrot",
|
||||
},
|
||||
]
|
||||
assert elements[2].metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
|
||||
assert elements[2].metadata.link_texts == ["Parrots"]
|
||||
|
||||
assert elements[3] == ListItem("Dogs")
|
||||
assert elements[3].metadata.links is None
|
||||
assert elements[3].metadata.link_urls is None
|
||||
assert elements[3].metadata.link_texts is None
|
||||
|
||||
assert elements[4] == Title("A lone link!")
|
||||
assert elements[4].metadata.links == [
|
||||
{
|
||||
"text": "A lone link!",
|
||||
"url": "/loner",
|
||||
},
|
||||
]
|
||||
assert elements[4].metadata.link_urls == ["/loner"]
|
||||
assert elements[4].metadata.link_texts == ["A lone link!"]
|
||||
|
||||
|
||||
def test_partition_html_from_filename_with_skip_headers_and_footers(
|
||||
@ -570,26 +560,25 @@ def test_partition_html_grabs_emphasized_texts():
|
||||
elements = partition_html(text=html_text)
|
||||
|
||||
assert elements[0] == NarrativeText("Hello there I am a very important text!")
|
||||
assert elements[0].metadata.emphasized_texts == [
|
||||
{"text": "important", "tag": "strong"},
|
||||
]
|
||||
assert elements[0].metadata.emphasized_text_contents == ["important"]
|
||||
assert elements[0].metadata.emphasized_text_tags == ["strong"]
|
||||
|
||||
assert elements[1] == NarrativeText("Here is a list of my favorite things")
|
||||
assert elements[1].metadata.emphasized_texts == [
|
||||
{"text": "list", "tag": "span"},
|
||||
{"text": "my favorite things", "tag": "b"},
|
||||
{"text": "favorite", "tag": "i"},
|
||||
assert elements[1].metadata.emphasized_text_contents == [
|
||||
"list",
|
||||
"my favorite things",
|
||||
"favorite",
|
||||
]
|
||||
assert elements[1].metadata.emphasized_text_tags == ["span", "b", "i"]
|
||||
|
||||
assert elements[2] == ListItem("Parrots")
|
||||
assert elements[2].metadata.emphasized_texts == [
|
||||
{"text": "Parrots", "tag": "em"},
|
||||
]
|
||||
assert elements[2].metadata.emphasized_text_contents == ["Parrots"]
|
||||
assert elements[2].metadata.emphasized_text_tags == ["em"]
|
||||
|
||||
assert elements[3] == ListItem("Dogs")
|
||||
assert elements[3].metadata.emphasized_texts is None
|
||||
assert elements[3].metadata.emphasized_text_contents is None
|
||||
assert elements[3].metadata.emphasized_text_tags is None
|
||||
|
||||
assert elements[4] == Title("A lone span text!")
|
||||
assert elements[4].metadata.emphasized_texts == [
|
||||
{"text": "A lone span text!", "tag": "span"},
|
||||
]
|
||||
assert elements[4].metadata.emphasized_text_contents == ["A lone span text!"]
|
||||
assert elements[4].metadata.emphasized_text_tags == ["span"]
|
||||
|
||||
@ -15,21 +15,19 @@
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -36,11 +36,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Weather.gov",
|
||||
"url": "https://www.weather.gov"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov"
|
||||
],
|
||||
"link_texts": [
|
||||
"Weather.gov"
|
||||
]
|
||||
},
|
||||
"text": "Weather.gov >"
|
||||
@ -52,11 +52,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "News Around NOAA",
|
||||
"url": "https://www.weather.gov/news"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/news"
|
||||
],
|
||||
"link_texts": [
|
||||
"News Around NOAA"
|
||||
]
|
||||
},
|
||||
"text": "News Around NOAA > Are You Weather-Ready for the Spring?"
|
||||
@ -68,91 +68,51 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Weather Safety",
|
||||
"url": "http://www.weather.gov/safetycampaign"
|
||||
},
|
||||
{
|
||||
"text": "Air Quality",
|
||||
"url": "https://www.weather.gov/safety/airquality"
|
||||
},
|
||||
{
|
||||
"text": "Beach Hazards",
|
||||
"url": "https://www.weather.gov/safety/beachhazards"
|
||||
},
|
||||
{
|
||||
"text": "Cold",
|
||||
"url": "https://www.weather.gov/safety/cold"
|
||||
},
|
||||
{
|
||||
"text": "Cold Water",
|
||||
"url": "https://www.weather.gov/safety/coldwater"
|
||||
},
|
||||
{
|
||||
"text": "Drought",
|
||||
"url": "https://www.weather.gov/safety/drought"
|
||||
},
|
||||
{
|
||||
"text": "Floods",
|
||||
"url": "https://www.weather.gov/safety/flood"
|
||||
},
|
||||
{
|
||||
"text": "Fog",
|
||||
"url": "https://www.weather.gov/safety/fog"
|
||||
},
|
||||
{
|
||||
"text": "Heat",
|
||||
"url": "https://www.weather.gov/safety/heat"
|
||||
},
|
||||
{
|
||||
"text": " Hurricanes",
|
||||
"url": "https://www.weather.gov/safety/hurricane"
|
||||
},
|
||||
{
|
||||
"text": " Lightning Safety",
|
||||
"url": "https://www.weather.gov/safety/lightning"
|
||||
},
|
||||
{
|
||||
"text": "Rip Currents",
|
||||
"url": "https://www.weather.gov/safety/ripcurrent"
|
||||
},
|
||||
{
|
||||
"text": "Safe Boating",
|
||||
"url": "https://www.weather.gov/safety/safeboating"
|
||||
},
|
||||
{
|
||||
"text": "Space Weather",
|
||||
"url": "https://www.weather.gov/safety/space"
|
||||
},
|
||||
{
|
||||
"text": "Sun (Ultraviolet Radiation)",
|
||||
"url": "https://www.weather.gov/safety/heat-uv"
|
||||
},
|
||||
{
|
||||
"text": " Thunderstorms & Tornadoes",
|
||||
"url": "https://www.weather.gov/safety/thunderstorm"
|
||||
},
|
||||
{
|
||||
"text": "Tornado",
|
||||
"url": "https://www.weather.gov/safety/tornado"
|
||||
},
|
||||
{
|
||||
"text": "Tsunami",
|
||||
"url": "https://www.weather.gov/safety/tsunami"
|
||||
},
|
||||
{
|
||||
"text": "Wildfire",
|
||||
"url": "https://www.weather.gov/safety/wildfire"
|
||||
},
|
||||
{
|
||||
"text": "Wind",
|
||||
"url": "https://www.weather.gov/safety/wind"
|
||||
},
|
||||
{
|
||||
"text": "Winter",
|
||||
"url": "https://www.weather.gov/safety/winter "
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.weather.gov/safetycampaign",
|
||||
"https://www.weather.gov/safety/airquality",
|
||||
"https://www.weather.gov/safety/beachhazards",
|
||||
"https://www.weather.gov/safety/cold",
|
||||
"https://www.weather.gov/safety/coldwater",
|
||||
"https://www.weather.gov/safety/drought",
|
||||
"https://www.weather.gov/safety/flood",
|
||||
"https://www.weather.gov/safety/fog",
|
||||
"https://www.weather.gov/safety/heat",
|
||||
"https://www.weather.gov/safety/hurricane",
|
||||
"https://www.weather.gov/safety/lightning",
|
||||
"https://www.weather.gov/safety/ripcurrent",
|
||||
"https://www.weather.gov/safety/safeboating",
|
||||
"https://www.weather.gov/safety/space",
|
||||
"https://www.weather.gov/safety/heat-uv",
|
||||
"https://www.weather.gov/safety/thunderstorm",
|
||||
"https://www.weather.gov/safety/tornado",
|
||||
"https://www.weather.gov/safety/tsunami",
|
||||
"https://www.weather.gov/safety/wildfire",
|
||||
"https://www.weather.gov/safety/wind",
|
||||
"https://www.weather.gov/safety/winter "
|
||||
],
|
||||
"link_texts": [
|
||||
"Weather Safety",
|
||||
"Air Quality",
|
||||
"Beach Hazards",
|
||||
"Cold",
|
||||
"Cold Water",
|
||||
"Drought",
|
||||
"Floods",
|
||||
"Fog",
|
||||
"Heat",
|
||||
" Hurricanes",
|
||||
" Lightning Safety",
|
||||
"Rip Currents",
|
||||
"Safe Boating",
|
||||
"Space Weather",
|
||||
"Sun (Ultraviolet Radiation)",
|
||||
" Thunderstorms & Tornadoes",
|
||||
"Tornado",
|
||||
"Tsunami",
|
||||
"Wildfire",
|
||||
"Wind",
|
||||
"Winter"
|
||||
]
|
||||
},
|
||||
"text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter"
|
||||
@ -164,35 +124,23 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Safety Campaigns",
|
||||
"url": "https://www.weather.gov/safetycampaign"
|
||||
},
|
||||
{
|
||||
"text": "Seasonal Safety Campaigns",
|
||||
"url": "https://www.weather.gov/safetycampaign"
|
||||
},
|
||||
{
|
||||
"text": "#SafePlaceSelfie",
|
||||
"url": "https://www.weather.gov/wrn/safeplaceselfie"
|
||||
},
|
||||
{
|
||||
"text": "Deaf & Hard of Hearing",
|
||||
"url": "https://www.weather.gov/wrn/dhh-safety"
|
||||
},
|
||||
{
|
||||
"text": "Intellectual Disabilities",
|
||||
"url": "https://www.weather.gov/wrn/intellectualdisabilities"
|
||||
},
|
||||
{
|
||||
"text": "Spanish-language Content",
|
||||
"url": "https://www.weather.gov/wrn/fall2020-espanol-sm"
|
||||
},
|
||||
{
|
||||
"text": "The Great Outdoors",
|
||||
"url": "https://www.noaa.gov/explainers/great-outdoors-weather-safety"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/safetycampaign",
|
||||
"https://www.weather.gov/safetycampaign",
|
||||
"https://www.weather.gov/wrn/safeplaceselfie",
|
||||
"https://www.weather.gov/wrn/dhh-safety",
|
||||
"https://www.weather.gov/wrn/intellectualdisabilities",
|
||||
"https://www.weather.gov/wrn/fall2020-espanol-sm",
|
||||
"https://www.noaa.gov/explainers/great-outdoors-weather-safety"
|
||||
],
|
||||
"link_texts": [
|
||||
"Safety Campaigns",
|
||||
"Seasonal Safety Campaigns",
|
||||
"#SafePlaceSelfie",
|
||||
"Deaf & Hard of Hearing",
|
||||
"Intellectual Disabilities",
|
||||
"Spanish-language Content",
|
||||
"The Great Outdoors"
|
||||
]
|
||||
},
|
||||
"text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors"
|
||||
@ -204,59 +152,35 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Ambassador",
|
||||
"url": "https://www.weather.gov/wrn/ambassadors"
|
||||
},
|
||||
{
|
||||
"text": "About WRN Ambassadors",
|
||||
"url": "https://www.weather.gov/wrn/ambassadors"
|
||||
},
|
||||
{
|
||||
"text": "Become an Ambassador",
|
||||
"url": "https://www.weather.gov/wrn/amb-tou"
|
||||
},
|
||||
{
|
||||
"text": "Ambassadors of Excellence",
|
||||
"url": "https://www.weather.gov/wrn/ambassador_recognition"
|
||||
},
|
||||
{
|
||||
"text": "People of WRN",
|
||||
"url": "https://www.weather.gov/people/"
|
||||
},
|
||||
{
|
||||
"text": " FAQS",
|
||||
"url": "https://www.weather.gov/wrn/amb-faqs"
|
||||
},
|
||||
{
|
||||
"text": "Tell Your Success Story",
|
||||
"url": "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform"
|
||||
},
|
||||
{
|
||||
"text": " Success Stories",
|
||||
"url": " https://www.weather.gov/wrn/success-stories"
|
||||
},
|
||||
{
|
||||
"text": "Tri-fold",
|
||||
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Aviation",
|
||||
"url": "https://www.weather.gov/wrn/aviation"
|
||||
},
|
||||
{
|
||||
"text": " Current Ambassadors",
|
||||
"url": " http://www.weather.gov/wrn/current-ambassadors"
|
||||
},
|
||||
{
|
||||
"text": "Brochure",
|
||||
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
|
||||
},
|
||||
{
|
||||
"text": "En Español",
|
||||
"url": "https://www.weather.gov/wrn/en-espanol"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/ambassadors",
|
||||
"https://www.weather.gov/wrn/ambassadors",
|
||||
"https://www.weather.gov/wrn/amb-tou",
|
||||
"https://www.weather.gov/wrn/ambassador_recognition",
|
||||
"https://www.weather.gov/people/",
|
||||
"https://www.weather.gov/wrn/amb-faqs",
|
||||
"https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform",
|
||||
" https://www.weather.gov/wrn/success-stories",
|
||||
"http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf",
|
||||
"https://www.weather.gov/wrn/aviation",
|
||||
" http://www.weather.gov/wrn/current-ambassadors",
|
||||
"http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
|
||||
"https://www.weather.gov/wrn/en-espanol"
|
||||
],
|
||||
"link_texts": [
|
||||
"Ambassador",
|
||||
"About WRN Ambassadors",
|
||||
"Become an Ambassador",
|
||||
"Ambassadors of Excellence",
|
||||
"People of WRN",
|
||||
" FAQS",
|
||||
"Tell Your Success Story",
|
||||
" Success Stories",
|
||||
"Tri-fold",
|
||||
"Aviation",
|
||||
" Current Ambassadors",
|
||||
"Brochure",
|
||||
"En Español"
|
||||
]
|
||||
},
|
||||
"text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español"
|
||||
@ -268,51 +192,31 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Education",
|
||||
"url": "http://www.weather.gov/owlie/"
|
||||
},
|
||||
{
|
||||
"text": "NWS Education Home",
|
||||
"url": "http://www.weather.gov/owlie/"
|
||||
},
|
||||
{
|
||||
"text": "Be A Force Of Nature",
|
||||
"url": "https://www.weather.gov/wrn/force"
|
||||
},
|
||||
{
|
||||
"text": "WRN Kids Flyer",
|
||||
"url": " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Wireless Emergency Alerts",
|
||||
"url": "https://www.weather.gov/wrn/wea"
|
||||
},
|
||||
{
|
||||
"text": "NOAA Weather Radio",
|
||||
"url": "http://www.nws.noaa.gov/nwr/"
|
||||
},
|
||||
{
|
||||
"text": "Mobile Weather",
|
||||
"url": "https://www.weather.gov/wrn/mobile-phone"
|
||||
},
|
||||
{
|
||||
"text": "Brochures",
|
||||
"url": "http://www.weather.gov/owlie/publication_brochures"
|
||||
},
|
||||
{
|
||||
"text": "Hourly Weather Forecast",
|
||||
"url": "https://www.weather.gov/wrn/hourly-weather-graph"
|
||||
},
|
||||
{
|
||||
"text": "Citizen Science",
|
||||
"url": "http://www.weather.gov/media/wrn/citizen_science_page.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Intellectual Disabilities",
|
||||
"url": "https://www.weather.gov/wrn/intellectualdisabilities"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.weather.gov/owlie/",
|
||||
"http://www.weather.gov/owlie/",
|
||||
"https://www.weather.gov/wrn/force",
|
||||
" http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf",
|
||||
"https://www.weather.gov/wrn/wea",
|
||||
"http://www.nws.noaa.gov/nwr/",
|
||||
"https://www.weather.gov/wrn/mobile-phone",
|
||||
"http://www.weather.gov/owlie/publication_brochures",
|
||||
"https://www.weather.gov/wrn/hourly-weather-graph",
|
||||
"http://www.weather.gov/media/wrn/citizen_science_page.pdf",
|
||||
"https://www.weather.gov/wrn/intellectualdisabilities"
|
||||
],
|
||||
"link_texts": [
|
||||
"Education",
|
||||
"NWS Education Home",
|
||||
"Be A Force Of Nature",
|
||||
"WRN Kids Flyer",
|
||||
"Wireless Emergency Alerts",
|
||||
"NOAA Weather Radio",
|
||||
"Mobile Weather",
|
||||
"Brochures",
|
||||
"Hourly Weather Forecast",
|
||||
"Citizen Science",
|
||||
"Intellectual Disabilities"
|
||||
]
|
||||
},
|
||||
"text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities"
|
||||
@ -324,47 +228,29 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Collaboration",
|
||||
"url": "https://www.weather.gov/wrn/collaborate"
|
||||
},
|
||||
{
|
||||
"text": "Get Involved ",
|
||||
"url": "https://www.weather.gov/wrn/get-involved"
|
||||
},
|
||||
{
|
||||
"text": "Social Media",
|
||||
"url": "http://www.weather.gov/socialmedia"
|
||||
},
|
||||
{
|
||||
"text": "WRN Ambassadors ",
|
||||
"url": "https://www.weather.gov/wrn/ambassadors"
|
||||
},
|
||||
{
|
||||
"text": "Enterprise Resources",
|
||||
"url": "https://www.weather.gov/enterprise/"
|
||||
},
|
||||
{
|
||||
"text": "StormReady",
|
||||
"url": "http://www.weather.gov/stormready/"
|
||||
},
|
||||
{
|
||||
"text": "TsunamiReady",
|
||||
"url": "https://www.weather.gov/tsunamiready/"
|
||||
},
|
||||
{
|
||||
"text": "NWSChat (core partners only)",
|
||||
"url": "https://nwschat.weather.gov/"
|
||||
},
|
||||
{
|
||||
"text": "InteractiveNWS (iNWS) (core partners only)",
|
||||
"url": "https://inws.ncep.noaa.gov/"
|
||||
},
|
||||
{
|
||||
"text": "SKYWARN",
|
||||
"url": "https://www.weather.gov/SKYWARN"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/collaborate",
|
||||
"https://www.weather.gov/wrn/get-involved",
|
||||
"http://www.weather.gov/socialmedia",
|
||||
"https://www.weather.gov/wrn/ambassadors",
|
||||
"https://www.weather.gov/enterprise/",
|
||||
"http://www.weather.gov/stormready/",
|
||||
"https://www.weather.gov/tsunamiready/",
|
||||
"https://nwschat.weather.gov/",
|
||||
"https://inws.ncep.noaa.gov/",
|
||||
"https://www.weather.gov/SKYWARN"
|
||||
],
|
||||
"link_texts": [
|
||||
"Collaboration",
|
||||
"Get Involved ",
|
||||
"Social Media",
|
||||
"WRN Ambassadors ",
|
||||
"Enterprise Resources",
|
||||
"StormReady",
|
||||
"TsunamiReady",
|
||||
"NWSChat (core partners only)",
|
||||
"InteractiveNWS (iNWS) (core partners only)",
|
||||
"SKYWARN"
|
||||
]
|
||||
},
|
||||
"text": "Collaboration Get Involved Social Media WRN Ambassadors Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only) SKYWARN"
|
||||
@ -376,27 +262,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": " News & Events",
|
||||
"url": "http://www.weather.gov/news/"
|
||||
},
|
||||
{
|
||||
"text": "Latest News",
|
||||
"url": " http://www.weather.gov/news/"
|
||||
},
|
||||
{
|
||||
"text": "Calendar",
|
||||
"url": "https://www.weather.gov/wrn/calendar"
|
||||
},
|
||||
{
|
||||
"text": "Meetings & Workshops",
|
||||
"url": " https://www.weather.gov/wrn/workshops"
|
||||
},
|
||||
{
|
||||
"text": "NWS Aware Newsletter",
|
||||
"url": "https://www.weather.gov/publications/aware"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.weather.gov/news/",
|
||||
" http://www.weather.gov/news/",
|
||||
"https://www.weather.gov/wrn/calendar",
|
||||
" https://www.weather.gov/wrn/workshops",
|
||||
"https://www.weather.gov/publications/aware"
|
||||
],
|
||||
"link_texts": [
|
||||
" News & Events",
|
||||
"Latest News",
|
||||
"Calendar",
|
||||
"Meetings & Workshops",
|
||||
"NWS Aware Newsletter"
|
||||
]
|
||||
},
|
||||
"text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter"
|
||||
@ -408,11 +286,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "International",
|
||||
"url": "https://www.weather.gov/wrn/wrns"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/wrns"
|
||||
],
|
||||
"link_texts": [
|
||||
"International"
|
||||
]
|
||||
},
|
||||
"text": "International"
|
||||
@ -424,51 +302,31 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "About",
|
||||
"url": "https://www.weather.gov/wrn/about"
|
||||
},
|
||||
{
|
||||
"text": "Contact Us",
|
||||
"url": " https://www.weather.gov/wrn/contact"
|
||||
},
|
||||
{
|
||||
"text": " What is WRN?",
|
||||
"url": "https://www.weather.gov/wrn/about"
|
||||
},
|
||||
{
|
||||
"text": " WRN FAQ",
|
||||
"url": "https://www.weather.gov/wrn/faqs"
|
||||
},
|
||||
{
|
||||
"text": "WRN Brochure",
|
||||
"url": "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Hazard Simplification",
|
||||
"url": "https://www.weather.gov/hazardsimplification/"
|
||||
},
|
||||
{
|
||||
"text": "IDSS Brochure",
|
||||
"url": "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Roadmap",
|
||||
"url": "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf"
|
||||
},
|
||||
{
|
||||
"text": "Strategic Plan",
|
||||
"url": "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf"
|
||||
},
|
||||
{
|
||||
"text": "WRN International",
|
||||
"url": " https://www.weather.gov/wrn/international"
|
||||
},
|
||||
{
|
||||
"text": "Social Science",
|
||||
"url": "https://vlab.noaa.gov/web/nws-social-science"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/about",
|
||||
" https://www.weather.gov/wrn/contact",
|
||||
"https://www.weather.gov/wrn/about",
|
||||
"https://www.weather.gov/wrn/faqs",
|
||||
"http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf",
|
||||
"https://www.weather.gov/hazardsimplification/",
|
||||
"https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf",
|
||||
"http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf",
|
||||
"https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf",
|
||||
" https://www.weather.gov/wrn/international",
|
||||
"https://vlab.noaa.gov/web/nws-social-science"
|
||||
],
|
||||
"link_texts": [
|
||||
"About",
|
||||
"Contact Us",
|
||||
" What is WRN?",
|
||||
" WRN FAQ",
|
||||
"WRN Brochure",
|
||||
"Hazard Simplification",
|
||||
"IDSS Brochure",
|
||||
"Roadmap",
|
||||
"Strategic Plan",
|
||||
"WRN International",
|
||||
"Social Science"
|
||||
]
|
||||
},
|
||||
"text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science"
|
||||
@ -500,11 +358,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.",
|
||||
"tag": "strong"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"First, take steps to better prepare for the seasonal hazards weather can throw at you."
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||
@ -516,17 +374,17 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Spring Safety website",
|
||||
"url": "https://www.weather.gov/wrn/spring-safety"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/spring-safety"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "Second, encourage others to become Weather-Ready as well.",
|
||||
"tag": "strong"
|
||||
}
|
||||
"link_texts": [
|
||||
"Spring Safety website"
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"Second, encourage others to become Weather-Ready as well."
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic."
|
||||
@ -538,11 +396,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "infographics",
|
||||
"url": "https://www.weather.gov/wrn/spring-infographics"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/wrn/spring-infographics"
|
||||
],
|
||||
"link_texts": [
|
||||
"infographics"
|
||||
]
|
||||
},
|
||||
"text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available."
|
||||
@ -564,23 +422,17 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "US Dept of Commerce",
|
||||
"url": "http://www.commerce.gov"
|
||||
},
|
||||
{
|
||||
"text": "National Oceanic and Atmospheric Administration",
|
||||
"url": "http://www.noaa.gov"
|
||||
},
|
||||
{
|
||||
"text": "National Weather Service",
|
||||
"url": "https://www.weather.gov"
|
||||
},
|
||||
{
|
||||
"text": "Comments? Questions? Please Contact Us.",
|
||||
"url": "https://www.weather.gov/news/contact"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.commerce.gov",
|
||||
"http://www.noaa.gov",
|
||||
"https://www.weather.gov",
|
||||
"https://www.weather.gov/news/contact"
|
||||
],
|
||||
"link_texts": [
|
||||
"US Dept of Commerce",
|
||||
"National Oceanic and Atmospheric Administration",
|
||||
"National Weather Service",
|
||||
"Comments? Questions? Please Contact Us."
|
||||
]
|
||||
},
|
||||
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us."
|
||||
@ -592,11 +444,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Disclaimer",
|
||||
"url": "https://www.weather.gov/disclaimer"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/disclaimer"
|
||||
],
|
||||
"link_texts": [
|
||||
"Disclaimer"
|
||||
]
|
||||
},
|
||||
"text": "Disclaimer"
|
||||
@ -608,11 +460,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Information Quality",
|
||||
"url": "http://www.cio.noaa.gov/services_programs/info_quality.html"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.cio.noaa.gov/services_programs/info_quality.html"
|
||||
],
|
||||
"link_texts": [
|
||||
"Information Quality"
|
||||
]
|
||||
},
|
||||
"text": "Information Quality"
|
||||
@ -624,11 +476,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Help",
|
||||
"url": "https://www.weather.gov/help"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/help"
|
||||
],
|
||||
"link_texts": [
|
||||
"Help"
|
||||
]
|
||||
},
|
||||
"text": "Help"
|
||||
@ -640,11 +492,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Glossary",
|
||||
"url": "http://www.weather.gov/glossary"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://www.weather.gov/glossary"
|
||||
],
|
||||
"link_texts": [
|
||||
"Glossary"
|
||||
]
|
||||
},
|
||||
"text": "Glossary"
|
||||
@ -656,11 +508,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Privacy Policy",
|
||||
"url": "https://www.weather.gov/privacy"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/privacy"
|
||||
],
|
||||
"link_texts": [
|
||||
"Privacy Policy"
|
||||
]
|
||||
},
|
||||
"text": "Privacy Policy"
|
||||
@ -672,11 +524,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Freedom of Information Act (FOIA)",
|
||||
"url": "https://www.noaa.gov/foia-freedom-of-information-act"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.noaa.gov/foia-freedom-of-information-act"
|
||||
],
|
||||
"link_texts": [
|
||||
"Freedom of Information Act (FOIA)"
|
||||
]
|
||||
},
|
||||
"text": "Freedom of Information Act (FOIA)"
|
||||
@ -688,11 +540,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "About Us",
|
||||
"url": "https://www.weather.gov/about"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/about"
|
||||
],
|
||||
"link_texts": [
|
||||
"About Us"
|
||||
]
|
||||
},
|
||||
"text": "About Us"
|
||||
@ -704,11 +556,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Career Opportunities",
|
||||
"url": "https://www.weather.gov/careers"
|
||||
}
|
||||
"link_urls": [
|
||||
"https://www.weather.gov/careers"
|
||||
],
|
||||
"link_texts": [
|
||||
"Career Opportunities"
|
||||
]
|
||||
},
|
||||
"text": "Career Opportunities"
|
||||
|
||||
@ -16,11 +16,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "CHAPTER 1",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"CHAPTER 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "CHAPTER 1"
|
||||
@ -32,11 +32,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "INTRODUCTION",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"INTRODUCTION"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "INTRODUCTION"
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -16,11 +16,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "CHAPTER 1",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"CHAPTER 1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "CHAPTER 1"
|
||||
@ -32,11 +32,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"page_number": 1,
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "INTRODUCTION",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"INTRODUCTION"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "INTRODUCTION"
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -6,21 +6,19 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": null,
|
||||
"url": "index.html"
|
||||
},
|
||||
{
|
||||
"text": null,
|
||||
"url": "https://twitter.com/stef/status/1617222428727586816"
|
||||
}
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)",
|
||||
"tag": "i"
|
||||
}
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
"(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
|
||||
@ -16,11 +16,11 @@
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"links": [
|
||||
{
|
||||
"text": "Github Project Page",
|
||||
"url": "http://github.com/dcneiner/Downloadify"
|
||||
}
|
||||
"link_urls": [
|
||||
"http://github.com/dcneiner/Downloadify"
|
||||
],
|
||||
"link_texts": [
|
||||
"Github Project Page"
|
||||
]
|
||||
},
|
||||
"text": "More info available at the Github Project Page"
|
||||
|
||||
@ -5,11 +5,11 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "Title",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"Title"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "Title"
|
||||
@ -20,11 +20,11 @@
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "This is a good reason to continue",
|
||||
"tag": "b"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"This is a good reason to continue"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"b"
|
||||
]
|
||||
},
|
||||
"text": "This is a good reason to continue"
|
||||
|
||||
@ -14,11 +14,11 @@
|
||||
"devops@unstructuredio.onmicrosoft.com"
|
||||
],
|
||||
"subject": "subfolder1_1",
|
||||
"emphasized_texts": [
|
||||
{
|
||||
"text": "this is a message for the subfolder1_1",
|
||||
"tag": "span"
|
||||
}
|
||||
"emphasized_text_contents": [
|
||||
"this is a message for the subfolder1_1"
|
||||
],
|
||||
"emphasized_text_tags": [
|
||||
"span"
|
||||
]
|
||||
},
|
||||
"text": "this is a message for the subfolder1_1"
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.9.4-dev0" # pragma: no cover
|
||||
__version__ = "0.10.0" # pragma: no cover
|
||||
|
||||
@ -143,7 +143,8 @@ class ElementMetadata:
|
||||
|
||||
# Webpage specific metadata fields
|
||||
url: Optional[str] = None
|
||||
links: Optional[List[Link]] = None
|
||||
link_urls: Optional[List[str]] = None
|
||||
link_texts: Optional[List[str]] = None
|
||||
|
||||
# E-mail specific metadata fields
|
||||
sent_from: Optional[List[str]] = None
|
||||
@ -157,7 +158,8 @@ class ElementMetadata:
|
||||
header_footer_type: Optional[str] = None
|
||||
|
||||
# Formatting metadata fields
|
||||
emphasized_texts: Optional[List[dict]] = None
|
||||
emphasized_text_contents: Optional[List[str]] = None
|
||||
emphasized_text_tags: Optional[List[str]] = None
|
||||
|
||||
# Text format metadata fields
|
||||
text_as_html: Optional[str] = None
|
||||
|
||||
@ -165,11 +165,23 @@ def _add_element_metadata(
|
||||
else None
|
||||
)
|
||||
links = element.links if hasattr(element, "links") and len(element.links) > 0 else None
|
||||
link_urls = [link.get("url") for link in links] if links else None
|
||||
link_texts = [link.get("text") for link in links] if links else None
|
||||
emphasized_texts = (
|
||||
element.emphasized_texts
|
||||
if hasattr(element, "emphasized_texts") and len(element.emphasized_texts) > 0
|
||||
else None
|
||||
)
|
||||
emphasized_text_contents = (
|
||||
[emphasized_text.get("text") for emphasized_text in emphasized_texts]
|
||||
if emphasized_texts
|
||||
else None
|
||||
)
|
||||
emphasized_text_tags = (
|
||||
[emphasized_text.get("tag") for emphasized_text in emphasized_texts]
|
||||
if emphasized_texts
|
||||
else None
|
||||
)
|
||||
metadata = ElementMetadata(
|
||||
coordinates=coordinates_metadata,
|
||||
filename=filename,
|
||||
@ -177,8 +189,10 @@ def _add_element_metadata(
|
||||
page_number=page_number,
|
||||
url=url,
|
||||
text_as_html=text_as_html,
|
||||
links=links,
|
||||
emphasized_texts=emphasized_texts,
|
||||
link_urls=link_urls,
|
||||
link_texts=link_texts,
|
||||
emphasized_text_contents=emphasized_text_contents,
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
section=section,
|
||||
)
|
||||
element.metadata = metadata.merge(element.metadata)
|
||||
|
||||
@ -171,6 +171,9 @@ def partition_docx(
|
||||
if element_item.tag.endswith("tbl"):
|
||||
table = document.tables[table_index]
|
||||
emphasized_texts = _get_emphasized_texts_from_table(table)
|
||||
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
|
||||
emphasized_texts,
|
||||
)
|
||||
html_table = convert_ms_office_table_to_text(table, as_html=True)
|
||||
text_table = convert_ms_office_table_to_text(table, as_html=False)
|
||||
element = Table(text_table)
|
||||
@ -180,7 +183,8 @@ def partition_docx(
|
||||
filename=metadata_filename,
|
||||
page_number=page_number,
|
||||
last_modified=metadata_last_modified or last_modification_date,
|
||||
emphasized_texts=emphasized_texts if emphasized_texts else None,
|
||||
emphasized_text_contents=emphasized_text_contents,
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
)
|
||||
elements.append(element)
|
||||
table_index += 1
|
||||
@ -189,13 +193,17 @@ def partition_docx(
|
||||
is_list = True
|
||||
paragraph = docx.text.paragraph.Paragraph(element_item, document)
|
||||
emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
|
||||
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
|
||||
emphasized_texts,
|
||||
)
|
||||
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
|
||||
if para_element is not None:
|
||||
para_element.metadata = ElementMetadata(
|
||||
filename=metadata_filename,
|
||||
page_number=page_number,
|
||||
last_modified=metadata_last_modified or last_modification_date,
|
||||
emphasized_texts=emphasized_texts if emphasized_texts else None,
|
||||
emphasized_text_contents=emphasized_text_contents,
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
)
|
||||
elements.append(para_element)
|
||||
is_list = False
|
||||
@ -398,3 +406,30 @@ def _get_emphasized_texts_from_table(table: DocxTable) -> List[dict]:
|
||||
_emphasized_texts = _get_emphasized_texts_from_paragraph(paragraph)
|
||||
emphasized_texts += _emphasized_texts
|
||||
return emphasized_texts
|
||||
|
||||
|
||||
def _extract_contents_and_tags(
|
||||
emphasized_texts: List[dict],
|
||||
) -> Tuple[Optional[List[str]], Optional[List[str]]]:
|
||||
"""
|
||||
Extract the text contents and tags from a list of dictionaries containing emphasized texts.
|
||||
|
||||
Args:
|
||||
- emphasized_texts (List[dict]): A list containing dictionaries with keys "text" and "tag".
|
||||
|
||||
Returns:
|
||||
- Tuple[List[str], List[str]]: A tuple containing two lists -
|
||||
one for text contents and one for tags extracted from the input.
|
||||
"""
|
||||
emphasized_text_contents = (
|
||||
[emphasized_text["text"] for emphasized_text in emphasized_texts]
|
||||
if emphasized_texts
|
||||
else None
|
||||
)
|
||||
emphasized_text_tags = (
|
||||
[emphasized_text["tag"] for emphasized_text in emphasized_texts]
|
||||
if emphasized_texts
|
||||
else None
|
||||
)
|
||||
|
||||
return emphasized_text_contents, emphasized_text_tags
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user