mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 23:51:47 +00:00
fix: separate elements by <br> tag in partition_html (#1314)
### Summary

Closes #1230. Updates `partition_html` to split on `<br>` tags that appear within text elements.

### Testing

The following code previously produced one giant element on `main`.

```python
from unstructured.partition.html import partition_html

filename = "example-docs/ideas-page.html"
elements = partition_html(filename=filename)
len(elements)  # Should be 4
print("\n\n".join([str(el) for el in elements]))
```

The output should be:

```python
January 2023

(Someone fed my essays into GPT to make something that could answer questions based on them, then asked it where good ideas come from. The answer was ok, but not what I would have said. This is what I would have said.)

The way to get new ideas is to notice anomalies: what seems strange, or missing, or broken? You can see anomalies in everyday life (much of standup comedy is based on this), but the best place to look for them is at the frontiers of knowledge.

Knowledge grows fractally. From a distance its edges look smooth, but when you learn enough to get close to one, you'll notice it's full of gaps. These gaps will seem obvious; it will seem inexplicable that no one has tried x or wondered about y. In the best case, exploring such gaps yields whole new fractal buds.
```
This commit is contained in:
parent
09cc4bfa5f
commit
22974f61ce
@ -1,4 +1,4 @@
|
||||
## 0.10.13-dev2
|
||||
## 0.10.13-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* `partition_html` now splits text into separate elements at `<br>` tags.
|
||||
* Ingest error handling to properly raise errors when wrapped
|
||||
|
||||
## 0.10.12
|
||||
@ -31,7 +32,7 @@
|
||||
|
||||
* Bump unstructured-inference
|
||||
* Avoid divide-by-zero errors with `safe_division` (0.5.21)
|
||||
|
||||
|
||||
## 0.10.11
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -265,12 +265,28 @@ def test_partition_html_raises_with_too_many_specified():
|
||||
partition_html(filename=filename, text=text)
|
||||
|
||||
|
||||
def test_partition_html_on_ideas_page():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
|
||||
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
|
||||
elements = partition_html(filename=filename)
|
||||
document_text = "\n\n".join([str(el) for el in elements])
|
||||
assert document_text.startswith("January 2023(Someone fed my essays into GPT")
|
||||
assert document_text.endswith("whole new fractal buds.")
|
||||
assert len(elements) == 4
|
||||
|
||||
assert elements[0] == Title("January 2023")
|
||||
assert elements[0].metadata.emphasized_text_contents is None
|
||||
assert elements[0].metadata.link_urls is None
|
||||
|
||||
assert elements[1].text.startswith("(Someone fed my essays")
|
||||
assert elements[1].text.endswith("I would have said.)")
|
||||
assert len(elements[1].metadata.emphasized_text_contents) == 1
|
||||
assert len(elements[1].metadata.link_urls) == 1
|
||||
|
||||
assert elements[2].text.startswith("The way to get new ideas")
|
||||
assert elements[2].text.endswith("the frontiers of knowledge.")
|
||||
assert elements[2].metadata.emphasized_text_contents is None
|
||||
assert elements[2].metadata.link_urls is None
|
||||
|
||||
assert elements[3].text.startswith("Knowledge grows fractally")
|
||||
assert elements[3].text.endswith("whole new fractal buds.")
|
||||
assert elements[3].metadata.emphasized_text_contents is None
|
||||
assert elements[3].metadata.link_urls is None
|
||||
|
||||
|
||||
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
|
||||
|
||||
@ -1,18 +1,27 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -22,6 +31,28 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filename": "ideas-page.html",
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -353,7 +353,7 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
|
||||
"element_id": "073a8fd4fe21204eff8c0ca133f6993f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
@ -365,7 +365,17 @@
|
||||
"strong"
|
||||
]
|
||||
},
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d97aee85f18639e200b29757e5783dad",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -416,26 +426,98 @@
|
||||
"text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "47d5d0d27a35a36d7467dfc8b6e089b3",
|
||||
"type": "Title",
|
||||
"element_id": "c9b4b8b324383371034a3682d0d712d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"http://www.commerce.gov"
|
||||
],
|
||||
"link_texts": [
|
||||
"US Dept of Commerce"
|
||||
]
|
||||
},
|
||||
"text": "US Dept of Commerce"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "668c4fe04cbbc45c7e91b0b675dd48a3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"http://www.noaa.gov"
|
||||
],
|
||||
"link_texts": [
|
||||
"National Oceanic and Atmospheric Administration"
|
||||
]
|
||||
},
|
||||
"text": "National Oceanic and Atmospheric Administration"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "a5c0620dc25afae7e2761c210037b45c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"https://www.weather.gov"
|
||||
],
|
||||
"link_texts": [
|
||||
"National Weather Service"
|
||||
]
|
||||
},
|
||||
"text": "National Weather Service"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "41f6e17bf5e9a407fcca74e902f802a0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "News Around NOAA"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "d27040ad6074797db8e535d1fba3b5d8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "1325 East West Highway"
|
||||
},
|
||||
{
|
||||
"type": "Address",
|
||||
"element_id": "7ab3e0275d15e2c26b18983db0685ddb",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Silver Spring, MD 20910"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1b0316a06a8f4d5b672669bb9f5b2877",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"http://www.commerce.gov",
|
||||
"http://www.noaa.gov",
|
||||
"https://www.weather.gov",
|
||||
"https://www.weather.gov/news/contact"
|
||||
],
|
||||
"link_texts": [
|
||||
"US Dept of Commerce",
|
||||
"National Oceanic and Atmospheric Administration",
|
||||
"National Weather Service",
|
||||
"Comments? Questions? Please Contact Us."
|
||||
]
|
||||
},
|
||||
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us."
|
||||
"text": "Comments? Questions? Please Contact Us."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -219,26 +219,6 @@
|
||||
},
|
||||
"text": "Nice to have:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7f999c0456e4e85cc028aa6ed90455d4",
|
||||
@ -255,26 +235,6 @@
|
||||
},
|
||||
"text": "Not in scope:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "e8b61a28d07e977379b42df455a1cde4",
|
||||
|
||||
@ -133,16 +133,6 @@
|
||||
},
|
||||
"text": "Notes"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ""
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",
|
||||
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -1,17 +1,25 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "January 2023"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
|
||||
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1,
|
||||
"link_urls": [
|
||||
"index.html",
|
||||
"https://twitter.com/stef/status/1617222428727586816"
|
||||
],
|
||||
"link_texts": [
|
||||
null,
|
||||
null
|
||||
],
|
||||
"emphasized_text_contents": [
|
||||
@ -21,6 +29,26 @@
|
||||
"i"
|
||||
]
|
||||
},
|
||||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4eafbff98b81999dfbf3572440d22393",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
|
||||
}
|
||||
]
|
||||
@ -36,14 +36,24 @@
|
||||
"text": "Filename"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
|
||||
"type": "Title",
|
||||
"element_id": "4112a488690bdbc1d39d5b78068eae9f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded"
|
||||
"text": "File Contents"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f89c9cf63bd2e72f560ee043d942a1e7",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Whatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.13-dev2" # pragma: no cover
|
||||
__version__ = "0.10.13-dev3" # pragma: no cover
|
||||
|
||||
@ -36,9 +36,10 @@ TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
|
||||
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
|
||||
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
|
||||
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
|
||||
TEXTBREAK_TAGS: Final[List[str]] = ["br"]
|
||||
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
|
||||
EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
|
||||
HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"]
|
||||
EMPTY_TAGS: Final[List[str]] = ["br", "hr"]
|
||||
SECTION_TAGS: Final[List[str]] = ["div", "pre"]
|
||||
|
||||
|
||||
@ -136,10 +137,18 @@ class HTMLDocument(XMLDocument):
|
||||
continue
|
||||
|
||||
if _is_text_tag(tag_elem):
|
||||
element = _parse_tag(tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
if _has_break_tags(tag_elem):
|
||||
flattened_elems = _unfurl_break_tags(tag_elem)
|
||||
for _tag_elem in flattened_elems:
|
||||
element = _parse_tag(_tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
|
||||
else:
|
||||
element = _parse_tag(tag_elem)
|
||||
if element is not None:
|
||||
page.elements.append(element)
|
||||
descendanttag_elems = tuple(tag_elem.iterdescendants())
|
||||
|
||||
elif _is_container_with_text(tag_elem):
|
||||
links = _get_links_from_tag(tag_elem)
|
||||
@ -385,6 +394,29 @@ def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) ->
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _has_break_tags(tag_elem: etree.Element) -> bool:
    """Return True when any descendant of ``tag_elem`` is a text-break tag.

    Only descendants are inspected; ``tag_elem`` itself is never compared
    against ``TEXTBREAK_TAGS``.
    """
    return any(node.tag in TEXTBREAK_TAGS for node in tag_elem.iterdescendants())
|
||||
|
||||
|
||||
def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]:
    """Flatten the children of ``tag_elem`` into a list split around break tags.

    A child with no break-tag descendant is kept as-is. A child that does
    contain one is expanded recursively; when that child has leading text,
    the text is first emitted on a new standalone element carrying the same
    tag name, so it becomes its own entry ahead of the expanded descendants.

    ``tag_elem.text`` itself is not read by this function.

    Returns:
        The flattened elements, in document order.
    """
    unfurled = []
    # Iterate the element directly: lxml documents ``getchildren()`` as
    # deprecated in favour of ``list(element)`` / plain iteration. The tree is
    # not modified inside the loop, so no snapshot copy is needed.
    for child in tag_elem:
        if not _has_break_tags(child):
            unfurled.append(child)
            continue

        if child.text:
            # Only the tag name and leading text are copied onto the new,
            # detached element; the original child is left untouched.
            leading = etree.Element(child.tag)
            leading.text = child.text
            unfurled.append(leading)
        unfurled.extend(_unfurl_break_tags(child))

    return unfurled
|
||||
|
||||
|
||||
def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
|
||||
"""Deteremines if a tag potentially contains narrative text."""
|
||||
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user