fix: separate elements by <br> tag in partition_html (#1314)

### Summary

Closes #1230. Updates `partition_html` to split on `<br>` tags that
appear within text elements.


### Testing

The following code previously produced one giant element on `main`.

```python
from unstructured.partition.html import partition_html

filename = "example-docs/ideas-page.html"
elements = partition_html(filename=filename)

len(elements) # Should be 4
print("\n\n".join([str(el) for el in elements]))
```

The output should be:

```python
January 2023

(Someone fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from.  The
answer was ok, but not what I would have said. This is what I would have said.)

The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.

Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.
```
This commit is contained in:
Matt Robinson 2023-09-07 09:16:31 -04:00 committed by GitHub
parent 09cc4bfa5f
commit 22974f61ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 427 additions and 109 deletions

View File

@ -1,4 +1,4 @@
## 0.10.13-dev2
## 0.10.13-dev3
### Enhancements
@ -10,6 +10,7 @@
### Fixes
* `partition_html` now splits text into separate elements on `<br>` tags.
* Ingest error handling to properly raise errors when wrapped
## 0.10.12
@ -31,7 +32,7 @@
* Bump unstructured-inference
* Avoid divide-by-zero errors with `safe_division` (0.5.21)
## 0.10.11
### Enhancements

View File

@ -265,12 +265,28 @@ def test_partition_html_raises_with_too_many_specified():
partition_html(filename=filename, text=text)
def test_partition_html_on_ideas_page():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
elements = partition_html(filename=filename)
document_text = "\n\n".join([str(el) for el in elements])
assert document_text.startswith("January 2023(Someone fed my essays into GPT")
assert document_text.endswith("whole new fractal buds.")
assert len(elements) == 4
assert elements[0] == Title("January 2023")
assert elements[0].metadata.emphasized_text_contents is None
assert elements[0].metadata.link_urls is None
assert elements[1].text.startswith("(Someone fed my essays")
assert elements[1].text.endswith("I would have said.)")
assert len(elements[1].metadata.emphasized_text_contents) == 1
assert len(elements[1].metadata.link_urls) == 1
assert elements[2].text.startswith("The way to get new ideas")
assert elements[2].text.endswith("the frontiers of knowledge.")
assert elements[2].metadata.emphasized_text_contents is None
assert elements[2].metadata.link_urls is None
assert elements[3].text.startswith("Knowledge grows fractally")
assert elements[3].text.endswith("whole new fractal buds.")
assert elements[3].metadata.emphasized_text_contents is None
assert elements[3].metadata.link_urls is None
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):

View File

@ -1,18 +1,27 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -22,6 +31,28 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filename": "ideas-page.html",
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -353,7 +353,7 @@
},
{
"type": "NarrativeText",
"element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
"element_id": "073a8fd4fe21204eff8c0ca133f6993f",
"metadata": {
"data_source": {},
"filetype": "text/html",
@ -365,7 +365,17 @@
"strong"
]
},
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you."
},
{
"type": "NarrativeText",
"element_id": "d97aee85f18639e200b29757e5783dad",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
},
{
"type": "NarrativeText",
@ -416,26 +426,98 @@
"text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready."
},
{
"type": "NarrativeText",
"element_id": "47d5d0d27a35a36d7467dfc8b6e089b3",
"type": "Title",
"element_id": "c9b4b8b324383371034a3682d0d712d2",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"http://www.commerce.gov"
],
"link_texts": [
"US Dept of Commerce"
]
},
"text": "US Dept of Commerce"
},
{
"type": "Title",
"element_id": "668c4fe04cbbc45c7e91b0b675dd48a3",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"http://www.noaa.gov"
],
"link_texts": [
"National Oceanic and Atmospheric Administration"
]
},
"text": "National Oceanic and Atmospheric Administration"
},
{
"type": "Title",
"element_id": "a5c0620dc25afae7e2761c210037b45c",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"https://www.weather.gov"
],
"link_texts": [
"National Weather Service"
]
},
"text": "National Weather Service"
},
{
"type": "Title",
"element_id": "41f6e17bf5e9a407fcca74e902f802a0",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "News Around NOAA"
},
{
"type": "Title",
"element_id": "d27040ad6074797db8e535d1fba3b5d8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "1325 East West Highway"
},
{
"type": "Address",
"element_id": "7ab3e0275d15e2c26b18983db0685ddb",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Silver Spring, MD 20910"
},
{
"type": "Title",
"element_id": "1b0316a06a8f4d5b672669bb9f5b2877",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"http://www.commerce.gov",
"http://www.noaa.gov",
"https://www.weather.gov",
"https://www.weather.gov/news/contact"
],
"link_texts": [
"US Dept of Commerce",
"National Oceanic and Atmospheric Administration",
"National Weather Service",
"Comments? Questions? Please Contact Us."
]
},
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us."
"text": "Comments? Questions? Please Contact Us."
},
{
"type": "Title",

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -219,26 +219,6 @@
},
"text": "Nice to have:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "7f999c0456e4e85cc028aa6ed90455d4",
@ -255,26 +235,6 @@
},
"text": "Not in scope:"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "e8b61a28d07e977379b42df455a1cde4",

View File

@ -133,16 +133,6 @@
},
"text": "Notes"
},
{
"type": "ListItem",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": ""
},
{
"type": "Title",
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -1,17 +1,25 @@
[
{
"type": "Title",
"element_id": "17c1a6701c263407d0fcf7c3ebfb2986",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "January 2023"
},
{
"type": "NarrativeText",
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8",
"element_id": "6ea0e510b7ea64f87b55c1fe388cba7f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1,
"link_urls": [
"index.html",
"https://twitter.com/stef/status/1617222428727586816"
],
"link_texts": [
null,
null
],
"emphasized_text_contents": [
@ -21,6 +29,26 @@
"i"
]
},
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
"text": "(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)"
},
{
"type": "NarrativeText",
"element_id": "a8ce0a2e7d66af2000e6c3bd36994411",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge."
},
{
"type": "NarrativeText",
"element_id": "4eafbff98b81999dfbf3572440d22393",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds."
}
]

View File

@ -36,14 +36,24 @@
"text": "Filename"
},
{
"type": "NarrativeText",
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
"type": "Title",
"element_id": "4112a488690bdbc1d39d5b78068eae9f",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded"
"text": "File Contents"
},
{
"type": "NarrativeText",
"element_id": "f89c9cf63bd2e72f560ee043d942a1e7",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Whatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded"
},
{
"type": "NarrativeText",

View File

@ -1 +1 @@
__version__ = "0.10.13-dev2" # pragma: no cover
__version__ = "0.10.13-dev3" # pragma: no cover

View File

@ -36,9 +36,10 @@ TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
TEXTBREAK_TAGS: Final[List[str]] = ["br"]
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
HEADER_OR_FOOTER_TAGS: Final[List[str]] = ["header", "footer"]
EMPTY_TAGS: Final[List[str]] = ["br", "hr"]
SECTION_TAGS: Final[List[str]] = ["div", "pre"]
@ -136,10 +137,18 @@ class HTMLDocument(XMLDocument):
continue
if _is_text_tag(tag_elem):
element = _parse_tag(tag_elem)
if element is not None:
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
if _has_break_tags(tag_elem):
flattened_elems = _unfurl_break_tags(tag_elem)
for _tag_elem in flattened_elems:
element = _parse_tag(_tag_elem)
if element is not None:
page.elements.append(element)
else:
element = _parse_tag(tag_elem)
if element is not None:
page.elements.append(element)
descendanttag_elems = tuple(tag_elem.iterdescendants())
elif _is_container_with_text(tag_elem):
links = _get_links_from_tag(tag_elem)
@ -385,6 +394,29 @@ def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) ->
return text.strip()
def _has_break_tags(tag_elem: etree.Element) -> bool:
    """Report whether any descendant of ``tag_elem`` is a text-break tag.

    Only descendants are inspected (via ``iterdescendants``); the tag of
    ``tag_elem`` itself is not checked. A descendant counts as a break tag
    when its ``tag`` is listed in ``TEXTBREAK_TAGS``.
    """
    return any(node.tag in TEXTBREAK_TAGS for node in tag_elem.iterdescendants())
def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]:
    """Flatten the children of ``tag_elem`` so break tags separate the pieces.

    Each direct child is handled in document order:

    * A child with no break-tag descendants is returned unchanged.
    * A child that does contain break tags is expanded: its leading text (if
      any) is wrapped in a new, detached element carrying the same tag name,
      and the child's own children are then unfurled recursively.

    Args:
        tag_elem: The element whose children should be flattened.

    Returns:
        A flat list of elements, in document order.
    """
    unfurled = []
    # NOTE(review): text held directly on ``tag_elem`` (before its first child)
    # is never read here, so on the outermost call it is not emitted — confirm
    # that callers do not rely on it.
    #
    # ``list(elem)`` is the documented replacement for the deprecated
    # ``Element.getchildren()``; it yields the same direct children.
    for child in list(tag_elem):
        if not _has_break_tags(child):
            unfurled.append(child)
            continue

        if child.text:
            # Synthetic element that carries only the leading text of ``child``.
            # NOTE(review): ``child.tail`` is not copied onto it — confirm this
            # is intentional.
            _tag_elem = etree.Element(child.tag)
            _tag_elem.text = child.text
            unfurled.append(_tag_elem)
        unfurled.extend(_unfurl_break_tags(child))
    return unfurled
def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
"""Deteremines if a tag potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,