fix: ensure all text is maintained in html output (#335)

* fix: ensure all text is maintained in html pages

* add back in replace unicode quotes

* changelog and version bump

* apt-get update in ci

* white space differences in output
This commit is contained in:
Matt Robinson 2023-03-02 14:03:13 -05:00 committed by GitHub
parent ed074b5828
commit a5da3de43b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 70 additions and 12 deletions

View File

@ -104,6 +104,7 @@ jobs:
run: |
source .venv/bin/activate
make install-detectron2
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
make test
make check-coverage

View File

@ -1,4 +1,4 @@
## 0.5.2-dev1
## 0.5.2
### Enhancements
@ -9,10 +9,11 @@ rather than a "tmp-ingest-" dir in the working directory.
### Fixes
* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
`DEBIAN_FRONTEND=noninteractive` as a command
* `unstructured-ingest` no longer re-downloads files when --preserve-downloads
is used without --download-dir.
* Fixed an issue that was causing text to be skipped in some HTML documents.
## 0.5.1

View File

@ -0,0 +1,44 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><script type="text/javascript">
<!--
(new Image).src="https://store.yahoo.net/cgi-bin/refsd?e=http://paulgraham.com/getideas.html&h=paulgraham.com&v=1.0&dr=" + escape(document.referrer);
-->
</script>
<head><title>How to Get New Ideas</title><!-- <META NAME="ROBOTS" CONTENT="NOODP"> -->
<link rel="shortcut icon" href="http://ycombinator.com/arc/arc.png">
</head><body bgcolor=ffffff background="https://sep.yimg.com/ca/I/paulgraham_2271_0" text=000000 link=000099 vlink=464646><table border=0 cellspacing=0 cellpadding=0><tr valign=top><td><map name=c04963d10de5f><area shape=rect coords="0,0,67,21" href="index.html"><area shape=rect coords="0,21,67,42" href="articles.html"><area shape=rect coords="0,42,67,63" href="http://www.amazon.com/gp/product/0596006624"><area shape=rect coords="0,63,67,84" href="books.html"><area shape=rect coords="0,84,67,105" href="http://ycombinator.com"><area shape=rect coords="0,105,67,126" href="arc.html"><area shape=rect coords="0,126,67,147" href="bel.html"><area shape=rect coords="0,147,67,168" href="lisp.html"><area shape=rect coords="0,168,67,189" href="antispam.html"><area shape=rect coords="0,189,67,210" href="kedrosky.html"><area shape=rect coords="0,210,67,231" href="faq.html"><area shape=rect coords="0,231,67,252" href="raq.html"><area shape=rect coords="0,252,67,273" href="quo.html"><area shape=rect coords="0,273,67,294" href="rss.html"><area shape=rect coords="0,294,67,315" href="bio.html"><area shape=rect coords="0,315,67,336" href="https://twitter.com/paulg"><area shape=rect coords="0,336,67,357" href="https://mas.to/@paulg"></map><img src="https://s.yimg.com/aah/paulgraham/img-20.gif" width=69 height=357 usemap=#c04963d10de5f border=0 hspace=0 vspace=0 ismap></td><td><img src="https://sep.yimg.com/ca/Img/trans_1x1.gif" height=1 width=26 border=0></td><td><a href="index.html"><img src="https://sep.yimg.com/ca/I/paulgraham_2271_3232" width=410 height=45 border=0 hspace=0 vspace=0></a><br><br><table border=0 cellspacing=0 cellpadding=0 width=435><tr valign=top><td width=435><img src="https://s.yimg.com/aah/paulgraham/how-to-get-new-ideas-1.gif" width=176 height=18 border=0 hspace=0 vspace=0 alt="How to Get New Ideas"><br><br><font size=2 face="verdana">January 2023<br><br><i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT to make something that could answer
questions based on them, then asked it where good ideas come from. The
answer was ok, but not what I would have said. This is what I would have said.)</i><br><br>The way to get new ideas is to notice anomalies: what seems strange,
or missing, or broken? You can see anomalies in everyday life (much
of standup comedy is based on this), but the best place to look for
them is at the frontiers of knowledge.<br><br>Knowledge grows fractally.
From a distance its edges look smooth, but when you learn enough
to get close to one, you'll notice it's full of gaps. These gaps
will seem obvious; it will seem inexplicable that no one has tried
x or wondered about y. In the best case, exploring such gaps yields
whole new fractal buds.<br><br></font></td></tr></table><table border=0 cellspacing=0 cellpadding=0 width=435><tr><td><font size=2 face="verdana"><br><br><hr></font></td></tr></table></td></tr></table></body>
<script type="text/javascript">
csell_env = 'bf1';
var storeCheckoutDomain = 'order.store.yahoo.net';
</script>
<script type="text/javascript">
// Begin Yahoo Store Generated Code
</script> <script type="text/javascript" src="https://s.turbifycdn.com/lq/ult/ylc_1.9.js" ></script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/lib/smbiz/store/csell/beacon-a9518fc6e4.js" >
</script>
<script type="text/javascript">
// Begin Yahoo Store Generated Code
csell_page_data = {}; csell_page_rec_data = []; ts='TOK_STORE_ID';
</script>
<script type="text/javascript">
// Begin Yahoo Store Generated Code
function csell_GLOBAL_INIT_TAG() { var csell_token_map = {}; csell_token_map['TOK_ITEM_ID_LIST'] = 'getideas'; csell_token_map['TOK_BEACON_TYPE'] = 'prod'; csell_token_map['TOK_RAND_KEY'] = 't'; csell_token_map['TOK_SPACEID'] = '2022276099'; csell_token_map['TOK_IS_ORDERABLE'] = '2'; csell_token_map['TOK_STORE_ID'] = 'paulgraham'; csell_token_map['TOK_URL'] = ''; csell_token_map['TOK_ORDER_HOST'] = 'order.store.yahoo.net'; c = csell_page_data; var x = (typeof storeCheckoutDomain == 'string')?storeCheckoutDomain:'order.store.yahoo.net'; var t = csell_token_map; c['s'] = t['TOK_SPACEID']; c['url'] = t['TOK_URL']; c['si'] = t[ts]; c['ii'] = t['TOK_ITEM_ID_LIST']; c['bt'] = t['TOK_BEACON_TYPE']; c['rnd'] = t['TOK_RAND_KEY']; c['io'] = t['TOK_IS_ORDERABLE']; YStore.addItemUrl = 'http%s://'+x+'/'+t[ts]+'/ymix/MetaController.html?eventName.addEvent&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_itemId=%s&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_quantity=1&ysco_key_cs_item=1&sectionId=ysco.cart&ysco_key_store_id='+t[ts]; }
</script>
<script type="text/javascript">
// Begin Yahoo Store Generated Code
function csell_REC_VIEW_TAG() { var env = (typeof csell_env == 'string')?csell_env:'prod'; var p = csell_page_data; var a = '/sid='+p['si']+'/io='+p['io']+'/ii='+p['ii']+'/bt='+p['bt']+'-view'+'/en='+env; var r=Math.random(); YStore.CrossSellBeacon.renderBeaconWithRecData(p['url']+'/p/s='+p['s']+'/'+p['rnd']+'='+r+a); }
</script>
<script type="text/javascript">
// Begin Yahoo Store Generated Code
var csell_token_map = {}; csell_token_map['TOK_PAGE'] = 'p'; csell_token_map['TOK_WS_URL'] = 'https://paulgraham.csell.store.yahoo.net/cs/recommend?itemids=getideas&location=p'; csell_token_map['TOK_SHOW_CS_RECS'] = 'false'; csell_token_map['TOK_CURR_SYM'] = '$'; var t = csell_token_map; csell_GLOBAL_INIT_TAG(); YStore.page = t['TOK_PAGE']; YStore.currencySymbol = t['TOK_CURR_SYM']; YStore.crossSellUrl = t['TOK_WS_URL']; YStore.showCSRecs = t['TOK_SHOW_CS_RECS']; </script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/store/secure/recs-1.3.2.2.js" ></script> <script type="text/javascript" >
</script>
</html>

View File

@ -98,3 +98,11 @@ def test_partition_html_raises_with_too_many_specified():
with pytest.raises(ValueError):
partition_html(filename=filename, text=text)
def test_partition_html_on_ideas_page():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
elements = partition_html(filename=filename)
document_text = "\n\n".join([str(el) for el in elements])
assert document_text.startswith("January 2023(Someone fed my essays into GPT")
assert document_text.endswith("whole new fractal buds.")

View File

@ -8,8 +8,8 @@
}
},
{
"element_id": "4300054a3c2601f905282a7bc7199044",
"text": "More info available at the \n\t\tGithub Project Page",
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
"text": "More info available at the Github Project Page",
"type": "Title",
"metadata": {
"page_number": 1
@ -24,8 +24,8 @@
}
},
{
"element_id": "a309823c9d508290682a198270b84bca",
"text": "File Contents\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"type": "NarrativeText",
"metadata": {
"page_number": 1

View File

@ -1 +1 @@
__version__ = "0.5.2-dev1" # pragma: no cover
__version__ = "0.5.2" # pragma: no cover

View File

@ -97,6 +97,7 @@ class HTMLDocument(XMLDocument):
return self._pages
logger.info("Reading document ...")
pages: List[Page] = []
etree.strip_elements(self.document_tree, ["script"])
root = _find_main(self.document_tree)
articles = _find_articles(root)
@ -213,6 +214,8 @@ def _parse_tag(
processing the document tree again. In the future we might want to keep descendants too,
but we don't have a use for them at the moment."""
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
if tag_elem.tag == "script":
return None
text = _construct_text(tag_elem)
if not text:
return None
@ -265,11 +268,12 @@ def is_narrative_tag(text: str, tag: str) -> bool:
def _construct_text(tag_elem: etree.Element) -> str:
"""Extracts text from a text tag element."""
text = ""
for item in tag_elem.iter():
if item.text and item.tag != "script":
text += item.text
if item.tail:
text += item.tail
for item in tag_elem.itertext():
if item:
text += item
if tag_elem.tail:
text = text + tag_elem.tail
text = replace_unicode_quotes(text)
return text.strip()