mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 19:16:03 +00:00
fix: ensure all text is maintained in html output (#335)
* fix: ensure all text is maintained in html pages
* add back in replace unicode quotes
* changelog and version bump
* apt-get update in ci
* white space differences in output
This commit is contained in:
parent
ed074b5828
commit
a5da3de43b
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -104,6 +104,7 @@ jobs:
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
make install-detectron2
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
|
||||
make test
|
||||
make check-coverage
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
## 0.5.2-dev1
|
||||
## 0.5.2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,10 +9,11 @@ rather than a "tmp-ingest-" dir in the working directory.
|
||||
|
||||
### Fixes
|
||||
|
||||
* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
|
||||
* `setup_ubuntu.sh` no longer fails in some contexts by interpreting
|
||||
`DEBIAN_FRONTEND=noninteractive` as a command
|
||||
* `unstructured-ingest` no longer re-downloads files when --preserve-downloads
|
||||
is used without --download-dir.
|
||||
* Fixed an issue that was causing text to be skipped in some HTML documents.
|
||||
|
||||
## 0.5.1
|
||||
|
||||
|
||||
44
example-docs/ideas-page.html
Normal file
44
example-docs/ideas-page.html
Normal file
@ -0,0 +1,44 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
<html><script type="text/javascript">
|
||||
<!--
|
||||
(new Image).src="https://store.yahoo.net/cgi-bin/refsd?e=http://paulgraham.com/getideas.html&h=paulgraham.com&v=1.0&dr=" + escape(document.referrer);
|
||||
-->
|
||||
</script>
|
||||
<head><title>How to Get New Ideas</title><!-- <META NAME="ROBOTS" CONTENT="NOODP"> -->
|
||||
<link rel="shortcut icon" href="http://ycombinator.com/arc/arc.png">
|
||||
</head><body bgcolor=ffffff background="https://sep.yimg.com/ca/I/paulgraham_2271_0" text=000000 link=000099 vlink=464646><table border=0 cellspacing=0 cellpadding=0><tr valign=top><td><map name=c04963d10de5f><area shape=rect coords="0,0,67,21" href="index.html"><area shape=rect coords="0,21,67,42" href="articles.html"><area shape=rect coords="0,42,67,63" href="http://www.amazon.com/gp/product/0596006624"><area shape=rect coords="0,63,67,84" href="books.html"><area shape=rect coords="0,84,67,105" href="http://ycombinator.com"><area shape=rect coords="0,105,67,126" href="arc.html"><area shape=rect coords="0,126,67,147" href="bel.html"><area shape=rect coords="0,147,67,168" href="lisp.html"><area shape=rect coords="0,168,67,189" href="antispam.html"><area shape=rect coords="0,189,67,210" href="kedrosky.html"><area shape=rect coords="0,210,67,231" href="faq.html"><area shape=rect coords="0,231,67,252" href="raq.html"><area shape=rect coords="0,252,67,273" href="quo.html"><area shape=rect coords="0,273,67,294" href="rss.html"><area shape=rect coords="0,294,67,315" href="bio.html"><area shape=rect coords="0,315,67,336" href="https://twitter.com/paulg"><area shape=rect coords="0,336,67,357" href="https://mas.to/@paulg"></map><img src="https://s.yimg.com/aah/paulgraham/img-20.gif" width=69 height=357 usemap=#c04963d10de5f border=0 hspace=0 vspace=0 ismap></td><td><img src="https://sep.yimg.com/ca/Img/trans_1x1.gif" height=1 width=26 border=0></td><td><a href="index.html"><img src="https://sep.yimg.com/ca/I/paulgraham_2271_3232" width=410 height=45 border=0 hspace=0 vspace=0></a><br><br><table border=0 cellspacing=0 cellpadding=0 width=435><tr valign=top><td width=435><img src="https://s.yimg.com/aah/paulgraham/how-to-get-new-ideas-1.gif" width=176 height=18 border=0 hspace=0 vspace=0 alt="How to Get New Ideas"><br><br><font size=2 face="verdana">January 2023<br><br><i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT 
to make something that could answer
|
||||
questions based on them, then asked it where good ideas come from. The
|
||||
answer was ok, but not what I would have said. This is what I would have said.)</i><br><br>The way to get new ideas is to notice anomalies: what seems strange,
|
||||
or missing, or broken? You can see anomalies in everyday life (much
|
||||
of standup comedy is based on this), but the best place to look for
|
||||
them is at the frontiers of knowledge.<br><br>Knowledge grows fractally.
|
||||
From a distance its edges look smooth, but when you learn enough
|
||||
to get close to one, you'll notice it's full of gaps. These gaps
|
||||
will seem obvious; it will seem inexplicable that no one has tried
|
||||
x or wondered about y. In the best case, exploring such gaps yields
|
||||
whole new fractal buds.<br><br></font></td></tr></table><table border=0 cellspacing=0 cellpadding=0 width=435><tr><td><font size=2 face="verdana"><br><br><hr></font></td></tr></table></td></tr></table></body>
|
||||
<script type="text/javascript">
|
||||
csell_env = 'bf1';
|
||||
var storeCheckoutDomain = 'order.store.yahoo.net';
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
// Begin Yahoo Store Generated Code
|
||||
</script> <script type="text/javascript" src="https://s.turbifycdn.com/lq/ult/ylc_1.9.js" ></script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/lib/smbiz/store/csell/beacon-a9518fc6e4.js" >
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
// Begin Yahoo Store Generated Code
|
||||
csell_page_data = {}; csell_page_rec_data = []; ts='TOK_STORE_ID';
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
// Begin Yahoo Store Generated Code
|
||||
function csell_GLOBAL_INIT_TAG() { var csell_token_map = {}; csell_token_map['TOK_ITEM_ID_LIST'] = 'getideas'; csell_token_map['TOK_BEACON_TYPE'] = 'prod'; csell_token_map['TOK_RAND_KEY'] = 't'; csell_token_map['TOK_SPACEID'] = '2022276099'; csell_token_map['TOK_IS_ORDERABLE'] = '2'; csell_token_map['TOK_STORE_ID'] = 'paulgraham'; csell_token_map['TOK_URL'] = ''; csell_token_map['TOK_ORDER_HOST'] = 'order.store.yahoo.net'; c = csell_page_data; var x = (typeof storeCheckoutDomain == 'string')?storeCheckoutDomain:'order.store.yahoo.net'; var t = csell_token_map; c['s'] = t['TOK_SPACEID']; c['url'] = t['TOK_URL']; c['si'] = t[ts]; c['ii'] = t['TOK_ITEM_ID_LIST']; c['bt'] = t['TOK_BEACON_TYPE']; c['rnd'] = t['TOK_RAND_KEY']; c['io'] = t['TOK_IS_ORDERABLE']; YStore.addItemUrl = 'http%s://'+x+'/'+t[ts]+'/ymix/MetaController.html?eventName.addEvent&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_itemId=%s&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_quantity=1&ysco_key_cs_item=1&sectionId=ysco.cart&ysco_key_store_id='+t[ts]; }
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
// Begin Yahoo Store Generated Code
|
||||
function csell_REC_VIEW_TAG() { var env = (typeof csell_env == 'string')?csell_env:'prod'; var p = csell_page_data; var a = '/sid='+p['si']+'/io='+p['io']+'/ii='+p['ii']+'/bt='+p['bt']+'-view'+'/en='+env; var r=Math.random(); YStore.CrossSellBeacon.renderBeaconWithRecData(p['url']+'/p/s='+p['s']+'/'+p['rnd']+'='+r+a); }
|
||||
</script>
|
||||
<script type="text/javascript">
|
||||
// Begin Yahoo Store Generated Code
|
||||
var csell_token_map = {}; csell_token_map['TOK_PAGE'] = 'p'; csell_token_map['TOK_WS_URL'] = 'https://paulgraham.csell.store.yahoo.net/cs/recommend?itemids=getideas&location=p'; csell_token_map['TOK_SHOW_CS_RECS'] = 'false'; csell_token_map['TOK_CURR_SYM'] = '$'; var t = csell_token_map; csell_GLOBAL_INIT_TAG(); YStore.page = t['TOK_PAGE']; YStore.currencySymbol = t['TOK_CURR_SYM']; YStore.crossSellUrl = t['TOK_WS_URL']; YStore.showCSRecs = t['TOK_SHOW_CS_RECS']; </script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/store/secure/recs-1.3.2.2.js" ></script> <script type="text/javascript" >
|
||||
</script>
|
||||
</html>
|
||||
@ -98,3 +98,11 @@ def test_partition_html_raises_with_too_many_specified():
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
partition_html(filename=filename, text=text)
|
||||
|
||||
|
||||
def test_partition_html_on_ideas_page():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
|
||||
elements = partition_html(filename=filename)
|
||||
document_text = "\n\n".join([str(el) for el in elements])
|
||||
assert document_text.startswith("January 2023(Someone fed my essays into GPT")
|
||||
assert document_text.endswith("whole new fractal buds.")
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4300054a3c2601f905282a7bc7199044",
|
||||
"text": "More info available at the \n\t\tGithub Project Page",
|
||||
"element_id": "d551bbfc9477547e4dce6264d8196c7b",
|
||||
"text": "More info available at the Github Project Page",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
@ -24,8 +24,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "a309823c9d508290682a198270b84bca",
|
||||
"text": "File Contents\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
|
||||
"element_id": "43f65b1c5bd47774b25c72e2f96de300",
|
||||
"text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.2-dev1" # pragma: no cover
|
||||
__version__ = "0.5.2" # pragma: no cover
|
||||
|
||||
@ -97,6 +97,7 @@ class HTMLDocument(XMLDocument):
|
||||
return self._pages
|
||||
logger.info("Reading document ...")
|
||||
pages: List[Page] = []
|
||||
etree.strip_elements(self.document_tree, ["script"])
|
||||
root = _find_main(self.document_tree)
|
||||
|
||||
articles = _find_articles(root)
|
||||
@ -213,6 +214,8 @@ def _parse_tag(
|
||||
processing the document tree again. In the future we might want to keep descendants too,
|
||||
but we don't have a use for them at the moment."""
|
||||
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
|
||||
if tag_elem.tag == "script":
|
||||
return None
|
||||
text = _construct_text(tag_elem)
|
||||
if not text:
|
||||
return None
|
||||
@ -265,11 +268,12 @@ def is_narrative_tag(text: str, tag: str) -> bool:
|
||||
def _construct_text(tag_elem: etree.Element) -> str:
|
||||
"""Extracts text from a text tag element."""
|
||||
text = ""
|
||||
for item in tag_elem.iter():
|
||||
if item.text and item.tag != "script":
|
||||
text += item.text
|
||||
if item.tail:
|
||||
text += item.tail
|
||||
for item in tag_elem.itertext():
|
||||
if item:
|
||||
text += item
|
||||
|
||||
if tag_elem.tail:
|
||||
text = text + tag_elem.tail
|
||||
|
||||
text = replace_unicode_quotes(text)
|
||||
return text.strip()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user