diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 733c6e5b1..7bfe98a68 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -104,6 +104,7 @@ jobs:
run: |
source .venv/bin/activate
make install-detectron2
+ sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
make test
make check-coverage
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14a3f4cc8..bbf4f8213 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.5.2-dev1
+## 0.5.2
### Enhancements
@@ -9,10 +9,11 @@ rather than a "tmp-ingest-" dir in the working directory.
### Fixes
-* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting
+* `setup_ubuntu.sh` no longer fails in some contexts by interpreting
`DEBIAN_FRONTEND=noninteractive` as a command
* `unstructured-ingest` no longer re-downloads files when --preserve-downloads
is used without --download-dir.
+* Fixed an issue that was causing text to be skipped in some HTML documents.
## 0.5.1
diff --git a/example-docs/ideas-page.html b/example-docs/ideas-page.html
new file mode 100644
index 000000000..e66033e64
--- /dev/null
+++ b/example-docs/ideas-page.html
@@ -0,0 +1,44 @@
+
+
+
How to Get New Ideas
+
+
January 2023
(Someone fed my essays into GPT to make something that could answer
+questions based on them, then asked it where good ideas come from. The
+answer was ok, but not what I would have said. This is what I would have said.)
The way to get new ideas is to notice anomalies: what seems strange,
+or missing, or broken? You can see anomalies in everyday life (much
+of standup comedy is based on this), but the best place to look for
+them is at the frontiers of knowledge.
Knowledge grows fractally.
+From a distance its edges look smooth, but when you learn enough
+to get close to one, you'll notice it's full of gaps. These gaps
+will seem obvious; it will seem inexplicable that no one has tried
+x or wondered about y. In the best case, exploring such gaps yields
+whole new fractal buds.
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py
index 7457b363c..2e7653182 100644
--- a/test_unstructured/partition/test_html_partition.py
+++ b/test_unstructured/partition/test_html_partition.py
@@ -98,3 +98,11 @@ def test_partition_html_raises_with_too_many_specified():
with pytest.raises(ValueError):
partition_html(filename=filename, text=text)
+
+
+def test_partition_html_on_ideas_page():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html")
+ elements = partition_html(filename=filename)
+ document_text = "\n\n".join([str(el) for el in elements])
+ assert document_text.startswith("January 2023(Someone fed my essays into GPT")
+ assert document_text.endswith("whole new fractal buds.")
diff --git a/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json b/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json
index 3f9f0e9a8..08a7c585f 100644
--- a/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json
+++ b/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json
@@ -8,8 +8,8 @@
}
},
{
- "element_id": "4300054a3c2601f905282a7bc7199044",
- "text": "More info available at the \n\t\tGithub Project Page",
+ "element_id": "d551bbfc9477547e4dce6264d8196c7b",
+ "text": "More info available at the Github Project Page",
"type": "Title",
"metadata": {
"page_number": 1
@@ -24,8 +24,8 @@
}
},
{
- "element_id": "a309823c9d508290682a198270b84bca",
- "text": "File Contents\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
+ "element_id": "43f65b1c5bd47774b25c72e2f96de300",
+ "text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"type": "NarrativeText",
"metadata": {
"page_number": 1
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index e42bb33db..1e9a0d3de 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.2-dev1" # pragma: no cover
+__version__ = "0.5.2" # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index edb0ae75e..0535728af 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -97,6 +97,7 @@ class HTMLDocument(XMLDocument):
return self._pages
logger.info("Reading document ...")
pages: List[Page] = []
+ etree.strip_elements(self.document_tree, ["script"])
root = _find_main(self.document_tree)
articles = _find_articles(root)
@@ -213,6 +214,8 @@ def _parse_tag(
processing the document tree again. In the future we might want to keep descendants too,
but we don't have a use for them at the moment."""
ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1]
+ if tag_elem.tag == "script":
+ return None
text = _construct_text(tag_elem)
if not text:
return None
@@ -265,11 +268,12 @@ def is_narrative_tag(text: str, tag: str) -> bool:
def _construct_text(tag_elem: etree.Element) -> str:
"""Extracts text from a text tag element."""
text = ""
- for item in tag_elem.iter():
- if item.text and item.tag != "script":
- text += item.text
- if item.tail:
- text += item.tail
+ for item in tag_elem.itertext():
+ if item:
+ text += item
+
+ if tag_elem.tail:
+ text = text + tag_elem.tail
text = replace_unicode_quotes(text)
return text.strip()