diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 733c6e5b1..7bfe98a68 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,6 +104,7 @@ jobs: run: | source .venv/bin/activate make install-detectron2 + sudo apt-get update sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice make test make check-coverage diff --git a/CHANGELOG.md b/CHANGELOG.md index 14a3f4cc8..bbf4f8213 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.2-dev1 +## 0.5.2 ### Enhancements @@ -9,10 +9,11 @@ rather than a "tmp-ingest-" dir in the working directory. ### Fixes -* 'setup_ubuntu.sh` no longer fails in some contexts by interpreting +* `setup_ubuntu.sh` no longer fails in some contexts by interpreting `DEBIAN_FRONTEND=noninteractive` as a command * `unstructured-ingest` no longer re-downloads files when --preserve-downloads is used without --download-dir. +* Fixed an issue that was causing text to be skipped in some HTML documents. ## 0.5.1 diff --git a/example-docs/ideas-page.html b/example-docs/ideas-page.html new file mode 100644 index 000000000..e66033e64 --- /dev/null +++ b/example-docs/ideas-page.html @@ -0,0 +1,44 @@ + + +How to Get New Ideas + +


How to Get New Ideas

January 2023

(Someone fed my essays into GPT to make something that could answer +questions based on them, then asked it where good ideas come from. The +answer was ok, but not what I would have said. This is what I would have said.)

The way to get new ideas is to notice anomalies: what seems strange, +or missing, or broken? You can see anomalies in everyday life (much +of standup comedy is based on this), but the best place to look for +them is at the frontiers of knowledge.

Knowledge grows fractally. +From a distance its edges look smooth, but when you learn enough +to get close to one, you'll notice it's full of gaps. These gaps +will seem obvious; it will seem inexplicable that no one has tried +x or wondered about y. In the best case, exploring such gaps yields +whole new fractal buds.




+ + + + + + + \ No newline at end of file diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index 7457b363c..2e7653182 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -98,3 +98,11 @@ def test_partition_html_raises_with_too_many_specified(): with pytest.raises(ValueError): partition_html(filename=filename, text=text) + + +def test_partition_html_on_ideas_page(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "ideas-page.html") + elements = partition_html(filename=filename) + document_text = "\n\n".join([str(el) for el in elements]) + assert document_text.startswith("January 2023(Someone fed my essays into GPT") + assert document_text.endswith("whole new fractal buds.") diff --git a/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json b/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json index 3f9f0e9a8..08a7c585f 100644 --- a/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github-downloadify/test.html.json @@ -8,8 +8,8 @@ } }, { - "element_id": "4300054a3c2601f905282a7bc7199044", - "text": "More info available at the \n\t\tGithub Project Page", + "element_id": "d551bbfc9477547e4dce6264d8196c7b", + "text": "More info available at the Github Project Page", "type": "Title", "metadata": { "page_number": 1 @@ -24,8 +24,8 @@ } }, { - "element_id": "a309823c9d508290682a198270b84bca", - "text": "File Contents\nWhatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded", + "element_id": "43f65b1c5bd47774b25c72e2f96de300", + "text": "File Contents\n\nWhatever you put in this text box will be downloaded and saved in the file. 
If you leave it blank, no file will be downloaded", "type": "NarrativeText", "metadata": { "page_number": 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e42bb33db..1e9a0d3de 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.2-dev1" # pragma: no cover +__version__ = "0.5.2" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index edb0ae75e..0535728af 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -97,6 +97,7 @@ class HTMLDocument(XMLDocument): return self._pages logger.info("Reading document ...") pages: List[Page] = [] + etree.strip_elements(self.document_tree, ["script"]) root = _find_main(self.document_tree) articles = _find_articles(root) @@ -213,6 +214,8 @@ def _parse_tag( processing the document tree again. In the future we might want to keep descendants too, but we don't have a use for them at the moment.""" ancestortags: Tuple[str, ...] = tuple(el.tag for el in tag_elem.iterancestors())[::-1] + if tag_elem.tag == "script": + return None text = _construct_text(tag_elem) if not text: return None @@ -265,11 +268,12 @@ def is_narrative_tag(text: str, tag: str) -> bool: def _construct_text(tag_elem: etree.Element) -> str: """Extracts text from a text tag element.""" text = "" - for item in tag_elem.iter(): - if item.text and item.tag != "script": - text += item.text - if item.tail: - text += item.tail + for item in tag_elem.itertext(): + if item: + text += item + + if tag_elem.tail: + text = text + tag_elem.tail text = replace_unicode_quotes(text) return text.strip()