2022-12-12 09:22:10 -05:00
|
|
|
import os
|
|
|
|
import pathlib
|
2024-03-18 02:09:44 +01:00
|
|
|
from tempfile import SpooledTemporaryFile
|
2023-02-07 09:09:34 -05:00
|
|
|
from unittest.mock import patch
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
import pytest
|
2023-02-07 09:09:34 -05:00
|
|
|
import requests
|
2023-03-23 20:14:57 -07:00
|
|
|
from requests.models import Response
|
2022-12-12 09:22:10 -05:00
|
|
|
|
2023-10-12 12:47:55 -07:00
|
|
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-06-27 21:52:39 +03:00
|
|
|
from unstructured.cleaners.core import clean_extra_whitespace
|
2023-10-03 11:54:36 -04:00
|
|
|
from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title
|
2023-09-15 09:51:22 -07:00
|
|
|
from unstructured.documents.html import HTMLTitle
|
2022-12-12 09:22:10 -05:00
|
|
|
from unstructured.partition.html import partition_html
|
|
|
|
|
|
|
|
# Resolved path of the directory that contains this test module; the tests
# below build example-document paths relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Elements the tests expect when partitioning "fake-html-lang-de.html".
EXPECTED_OUTPUT_LANGUAGE_DE = [
    Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
|
|
|
|
|
2022-12-12 09:22:10 -05:00
|
|
|
|
|
|
|
def test_partition_html_from_filename():
    """Partitioning by filename records the file name and directory in metadata."""
    docs_dir = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(docs_dir, "example-10k.html")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories
    first_metadata = elements[0].metadata
    assert first_metadata.filename == "example-10k.html"
    assert first_metadata.file_directory == docs_dir
|
2023-02-08 10:11:15 -05:00
|
|
|
|
|
|
|
|
2023-09-15 09:51:22 -07:00
|
|
|
def test_partition_html_from_filename_returns_html_elements():
    """The first element produced for the 10-K example is an `HTMLTitle`."""
    docs_dir = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(docs_dir, "example-10k.html")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    first = elements[0]
    assert isinstance(first, HTMLTitle)
|
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_html_from_filename_with_metadata_filename():
    """`metadata_filename` replaces the filename recorded on every element."""
    docs_dir = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(docs_dir, "example-10k.html")

    elements = partition_html(filename=html_path, metadata_filename="test")

    assert len(elements) > 0
    for element in elements:
        assert element.metadata.filename == "test"
|
|
|
|
|
|
|
|
|
2023-06-05 11:27:12 -07:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error):
    """Partitioning the listed fixtures with the given encoding raises `error`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # NOTE(review): despite the test name, the document is passed as an open
    # text-mode file object (`file=`), not via `filename=`.
    with pytest.raises(error), open(html_path) as html_file:
        partition_html(file=html_file, encoding=encoding)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_filename_default_encoding(filename):
    """Without an explicit encoding, each fixture partitions and keeps its filename."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    assert all(element.metadata.filename == filename for element in elements)
    # The German fixture additionally has a fully specified expected output.
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
2023-06-05 11:27:12 -07:00
|
|
|
|
|
|
|
|
2023-05-30 15:47:55 -05:00
|
|
|
def test_partition_html_from_filename_metadata_false():
    """With `include_metadata=False`, every element's metadata dict is empty."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, include_metadata=False)

    for element in elements:
        assert not element.metadata.to_dict()
|
|
|
|
|
|
|
|
|
2023-02-08 10:11:15 -05:00
|
|
|
def test_partition_html_with_page_breaks():
    """`include_page_breaks=True` yields at least one `PageBreak` element."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    elements = partition_html(filename=html_path, include_page_breaks=True)

    categories = [elem.category for elem in elements]
    assert "PageBreak" in categories
    assert len(elements) > 0
    assert all(element.metadata.filename == "example-10k.html" for element in elements)
|
2022-12-12 09:22:10 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file():
    """Partitioning an open file object leaves `metadata.filename` unset."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    with open(html_path) as html_file:
        elements = partition_html(file=html_file)

    assert len(elements) > 0
    assert all(element.metadata.filename is None for element in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_with_metadata_filename():
    """`metadata_filename` is recorded on elements parsed from a file object."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    with open(html_path) as html_file:
        elements = partition_html(file=html_file, metadata_filename="test")

    assert len(elements) > 0
    assert all(element.metadata.filename == "test" for element in elements)
|
2022-12-12 09:22:10 -05:00
|
|
|
|
|
|
|
|
2023-06-05 11:27:12 -07:00
|
|
|
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_raises_encoding_error(filename, encoding, error):
    """Partitioning a text-mode file object with the given encoding raises `error`.

    Args:
        filename: Name of the fixture inside the example-docs directory.
        encoding: Encoding passed to `partition_html`.
        error: Exception type the call is expected to raise.
    """
    # Build the path outside the `raises` block so only the code under test
    # can satisfy the expectation.
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    # Exactly one expectation: the parametrized `error`. Nesting a second
    # `pytest.raises(UnicodeEncodeError)` inside would be contradictory: if it
    # caught anything, the outer expectation would fail with "DID NOT RAISE".
    with pytest.raises(error), open(html_path) as html_file:
        partition_html(file=html_file, encoding=encoding)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_default_encoding(filename):
    """Without an explicit encoding, each fixture partitions from a text-mode file.

    Args:
        filename: Name of the fixture inside the example-docs directory.
    """
    # Keep the full path in its own variable: overwriting `filename` with the
    # joined path would make the fixture-name comparison below always false
    # and silently skip the expected-output assertion.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(filename_path) as f:
        elements = partition_html(file=f)

    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
2023-06-05 11:27:12 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [
        ("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
        ("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
    ],
)
def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error):
    """Partitioning a binary-mode file object with the given encoding raises `error`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with pytest.raises(error), open(html_path, "rb") as html_file:
        partition_html(file=html_file, encoding=encoding)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filename",
    [
        "example-10k-utf-16.html",
        "example-steelJIS-datasheet-utf-16.html",
        "fake-html-lang-de.html",
    ],
)
def test_partition_html_from_file_rb_default_encoding(filename):
    """Without an explicit encoding, each fixture partitions from a binary-mode file.

    Args:
        filename: Name of the fixture inside the example-docs directory.
    """
    # Keep the full path in its own variable: overwriting `filename` with the
    # joined path would make the fixture-name comparison below always false
    # and silently skip the expected-output assertion.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)

    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)

    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
2023-06-05 11:27:12 -07:00
|
|
|
|
|
|
|
|
2022-12-12 09:22:10 -05:00
|
|
|
def test_partition_html_from_text():
    """HTML supplied as a string via `text=` produces elements."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    elements = partition_html(text=html_source)

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
2023-03-28 17:03:51 -04:00
|
|
|
def test_partition_html_from_text_works_with_empty_string():
    """An empty `text` argument partitions to an empty list."""
    elements = partition_html(text="")
    assert elements == []
|
|
|
|
|
|
|
|
|
2023-02-07 09:09:34 -05:00
|
|
|
class MockResponse:
    """Minimal stand-in for an HTTP response object used by the URL tests.

    Exposes `text`, `status_code`, `ok` and `headers` attributes.
    """

    def __init__(self, text, status_code, headers=None):
        """Store the response fields.

        Args:
            text: Response body.
            status_code: HTTP status code; `ok` is true when it is below 300.
            headers: Optional header mapping. When omitted, a fresh empty dict
                is created per instance so instances never share one mutable
                default.
        """
        self.text = text
        self.status_code = status_code
        self.ok = status_code < 300
        self.headers = headers if headers is not None else {}
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url():
    """A 200 `text/html` response fetched via `url=` is partitioned into elements."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=200,
        headers={"Content-Type": "text/html"},
    )

    # Replace the network call with the canned response.
    with patch.object(requests, "get", return_value=fake_response):
        elements = partition_html(url="https://fake.url")

    assert len(elements) > 0
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_raises_with_bad_status_code():
    """A 500 response makes `partition_html(url=...)` raise `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )

    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
|
2023-02-07 09:09:34 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_raises_with_bad_content_type():
    """An `application/json` response makes `partition_html(url=...)` raise `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        body = html_file.read()

    fake_response = MockResponse(
        text=body,
        status_code=200,
        headers={"Content-Type": "application/json"},
    )

    with patch.object(requests, "get", return_value=fake_response):
        with pytest.raises(ValueError):
            partition_html(url="https://fake.url")
|
2023-02-07 09:09:34 -05:00
|
|
|
|
|
|
|
|
2023-03-23 20:14:57 -07:00
|
|
|
def test_partition_from_url_uses_headers(mocker):
    """`partition_html` calls `requests.get` with the URL, the headers and `verify=True`."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}

    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )
    fake_response.headers = {"Content-Type": "text/html"}

    get_mock = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=url, headers=request_headers)

    # The mocked `requests.get` must have been called exactly once with these arguments.
    get_mock.assert_called_once_with(url, headers=request_headers, verify=True)
|
2023-03-23 20:14:57 -07:00
|
|
|
|
|
|
|
|
2022-12-12 09:22:10 -05:00
|
|
|
def test_partition_html_raises_with_none_specified():
    """Calling `partition_html` with no source argument raises `ValueError`."""
    expectation = pytest.raises(ValueError)
    with expectation:
        partition_html()
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_raises_with_too_many_specified():
    """Passing both `filename` and `text` raises `ValueError`."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    with open(html_path) as html_file:
        html_source = html_file.read()

    with pytest.raises(ValueError):
        partition_html(filename=html_path, text=html_source)
|
2023-03-02 14:03:13 -05:00
|
|
|
|
|
|
|
|
2023-09-07 09:16:31 -04:00
|
|
|
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
    """The ideas page partitions to a single `Table` with the expected text."""
    expected_text = (
        "January 2023 ( Someone fed my essays into GPT to make something "
        "that could answer\nquestions based on them, then asked it where good "
        "ideas come from. The\nanswer was ok, but not what I would have said. "
        "This is what I would have said.) The way to get new ideas is to notice "
        "anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
        " in everyday life (much\nof standup comedy is based on this), but the best "
        "place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
        "fractally.\nFrom a distance its edges look smooth, but when you learn "
        "enough\nto get close to one, you'll notice it's full of gaps. These "
        "gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
        "or wondered about y. In the best case, exploring such gaps yields\nwhole "
        "new fractal buds."
    )

    elements = partition_html(filename=filename)

    assert len(elements) == 1
    table = elements[0]
    assert table == Table(text=expected_text)

    assert table.metadata.emphasized_text_contents is None
    assert table.metadata.link_urls is None
    assert table.metadata.text_as_html is not None
|
2023-03-13 13:06:45 -07:00
|
|
|
|
|
|
|
|
|
|
|
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
    """A file whose mode is 0o444 (read-only) can still be partitioned."""
    source_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")

    # Create the target file in the temporary directory.
    target = tmp_path / "example-10k-readonly.html"
    target.touch()

    # Copy the example document's content into it.
    with open(source_path) as source_file:
        target.write_text(source_file.read())

    # Drop write permission for everyone.
    target.chmod(0o444)

    # Partitioning must still succeed.
    elements = partition_html(filename=target.resolve())
    assert len(elements) > 0
|
2023-04-05 16:18:54 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_processes_chinese_chracters():
    """Chinese text inside a paragraph is returned unchanged."""
    elements = partition_html(text="<html><div><p>每日新闻</p></div></html>")
    first_text = elements[0].text
    assert first_text == "每日新闻"
|
2023-04-13 15:39:08 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_emoji_appears_with_emoji_utf8_code():
    """An emoji in the paragraph text is preserved in the resulting element."""
    markup = '\n<html charset="utf-8"><p>Hello 😀</p></html>'
    elements = partition_html(text=markup)
    first = elements[0]
    assert first == Title("Hello 😀")
|
2023-06-20 13:07:30 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_can_turn_off_assemble_articles():
    """With `html_assemble_articles=False`, content outside `<article>` tags is kept."""
    html_lines = [
        "<html>",
        "<article>",
        "<h1>Some important stuff is going on!</h1>",
        "<p>Here is a description of that stuff</p>",
        "</article>",
        "<article>",
        "<h1>Some other important stuff is going on!</h1>",
        "<p>Here is a description of that stuff</p>",
        "</article>",
        "<h4>This is outside of the article.</h4>",
        "</html>",
        "",  # the document ends with a trailing newline
    ]
    markup = "\n".join(html_lines)

    elements = partition_html(text=markup, html_assemble_articles=False)

    last = elements[-1]
    assert last == Title("This is outside of the article.")
|
2023-06-27 21:52:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_with_pre_tag():
    """The `<pre>` fixture yields `NarrativeText` first, with HTML filetype metadata."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")

    elements = partition_html(filename=html_path)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories

    first = elements[0]
    normalized_text = clean_extra_whitespace(first.text)
    assert normalized_text.startswith("[107th Congress Public Law 56]")
    assert isinstance(first, NarrativeText)
    assert first.metadata.filetype == "text/html"
    assert first.metadata.filename == "fake-html-pre.htm"
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_filename_exclude_metadata():
    """With `include_metadata=False`, filename and directory metadata are `None`."""
    docs_dir = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(docs_dir, "example-10k.html")

    elements = partition_html(filename=html_path, include_metadata=False)

    assert len(elements) > 0
    categories = [elem.category for elem in elements]
    assert "PageBreak" not in categories
    first = elements[0]
    assert first.metadata.filename is None
    assert first.metadata.file_directory is None
|
2023-07-24 14:28:56 -04:00
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_html_metadata_date(mocker, filename="example-docs/fake-html.html"):
    """`last_modified` equals the value returned by the patched `get_last_modified_date`."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=fake_modified,
    )

    elements = partition_html(filename=filename)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified == fake_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """A file object without `date_from_file_object` leaves `last_modified` as `None`."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_explicit_get_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """With `date_from_file_object=True`, `last_modified` is the patched helper's value."""
    fake_modified = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file, date_from_file_object=True)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified == fake_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_custom_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """`metadata_last_modified` takes precedence over the patched helper's value."""
    fake_modified = "2029-07-05T09:24:28"
    custom_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date",
        return_value=fake_modified,
    )

    elements = partition_html(filename=filename, metadata_last_modified=custom_modified)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified == custom_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_custom_metadata_date(
    mocker,
    filename="example-docs/fake-html.html",
):
    """For file input, `metadata_last_modified` takes precedence over the patched helper."""
    fake_modified = "2029-07-05T09:24:28"
    custom_modified = "2020-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.html.get_last_modified_date_from_file",
        return_value=fake_modified,
    )

    with open(filename) as html_file:
        elements = partition_html(file=html_file, metadata_last_modified=custom_modified)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified == custom_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2024-03-18 02:09:44 +01:00
|
|
|
def test_partition_html_from_file_without_metadata_date(
    filename="example-docs/fake-html.html",
):
    """Test partition_html() with a file object that has no last-modified date.

    The document is copied into a `SpooledTemporaryFile` and partitioned with
    `date_from_file_object=True`; `last_modified` is expected to stay `None`.
    """
    # Manage the spooled file with `with` so it is closed even when
    # partitioning or the copy raises.
    with open(filename, "rb") as f, SpooledTemporaryFile() as sf:
        sf.write(f.read())
        sf.seek(0)  # rewind so partition_html reads from the start
        elements = partition_html(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None
|
|
|
|
|
|
|
|
|
2023-07-26 15:10:14 -04:00
|
|
|
def test_partition_html_from_text_metadata_date(filename="example-docs/fake-html.html"):
    """Elements parsed from raw text have no `last_modified` metadata."""
    markup = "<html><div><p>TEST</p></div></html>"
    elements = partition_html(text=markup)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified is None
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_text_custom_metadata_date(
    filename="example-docs/fake-html.html",
):
    """`metadata_last_modified` is recorded on elements parsed from raw text."""
    custom_modified = "2020-07-05T09:24:28"
    markup = "<html><div><p>TEST</p></div></html>"

    elements = partition_html(text=markup, metadata_last_modified=custom_modified)

    first = elements[0]
    assert isinstance(first, Title)
    assert first.metadata.last_modified == custom_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2023-07-24 14:28:56 -04:00
|
|
|
def test_partition_html_grabs_links():
    """Link URLs and link texts are captured per element, or `None` when absent."""
    markup = "\n".join(
        [
            "<html>",
            '<p>Hello there I am a <a href="/link">very important link!</a></p>',
            "<p>Here is a list of my favorite things</p>",
            "<ul>",
            '<li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>',
            "<li>Dogs</li>",
            "</ul>",
            '<a href="/loner">A lone link!</a>',
            "</html>",
        ]
    )
    # (expected element, expected link_urls, expected link_texts) by position.
    expectations = [
        (
            NarrativeText("Hello there I am a very important link!"),
            ["/link"],
            ["very important link!"],
        ),
        (NarrativeText("Here is a list of my favorite things"), None, None),
        (ListItem("Parrots"), ["https://en.wikipedia.org/wiki/Parrot"], ["Parrots"]),
        (ListItem("Dogs"), None, None),
        (Title("A lone link!"), ["/loner"], ["A lone link!"]),
    ]

    elements = partition_html(text=markup)

    for index, (expected_element, urls, texts) in enumerate(expectations):
        element = elements[index]
        assert element == expected_element
        if urls is None:
            assert element.metadata.link_urls is None
            assert element.metadata.link_texts is None
        else:
            assert element.metadata.link_urls == urls
            assert element.metadata.link_texts == texts
|
2023-08-03 12:24:25 -04:00
|
|
|
|
|
|
|
|
2023-08-05 00:56:33 +03:00
|
|
|
def test_partition_html_from_filename_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """No element from a filename source has a header or footer ancestor tag."""
    elements = partition_html(filename=filename, skip_headers_and_footers=True)

    for element in elements:
        ancestors = element.ancestortags
        assert "footer" not in ancestors
        assert "header" not in ancestors
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_file_with_skip_headers_and_footers(
    filename="example-docs/fake-html-with-footer-and-header.html",
):
    """No element from a file-object source has a header or footer ancestor tag."""
    with open(filename) as html_file:
        elements = partition_html(file=html_file, skip_headers_and_footers=True)

    for element in elements:
        ancestors = element.ancestortags
        assert "footer" not in ancestors
        assert "header" not in ancestors
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_text_with_skip_headers_and_footers():
    """No element from a text source has a header or footer ancestor tag."""
    markup = "\n".join(
        [
            "",  # the document starts with a newline
            "<!DOCTYPE html>",
            "<html>",
            "<header>",
            "<p>Header</p>",
            "</header>",
            "<body>",
            "<h1>My First Heading</h1>",
            "<p>My first paragraph.</p>",
            "</body>",
            "<footer>",
            "<p>Footer</p>",
            "</footer>",
            "</html>",
        ]
    )

    elements = partition_html(text=markup, skip_headers_and_footers=True)

    for element in elements:
        ancestors = element.ancestortags
        assert "footer" not in ancestors
        assert "header" not in ancestors
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_from_url_with_skip_headers_and_footers(mocker):
    """No element from a URL source has a header or footer ancestor tag."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}

    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = b"\n".join(
        [
            b"<html>",
            b"<header>",
            b"<p>Header</p>",
            b"</header>",
            b"<body>",
            b"<h1>My First Heading</h1>",
            b"<p>My first paragraph.</p>",
            b"</body>",
            b"<footer>",
            b"<p>Footer</p>",
            b"</footer>",
            b"</html>",
        ]
    )
    fake_response.headers = {"Content-Type": "text/html"}

    mocker.patch("requests.get", return_value=fake_response)

    elements = partition_html(url=url, headers=request_headers, skip_headers_and_footers=True)

    for element in elements:
        ancestors = element.ancestortags
        assert "footer" not in ancestors
        assert "header" not in ancestors
|
|
|
|
|
|
|
|
|
2023-08-03 12:24:25 -04:00
|
|
|
def test_partition_html_grabs_emphasized_texts():
    """Emphasized text contents and tags are captured per element, or `None` when absent."""
    markup = "\n".join(
        [
            "<html>",
            "<p>Hello there I am a very <strong>important</strong> text!</p>",
            "<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>",
            "<ul>",
            "<li><em>Parrots</em></li>",
            "<li>Dogs</li>",
            "</ul>",
            "<span>A lone span text!</span>",
            "</html>",
        ]
    )
    # (expected element, emphasized_text_contents, emphasized_text_tags) by position.
    expectations = [
        (NarrativeText("Hello there I am a very important text!"), ["important"], ["strong"]),
        (
            NarrativeText("Here is a list of my favorite things"),
            ["list", "my favorite things", "favorite"],
            ["span", "b", "i"],
        ),
        (ListItem("Parrots"), ["Parrots"], ["em"]),
        (ListItem("Dogs"), None, None),
        (Title("A lone span text!"), ["A lone span text!"], ["span"]),
    ]

    elements = partition_html(text=markup)

    for index, (expected_element, contents, tags) in enumerate(expectations):
        element = elements[index]
        assert element == expected_element
        if contents is None:
            assert element.metadata.emphasized_text_contents is None
            assert element.metadata.emphasized_text_tags is None
        else:
            assert element.metadata.emphasized_text_contents == contents
            assert element.metadata.emphasized_text_tags == tags
|
2023-08-25 00:14:48 -04:00
|
|
|
|
|
|
|
|
2023-08-29 16:59:26 -04:00
|
|
|
def test_partition_html_with_json():
    """Elements from the 10-K example are passed to the JSON round-trip assertion helper."""
    html_path = example_doc_path("example-10k.html")
    elements = partition_html(html_path)
    assert_round_trips_through_JSON(elements)
|
2023-08-29 16:59:26 -04:00
|
|
|
|
|
|
|
|
2023-08-25 00:14:48 -04:00
|
|
|
def test_pre_tag_parsing_respects_order():
    """Elements from interleaved `<pre>` and `<div>` tags come out in document order."""
    markup = "\n".join(
        [
            "",  # the document starts with a newline
            "<pre>The Big Brown Bear</pre>",
            "<div>The big brown bear is growling.</div>",
            "<pre>The big brown bear is sleeping.</pre>",
            "<div>The Big Blue Bear</div>",
            "",  # ...and ends with one
        ]
    )
    expected = [
        Title("The Big Brown Bear"),
        NarrativeText("The big brown bear is growling."),
        NarrativeText("The big brown bear is sleeping."),
        Title("The Big Blue Bear"),
    ]

    elements = partition_html(text=markup)

    assert elements == expected
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_add_chunking_strategy_on_partition_html(
    filename="example-docs/example-10k.html",
):
    """`chunking_strategy="by_title"` equals calling `chunk_by_title` on the plain result."""
    plain_elements = partition_html(filename=filename)
    chunked_by_partition = partition_html(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(plain_elements)

    assert chunked_by_partition != plain_elements
    assert chunked_by_partition == chunked_manually
|
2023-10-03 11:54:36 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_html_heading_title_detection():
    """Heading tags map to the element types listed in `expected`."""
    markup = "\n".join(
        [
            "",  # the document starts with a newline
            "<p>This is a section of narrative text, it's long, flows and has meaning</p>",
            "<h1>This is a section of narrative text, it's long, flows and has meaning</h1>",
            "<h2>A heading that is at the second level</h2>",
            "<h3>Finally, the third heading</h3>",
            "<h2>December 1-17, 2017</h2>",
            "<h3>email@example.com</h3>",
            "<h3><li>- bulleted item</li></h3>",
            "",  # ...and ends with one
        ]
    )
    expected = [
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This is a section of narrative text, it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        EmailAddress("email@example.com"),
        ListItem("- bulleted item"),
    ]

    elements = partition_html(text=markup)

    assert elements == expected
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_element_metadata_has_languages():
    """The first element of the 10-K example reports `["eng"]` as its languages."""
    elements = partition_html(filename="example-docs/example-10k.html")
    detected = elements[0].metadata.languages
    assert detected == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_html_respects_detect_language_per_element():
    """With `detect_language_per_element=True`, each element carries its own languages."""
    expected_langs = [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]

    elements = partition_html(
        filename="example-docs/language-docs/eng_spa_mult.html",
        detect_language_per_element=True,
    )

    langs = []
    for element in elements:
        langs.append(element.metadata.languages)
    assert langs == expected_langs
|
2023-10-20 18:21:59 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("tag", "expected"),
    [
        ("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
        ("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
    ],
)
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
    """A table with rows only inside `<thead>`/`<tfoot>` yields the expected `text_as_html`."""
    table_lines = [
        "<table>",
        f" <{tag}>",
        " <tr><th>Header 1</th><th>Header 2</th></tr>",
        f" </{tag}>",
        "</table>",
    ]
    table_html = "\n".join(table_lines)

    partitions = partition_html(text=table_html)

    first = partitions[0]
    assert first.metadata.text_as_html == expected
|