mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00

* Apply import sorting ruff . --select I --fix * Remove unnecessary open mode parameter ruff . --select UP015 --fix * Use f-string formatting rather than .format * Remove extraneous parentheses Also use "" instead of str() * Resolve missing trailing commas ruff . --select COM --fix * Rewrite list() and dict() calls using literals ruff . --select C4 --fix * Add () to pytest.fixture, use tuples for parametrize, etc. ruff . --select PT --fix * Simplify code: merge conditionals, context managers ruff . --select SIM --fix * Import without unnecessary alias ruff . --select PLR0402 --fix * Apply formatting via black * Rewrite ValueError somewhat Slightly unrelated to the rest of the PR * Apply formatting to tests via black * Update expected exception message to match 0d81564 * Satisfy E501 line too long in test * Update changelog & version * Add ruff to make tidy and test deps * Run 'make tidy' * Update changelog & version * Update changelog & version * Add ruff to 'check' target Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
101 lines
3.2 KiB
Python
101 lines
3.2 KiB
Python
import os
|
|
import pathlib
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
import requests
|
|
|
|
from unstructured.documents.elements import PageBreak
|
|
from unstructured.partition.html import partition_html
|
|
|
|
# Resolved path of the directory containing this test module; the tests below
# join it with relative segments to locate files under "example-docs".
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
|
|
|
|
def test_partition_html_from_filename():
    """Partitioning by filename yields elements and, by default, no page breaks."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)

    parsed = partition_html(filename=html_path)

    # include_page_breaks is not passed here, so no PageBreak is expected.
    assert PageBreak() not in parsed
    assert len(parsed) > 0
|
|
|
|
|
|
def test_partition_html_with_page_breaks():
    """With include_page_breaks=True the output contains a PageBreak element."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)

    parsed = partition_html(filename=html_path, include_page_breaks=True)

    assert PageBreak() in parsed
    assert len(parsed) > 0
|
|
|
|
|
|
def test_partition_html_from_file():
    """Partitioning from an open file object yields at least one element."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)

    with open(html_path) as html_file:
        parsed = partition_html(file=html_file)

    assert len(parsed) > 0
|
|
|
|
|
|
def test_partition_html_from_text():
    """Partitioning from an in-memory HTML string yields at least one element."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)

    with open(html_path) as html_file:
        html_source = html_file.read()

    parsed = partition_html(text=html_source)
    assert len(parsed) > 0
|
|
|
|
|
|
class MockResponse:
    """Minimal stand-in for the object returned by a patched ``requests.get``.

    Only stores the attributes set below: ``text``, ``status_code``, ``ok``
    and ``headers``.
    """

    def __init__(self, text, status_code, headers=None):
        """Store the canned response data.

        Args:
            text: Body of the fake response.
            status_code: Numeric HTTP status code; ``ok`` is derived from it.
            headers: Optional mapping of response headers. When omitted, each
                instance receives its own fresh empty dict.
        """
        self.text = text
        self.status_code = status_code
        # Mirrors the original rule: any status below 300 counts as success.
        self.ok = status_code < 300
        # A ``{}`` default would be a single dict shared by every instance
        # created without headers, so use a None sentinel instead.
        self.headers = {} if headers is None else headers
|
|
|
|
|
|
def test_partition_html_from_url():
    """Partitioning from a URL works when requests.get returns a 200 text/html response."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)
    with open(html_path) as html_file:
        html_source = html_file.read()

    fake_response = MockResponse(
        text=html_source,
        status_code=200,
        headers={"Content-Type": "text/html"},
    )
    # requests.get is patched so no network request is made for the fake URL.
    with patch.object(requests, "get", return_value=fake_response):
        parsed = partition_html(url="https://fake.url")

    assert len(parsed) > 0
|
|
|
|
|
|
def test_partition_html_from_url_raises_with_bad_status_code():
    """A mocked 500 response makes partition_html(url=...) raise ValueError."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)
    with open(html_path) as html_file:
        html_source = html_file.read()

    server_error = MockResponse(
        text=html_source,
        status_code=500,
        headers={"Content-Type": "text/html"},
    )
    # Patch first, then expect the error, in the same order as nested blocks would.
    with patch.object(requests, "get", return_value=server_error), pytest.raises(ValueError):
        partition_html(url="https://fake.url")
|
|
|
|
|
|
def test_partition_html_from_url_raises_with_bad_content_type():
    """A mocked 200 response with an application/json Content-Type raises ValueError."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)
    with open(html_path) as html_file:
        html_source = html_file.read()

    response_headers = {"Content-Type": "application/json"}
    json_response = MockResponse(text=html_source, status_code=200, headers=response_headers)

    # Patch first, then expect the error, in the same order as nested blocks would.
    with patch.object(requests, "get", return_value=json_response), pytest.raises(ValueError):
        partition_html(url="https://fake.url")
|
|
|
|
|
|
def test_partition_html_raises_with_none_specified():
    """Calling partition_html with no arguments at all raises ValueError."""
    expected_error = pytest.raises(ValueError)
    with expected_error:
        partition_html()
|
|
|
|
|
|
def test_partition_html_raises_with_too_many_specified():
    """Passing both filename and text to partition_html raises ValueError."""
    path_parts = (DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    html_path = os.path.join(*path_parts)
    with open(html_path) as html_file:
        html_source = html_file.read()

    conflicting_inputs = {"filename": html_path, "text": html_source}
    with pytest.raises(ValueError):
        partition_html(**conflicting_inputs)
|