unstructured/test_unstructured/partition/test_html_partition.py

825 lines
28 KiB
Python
Raw Normal View History

import os
import pathlib
from tempfile import SpooledTemporaryFile
from unittest.mock import patch
import pytest
import requests
from requests.models import Response
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title
from unstructured.documents.html import HTMLTitle
from unstructured.partition.html import partition_html
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXPECTED_OUTPUT_LANGUAGE_DE = [
Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
def test_partition_html_from_filename():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
filename = os.path.join(directory, "example-10k.html")
elements = partition_html(filename=filename)
assert len(elements) > 0
assert "PageBreak" not in [elem.category for elem in elements]
assert elements[0].metadata.filename == "example-10k.html"
assert elements[0].metadata.file_directory == directory
def test_partition_html_from_filename_returns_html_elements():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
filename = os.path.join(directory, "example-10k.html")
elements = partition_html(filename=filename)
assert len(elements) > 0
assert isinstance(elements[0], HTMLTitle)
def test_partition_html_from_filename_with_metadata_filename():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
filename = os.path.join(directory, "example-10k.html")
elements = partition_html(filename=filename, metadata_filename="test")
assert len(elements) > 0
assert all(element.metadata.filename == "test" for element in elements)
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
],
)
def test_partition_html_from_filename_raises_encoding_error(filename, encoding, error):
with pytest.raises(error):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(filename) as f:
partition_html(file=f, encoding=encoding)
@pytest.mark.parametrize(
"filename",
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
[
"example-10k-utf-16.html",
"example-steelJIS-datasheet-utf-16.html",
"fake-html-lang-de.html",
],
)
def test_partition_html_from_filename_default_encoding(filename):
filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
elements = partition_html(filename=filename_path)
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == filename
if filename == "fake-html-lang-de.html":
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
def test_partition_html_from_filename_metadata_false():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
filename = os.path.join(directory, "example-10k.html")
elements = partition_html(filename=filename, include_metadata=False)
metadata_present = any(element.metadata.to_dict() for element in elements)
assert not metadata_present
def test_partition_html_with_page_breaks():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
elements = partition_html(filename=filename, include_page_breaks=True)
assert "PageBreak" in [elem.category for elem in elements]
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "example-10k.html"
def test_partition_html_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
elements = partition_html(file=f)
assert len(elements) > 0
for element in elements:
assert element.metadata.filename is None
def test_partition_html_from_file_with_metadata_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
elements = partition_html(file=f, metadata_filename="test")
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == "test"
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
],
)
def test_partition_html_from_file_raises_encoding_error(filename, encoding, error):
with pytest.raises(error):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(filename) as f, pytest.raises(UnicodeEncodeError):
partition_html(file=f, encoding=encoding)
@pytest.mark.parametrize(
"filename",
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
[
"example-10k-utf-16.html",
"example-steelJIS-datasheet-utf-16.html",
"fake-html-lang-de.html",
],
)
def test_partition_html_from_file_default_encoding(filename):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(filename) as f:
elements = partition_html(file=f)
assert len(elements) > 0
if filename == "fake-html-lang-de.html":
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
@pytest.mark.parametrize(
("filename", "encoding", "error"),
[
("example-10k-utf-16.html", "utf-8", UnicodeDecodeError),
("example-steelJIS-datasheet-utf-16.html", "utf-8", UnicodeDecodeError),
],
)
def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, error):
with pytest.raises(error):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(filename, "rb") as f:
partition_html(file=f, encoding=encoding)
@pytest.mark.parametrize(
"filename",
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
[
"example-10k-utf-16.html",
"example-steelJIS-datasheet-utf-16.html",
"fake-html-lang-de.html",
],
)
def test_partition_html_from_file_rb_default_encoding(filename):
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
with open(filename, "rb") as f:
elements = partition_html(file=f)
assert len(elements) > 0
if filename == "fake-html-lang-de.html":
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
def test_partition_html_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
text = f.read()
elements = partition_html(text=text)
assert len(elements) > 0
def test_partition_html_from_text_works_with_empty_string():
assert partition_html(text="") == []
class MockResponse:
def __init__(self, text, status_code, headers={}):
self.text = text
self.status_code = status_code
self.ok = status_code < 300
self.headers = headers
def test_partition_html_from_url():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
text = f.read()
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
response = MockResponse(
text=text,
status_code=200,
headers={"Content-Type": "text/html"},
)
with patch.object(requests, "get", return_value=response) as _:
elements = partition_html(url="https://fake.url")
assert len(elements) > 0
def test_partition_html_from_url_raises_with_bad_status_code():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
text = f.read()
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
response = MockResponse(
text=text,
status_code=500,
headers={"Content-Type": "text/html"},
)
with patch.object(requests, "get", return_value=response) as _, pytest.raises(ValueError):
partition_html(url="https://fake.url")
def test_partition_html_from_url_raises_with_bad_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
text = f.read()
response = MockResponse(
text=text,
status_code=200,
headers={"Content-Type": "application/json"},
)
with patch.object(requests, "get", return_value=response) as _, pytest.raises(ValueError):
partition_html(url="https://fake.url")
def test_partition_from_url_uses_headers(mocker):
test_url = "https://example.com"
test_headers = {"User-Agent": "test"}
response = Response()
response.status_code = 200
response._content = (
b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
)
response.headers = {"Content-Type": "text/html"}
mock_get = mocker.patch("requests.get", return_value=response)
partition_html(url=test_url, headers=test_headers)
# Check if requests.get was called with the correct arguments
mock_get.assert_called_once_with(test_url, headers=test_headers, verify=True)
def test_partition_html_raises_with_none_specified():
with pytest.raises(ValueError):
partition_html()
def test_partition_html_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
with open(filename) as f:
text = f.read()
with pytest.raises(ValueError):
partition_html(filename=filename, text=text)
def test_partition_html_on_ideas_page(filename="example-docs/ideas-page.html"):
elements = partition_html(filename=filename)
assert len(elements) == 1
assert elements[0] == Table(
text="January 2023 ( Someone fed my essays into GPT to make something "
"that could answer\nquestions based on them, then asked it where good "
"ideas come from. The\nanswer was ok, but not what I would have said. "
"This is what I would have said.) The way to get new ideas is to notice "
"anomalies: what seems strange,\nor missing, or broken? You can see anomalies"
" in everyday life (much\nof standup comedy is based on this), but the best "
"place to look for\nthem is at the frontiers of knowledge. Knowledge grows "
"fractally.\nFrom a distance its edges look smooth, but when you learn "
"enough\nto get close to one, you'll notice it's full of gaps. These "
"gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx "
"or wondered about y. In the best case, exploring such gaps yields\nwhole "
"new fractal buds.",
)
assert elements[0].metadata.emphasized_text_contents is None
assert elements[0].metadata.link_urls is None
assert elements[0].metadata.text_as_html is not None
def test_user_without_file_write_permission_can_partition_html(tmp_path, monkeypatch):
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
example_filename = os.path.join(
DIRECTORY,
"..",
"..",
"example-docs",
"example-10k.html",
)
# create a file with no write permissions
read_only_file = tmp_path / "example-10k-readonly.html"
read_only_file.touch()
# set content of read_only_file to be that of example-10k.html
with open(example_filename) as f:
read_only_file.write_text(f.read())
# set read_only_file to be read only
read_only_file.chmod(0o444)
# partition html should still work
elements = partition_html(filename=read_only_file.resolve())
assert len(elements) > 0
def test_partition_html_processes_chinese_chracters():
html_text = "<html><div><p>每日新闻</p></div></html>"
elements = partition_html(text=html_text)
assert elements[0].text == "每日新闻"
def test_emoji_appears_with_emoji_utf8_code():
html_text = """\n<html charset="utf-8"><p>Hello &#128512;</p></html>"""
elements = partition_html(text=html_text)
assert elements[0] == Title("Hello 😀")
def test_partition_html_can_turn_off_assemble_articles():
html_text = """<html>
<article>
<h1>Some important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<article>
<h1>Some other important stuff is going on!</h1>
<p>Here is a description of that stuff</p>
</article>
<h4>This is outside of the article.</h4>
</html>
"""
elements = partition_html(text=html_text, html_assemble_articles=False)
assert elements[-1] == Title("This is outside of the article.")
def test_partition_html_with_pre_tag():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-html-pre.htm")
elements = partition_html(filename=filename)
assert len(elements) > 0
assert "PageBreak" not in [elem.category for elem in elements]
assert clean_extra_whitespace(elements[0].text).startswith("[107th Congress Public Law 56]")
assert isinstance(elements[0], NarrativeText)
assert elements[0].metadata.filetype == "text/html"
assert elements[0].metadata.filename == "fake-html-pre.htm"
def test_partition_html_from_filename_exclude_metadata():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
filename = os.path.join(directory, "example-10k.html")
elements = partition_html(filename=filename, include_metadata=False)
assert len(elements) > 0
assert "PageBreak" not in [elem.category for elem in elements]
assert elements[0].metadata.filename is None
assert elements[0].metadata.file_directory is None
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_metadata_date(mocker, filename="example-docs/fake-html.html"):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.html.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_html(filename=filename)
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified == mocked_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_from_file_metadata_date(
mocker,
filename="example-docs/fake-html.html",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.html.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename) as f:
elements = partition_html(file=f)
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified is None
def test_partition_html_from_file_explicit_get_metadata_date(
mocker,
filename="example-docs/fake-html.html",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
mocker.patch(
"unstructured.partition.html.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename) as f:
elements = partition_html(file=f, date_from_file_object=True)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified == mocked_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_custom_metadata_date(
mocker,
filename="example-docs/fake-html.html",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.html.get_last_modified_date",
return_value=mocked_last_modification_date,
)
elements = partition_html(
filename=filename,
metadata_last_modified=expected_last_modification_date,
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified == expected_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_from_file_custom_metadata_date(
mocker,
filename="example-docs/fake-html.html",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modification_date = "2020-07-05T09:24:28"
mocker.patch(
"unstructured.partition.html.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
)
with open(filename) as f:
elements = partition_html(
file=f,
metadata_last_modified=expected_last_modification_date,
)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified == expected_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_from_file_without_metadata_date(
filename="example-docs/fake-html.html",
):
"""Test partition_html() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
elements = partition_html(file=sf, date_from_file_object=True)
assert elements[0].metadata.last_modified is None
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_from_text_metadata_date(filename="example-docs/fake-html.html"):
elements = partition_html(text="<html><div><p>TEST</p></div></html>")
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified is None
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_from_text_custom_metadata_date(
filename="example-docs/fake-html.html",
):
expected_last_modification_date = "2020-07-05T09:24:28"
elements = partition_html(
text="<html><div><p>TEST</p></div></html>",
metadata_last_modified=expected_last_modification_date,
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
assert isinstance(elements[0], Title)
assert elements[0].metadata.last_modified == expected_last_modification_date
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_html_grabs_links():
html_text = """<html>
<p>Hello there I am a <a href="/link">very important link!</a></p>
<p>Here is a list of my favorite things</p>
<ul>
<li><a href="https://en.wikipedia.org/wiki/Parrot">Parrots</a></li>
<li>Dogs</li>
</ul>
<a href="/loner">A lone link!</a>
</html>"""
elements = partition_html(text=html_text)
assert elements[0] == NarrativeText("Hello there I am a very important link!")
assert elements[0].metadata.link_urls == ["/link"]
assert elements[0].metadata.link_texts == ["very important link!"]
assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.link_urls is None
assert elements[1].metadata.link_texts is None
assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.link_urls == ["https://en.wikipedia.org/wiki/Parrot"]
assert elements[2].metadata.link_texts == ["Parrots"]
assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.link_urls is None
assert elements[3].metadata.link_texts is None
assert elements[4] == Title("A lone link!")
assert elements[4].metadata.link_urls == ["/loner"]
assert elements[4].metadata.link_texts == ["A lone link!"]
def test_partition_html_from_filename_with_skip_headers_and_footers(
filename="example-docs/fake-html-with-footer-and-header.html",
):
elements = partition_html(filename=filename, skip_headers_and_footers=True)
for element in elements:
assert "footer" not in element.ancestortags
assert "header" not in element.ancestortags
def test_partition_html_from_file_with_skip_headers_and_footers(
filename="example-docs/fake-html-with-footer-and-header.html",
):
with open(filename) as f:
elements = partition_html(file=f, skip_headers_and_footers=True)
for element in elements:
assert "footer" not in element.ancestortags
assert "header" not in element.ancestortags
def test_partition_html_from_text_with_skip_headers_and_footers():
text = """
<!DOCTYPE html>
<html>
<header>
<p>Header</p>
</header>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
<footer>
<p>Footer</p>
</footer>
</html>"""
elements = partition_html(text=text, skip_headers_and_footers=True)
for element in elements:
assert "footer" not in element.ancestortags
assert "header" not in element.ancestortags
def test_partition_html_from_url_with_skip_headers_and_footers(mocker):
test_url = "https://example.com"
test_headers = {"User-Agent": "test"}
response = Response()
response.status_code = 200
response._content = b"""<html>
<header>
<p>Header</p>
</header>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
<footer>
<p>Footer</p>
</footer>
</html>"""
response.headers = {"Content-Type": "text/html"}
mocker.patch("requests.get", return_value=response)
elements = partition_html(url=test_url, headers=test_headers, skip_headers_and_footers=True)
for element in elements:
assert "footer" not in element.ancestortags
assert "header" not in element.ancestortags
def test_partition_html_grabs_emphasized_texts():
html_text = """<html>
<p>Hello there I am a very <strong>important</strong> text!</p>
<p>Here is a <span>list</span> of <b>my <i>favorite</i> things</b></p>
<ul>
<li><em>Parrots</em></li>
<li>Dogs</li>
</ul>
<span>A lone span text!</span>
</html>"""
elements = partition_html(text=html_text)
assert elements[0] == NarrativeText("Hello there I am a very important text!")
assert elements[0].metadata.emphasized_text_contents == ["important"]
assert elements[0].metadata.emphasized_text_tags == ["strong"]
assert elements[1] == NarrativeText("Here is a list of my favorite things")
assert elements[1].metadata.emphasized_text_contents == [
"list",
"my favorite things",
"favorite",
]
assert elements[1].metadata.emphasized_text_tags == ["span", "b", "i"]
assert elements[2] == ListItem("Parrots")
assert elements[2].metadata.emphasized_text_contents == ["Parrots"]
assert elements[2].metadata.emphasized_text_tags == ["em"]
assert elements[3] == ListItem("Dogs")
assert elements[3].metadata.emphasized_text_contents is None
assert elements[3].metadata.emphasized_text_tags is None
assert elements[4] == Title("A lone span text!")
assert elements[4].metadata.emphasized_text_contents == ["A lone span text!"]
assert elements[4].metadata.emphasized_text_tags == ["span"]
def test_partition_html_with_json():
elements = partition_html(example_doc_path("example-10k.html"))
assert_round_trips_through_JSON(elements)
def test_pre_tag_parsing_respects_order():
html_text = """
<pre>The Big Brown Bear</pre>
<div>The big brown bear is growling.</div>
<pre>The big brown bear is sleeping.</pre>
<div>The Big Blue Bear</div>
"""
elements = partition_html(text=html_text)
assert elements == [
Title("The Big Brown Bear"),
NarrativeText("The big brown bear is growling."),
NarrativeText("The big brown bear is sleeping."),
Title("The Big Blue Bear"),
]
def test_add_chunking_strategy_on_partition_html(
filename="example-docs/example-10k.html",
):
elements = partition_html(filename=filename)
chunk_elements = partition_html(filename, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_html_heading_title_detection():
html_text = """
<p>This is a section of narrative text, it's long, flows and has meaning</p>
<h1>This is a section of narrative text, it's long, flows and has meaning</h1>
<h2>A heading that is at the second level</h2>
<h3>Finally, the third heading</h3>
<h2>December 1-17, 2017</h2>
<h3>email@example.com</h3>
<h3><li>- bulleted item</li></h3>
"""
elements = partition_html(text=html_text)
assert elements == [
NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
Title("This is a section of narrative text, it's long, flows and has meaning"),
Title("A heading that is at the second level"),
Title("Finally, the third heading"),
Title("December 1-17, 2017"),
EmailAddress("email@example.com"),
ListItem("- bulleted item"),
]
def test_partition_html_element_metadata_has_languages():
filename = "example-docs/example-10k.html"
elements = partition_html(filename=filename)
assert elements[0].metadata.languages == ["eng"]
def test_partition_html_respects_detect_language_per_element():
filename = "example-docs/language-docs/eng_spa_mult.html"
elements = partition_html(filename=filename, detect_language_per_element=True)
langs = [element.metadata.languages for element in elements]
assert langs == [["eng"], ["spa", "eng"], ["eng"], ["eng"], ["spa"]]
@pytest.mark.parametrize(
("tag", "expected"),
[
("thead", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
("tfoot", "<table><tr><td>Header 1</td><td>Header 2</td></tr></table>"),
],
)
def test_partition_html_with_table_without_tbody(tag: str, expected: str):
table_html = (
f"<table>\n"
f" <{tag}>\n"
f" <tr><th>Header 1</th><th>Header 2</th></tr>\n"
f" </{tag}>\n"
f"</table>"
)
partitions = partition_html(text=table_html)
assert partitions[0].metadata.text_as_html == expected
def test_all_element_ids_are_unique():
ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")]
assert len(ids) == len(set(ids))
def test_element_ids_are_deterministic():
ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")]
assert ids == [
"cba9e551ed975e0f8a1956095894e92a",
"f540ea3b6569aafeb433df6616e79971",
"f4a34ee0fac26589fffdb53d0dfedbaf",
"15168aeddbd19da60791109a5a45af65",
"0c027f66120dd96271489dd0bb69bff5",
"abe89090c2e46dda8fff81053cc79f17",
]
def test_partition_html_b_tag_parsing():
html_text = """
<!DOCTYPE html>
<html>
<body>
<div>
<h1>Header 1</h1>
<p>Text</p>
<h2>Header 2</h2>
<pre>
<b>Param1</b> = Y<br><b>Param2</b> = 1<br><b>Param3</b> = 2<br><b>Param4</b> = A
<br><b>Param5</b> = A,B,C,D,E<br><b>Param6</b> = 7<br><b>Param7</b> = Five<br>
</pre>
</div>
</body>
</html>
"""
elements = partition_html(text=html_text)
element_text = "|".join(e.text for e in elements)
assert element_text == (
"Header 1|Text|Header 2|Param1 = Y|Param2 = 1|Param3 = 2|Param4 = A|"
"Param5 = A,B,C,D,E|Param6 = 7|Param7 = Five"
)
def test_partition_html_tag_tail_parsing():
html_text = """
<html>
<body>
<div>
Head
<div><span>Nested</span></div>
Tail
</div>
</body>
</html>
"""
elements = partition_html(text=html_text)
element_text = "|".join([str(el).strip() for el in elements])
assert element_text == "Head|Nested|Tail"
def test_partition_html_links():
html_text = """<html>
<a href="/loner">A lone link!</a>
<p>Hello <a href="/link">link!</a></p>
<p>\n Hello <a href="/link">link!</a></p>
<p><a href="/wiki/parrots">Parrots</a> and <a href="/wiki/dogs">Dogs</a></p>
</html>"""
expected_results = [
[
{"text": "A lone link!", "url": "/loner", "start_index": -1},
],
[
{"text": "link!", "url": "/link", "start_index": 6},
],
[
{"text": "link!", "url": "/link", "start_index": 6},
],
[
{"text": "Parrots", "url": "/wiki/parrots", "start_index": 0},
{"text": "Dogs", "url": "/wiki/dogs", "start_index": 12},
],
]
elements = partition_html(text=html_text)
for el_idx, el in enumerate(elements):
expected_result = expected_results[el_idx]
for link_idx, (text, url, start_index) in enumerate(
zip(el.metadata.link_texts, el.metadata.link_urls, el.metadata.link_start_indexes)
):
assert text == expected_result[link_idx]["text"]
assert url == expected_result[link_idx]["url"]
assert start_index == expected_result[link_idx]["start_index"]