unstructured/test_unstructured/partition/test_xml_partition.py
Matt Robinson d9aed66b65
feat: add document date for remaining file types (#930) (#969)
* feat: add document date for remaining file types (#930)

* feat: add functions for getting modification date

* feat: add date field to metadata from csv file

* feat: add tests for csv partition

* feat: add date field to metadata from html file

* feat: add tests for html partition

* fix: return file name only if possible

* feat: add csv tests

* fix: renaming

* feat: add field metadata_date as date of last mod

* feat: add tests for partition_docx

* feat: add field metadata_date to .doc file

* feat: add tests for partition_doc

* feat: add metadata_date  to .epub file

* feat: add tests for partition_epub

* fix: fix test mocking

* feat: add metadata_date for image partition

* feat: add test for image partition

* feat: add coordinate system argument

* feat: add date to element metadata

* feat: add metadata_date for JSON partition

* feat: add test for JSON partition

* fix: rename variable

* feat: add metadata_date for md partition

* feat: add test for md partition

* feat: update doc string

* feat: add metadata_date for .odt partition

* feat: update .odt string

* feat: add metadata_date for .org partition

* feat: add tests for .org partition

* feat: add metadata_date for .pdf partition

* feat: add tests for .pdf partition

* feat: add metadata_date for .pptx partition

* feat: add metadata_date for .ppt partition

* feat: add tests for .ppt partition

* feat: add tests for .pptx partition

* feat: add metadata_date for .rst partition

* feat: add tests for .rst partition

* fix: get modification date after file checking

* feat: add tests for .rtf partition

* feat: add tests for .rtf partition

* feat: add metadata_date for .txt partition

* fix: rename argument

* feat: add tests for .txt partition

* feat: update doc string rst partition function

* feat: add metadata_date for .tsv partition

* feat: add tests for .tsv partition

* feat: add metadata_date for .xlsx partition

* feat: add tests for .xlsx partition

* fix: clean up

* feat: add tests for .xml partition

* feat: add tests for .xml partition

* fix: use `or ` instead of `if`

* fix: fix epub tests

* fix: remove not used code

* fix: add try block for getting file name

* fix: applying linter changes

* fix: fix test_partition_file

* feat: add metadata_date for email

* feat: add test for email partition

* feat: add metadata_date for msg

* feat: add tests for msg partition

* feat: update CHANGELOG file

* fix: update partitions doc string

* don't push

* fix: clean up code

* linting, linting, linting

* remove unnecessary example doc

* update version and changelog

* ingest-test-fixtures-update

* set metadata date in test

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>

* ingest-test-fixtures-update

* Update ingest test fixtures (#970)

Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>

* Revert "Update ingest test fixtures (#970)"

This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2.

* remove date from metadata in outputs

* update docstring ordering

* remove print

* remove print

* remove print

* linting, linting, linting

* fix version and test

* fix changelog

* fix changelog

* update version

---------

Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00

234 lines
7.4 KiB
Python

import os
import pathlib
import pytest
from unstructured.partition.xml import partition_xml
# Resolved directory containing this test module; the tests below build
# sample-document paths relative to it.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename(filename):
    """Partition each sample document by path with XML tags stripped.

    The first element must carry the expected text, and its metadata filename
    must equal the bare file name even though a longer path is passed in.
    """
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    first = partition_xml(filename=xml_path, xml_keep_tags=False)[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
def test_partition_xml_from_filename_with_metadata_filename():
    """An explicit ``metadata_filename`` is what ends up in the element metadata."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", "factbook.xml")
    first = partition_xml(
        filename=xml_path,
        xml_keep_tags=False,
        metadata_filename="test",
    )[0]
    assert first.text == "United States"
    assert first.metadata.filename == "test"
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file(filename):
    """Partition each sample document from a file object opened in text mode.

    The full path is supplied as ``metadata_filename``; the metadata is still
    expected to report only the bare file name.
    """
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    with open(xml_path) as xml_file:
        elements = partition_xml(
            file=xml_file,
            xml_keep_tags=False,
            metadata_filename=xml_path,
        )

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
def test_partition_xml_from_file_with_metadata_filename():
    """A file-object partition reports the explicit ``metadata_filename`` value."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", "factbook.xml")
    with open(xml_path) as xml_file:
        elements = partition_xml(
            file=xml_file,
            xml_keep_tags=False,
            metadata_filename="test",
        )

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == "test"
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb(filename):
    """Partition each sample document from a file object opened in binary mode."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    with open(xml_path, "rb") as xml_file:
        elements = partition_xml(
            file=xml_file,
            xml_keep_tags=False,
            metadata_filename=xml_path,
        )

    first = elements[0]
    assert first.text == "United States"
    assert first.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename_with_tags_default_encoding(filename):
    """Partition by path with tags kept and no explicit encoding.

    The element at index 5 is expected to be the tagged country name.
    """
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    tagged = partition_xml(filename=xml_path, xml_keep_tags=True)[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_filename_with_tags_raises_encoding_error(filename, encoding, error):
    """Partitioning by path with the parametrized encoding must raise ``error``."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    with pytest.raises(error):
        partition_xml(filename=xml_path, xml_keep_tags=True, encoding=encoding)
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_with_tags_default_encoding(filename):
    """Partition from a text-mode file object with tags kept and no explicit encoding."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    with open(xml_path) as xml_file:
        elements = partition_xml(
            file=xml_file,
            xml_keep_tags=True,
            metadata_filename=xml_path,
        )

    tagged = elements[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_rb_with_tags_default_encoding(filename):
    """Partition from a binary-mode file object with tags kept and no explicit encoding."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    with open(xml_path, "rb") as xml_file:
        elements = partition_xml(
            file=xml_file,
            xml_keep_tags=True,
            metadata_filename=xml_path,
        )

    tagged = elements[5]
    assert tagged.text == "<name>United States</name>"
    assert tagged.metadata.filename == filename
@pytest.mark.parametrize(
    ("filename", "encoding", "error"),
    [("factbook-utf-16.xml", "utf-8", UnicodeDecodeError)],
)
def test_partition_xml_from_file_rb_with_tags_raises_encoding_error(filename, encoding, error):
    """Partitioning a binary file object with the parametrized encoding must raise ``error``."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    # Same entry order as a combined ``with``: the raises-context wraps the open.
    with pytest.raises(error):
        with open(xml_path, "rb") as xml_file:
            partition_xml(
                file=xml_file,
                xml_keep_tags=True,
                metadata_filename=xml_path,
                encoding=encoding,
            )
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_filename_exclude_metadata(filename):
    """With ``include_metadata=False`` every element's metadata serializes to ``{}``."""
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    elements = partition_xml(
        filename=xml_path,
        xml_keep_tags=False,
        include_metadata=False,
    )

    assert elements[0].text == "United States"
    for element in elements:
        assert element.metadata.to_dict() == {}
@pytest.mark.parametrize("filename", ["factbook.xml", "factbook-utf-16.xml"])
def test_partition_xml_from_file_exclude_metadata(filename):
    """File-object variant: ``include_metadata=False`` leaves every metadata dict empty.

    A ``metadata_filename`` is still passed, and the metadata is nevertheless
    expected to serialize to ``{}``.
    """
    xml_path = os.path.join(DIRECTORY, os.pardir, os.pardir, "example-docs", filename)
    partition_kwargs = {
        "xml_keep_tags": False,
        "metadata_filename": xml_path,
        "include_metadata": False,
    }
    with open(xml_path) as xml_file:
        elements = partition_xml(file=xml_file, **partition_kwargs)

    assert elements[0].text == "United States"
    for element in elements:
        assert element.metadata.to_dict() == {}
def test_partition_xml_metadata_date(
    mocker,
    filename="example-docs/factbook.xml",
):
    """The element date comes from the last-modified lookup when partitioning by path.

    ``get_last_modified_date`` is patched inside ``unstructured.partition.xml`` so
    the expected value is fixed.

    Args:
        mocker: fixture providing ``patch`` (presumably pytest-mock's ``mocker``).
        filename: sample document path. A relative path is resolved against the
            directory two levels above this test module, the same base the other
            tests in this file use, instead of the current working directory.
    """
    # Anchor the path so the test does not depend on where pytest is launched.
    # os.path.join drops the earlier components when ``filename`` is absolute,
    # so an explicit absolute path is used unchanged.
    file_path = os.path.join(DIRECTORY, "..", "..", filename)
    mocked_last_modification_date = "2029-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    elements = partition_xml(
        filename=file_path,
    )

    assert elements[0].metadata.date == mocked_last_modification_date
def test_partition_xml_with_custom_metadata_date(
    mocker,
    filename="example-docs/factbook.xml",
):
    """An explicit ``metadata_date`` wins over the patched last-modified lookup.

    Args:
        mocker: fixture providing ``patch`` (presumably pytest-mock's ``mocker``).
        filename: sample document path. A relative path is resolved against the
            directory two levels above this test module, the same base the other
            tests in this file use, instead of the current working directory.
    """
    # Anchor the path so the test does not depend on where pytest is launched.
    # os.path.join drops the earlier components when ``filename`` is absolute,
    # so an explicit absolute path is used unchanged.
    file_path = os.path.join(DIRECTORY, "..", "..", filename)
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    elements = partition_xml(
        filename=file_path,
        metadata_date=expected_last_modification_date,
    )

    assert elements[0].metadata.date == expected_last_modification_date
def test_partition_xml_from_file_metadata_date(
    mocker,
    filename="example-docs/factbook.xml",
):
    """The element date comes from the file-based last-modified lookup.

    ``get_last_modified_date_from_file`` is patched inside
    ``unstructured.partition.xml`` so the expected value is fixed.

    Args:
        mocker: fixture providing ``patch`` (presumably pytest-mock's ``mocker``).
        filename: sample document path. A relative path is resolved against the
            directory two levels above this test module, the same base the other
            tests in this file use, instead of the current working directory.
    """
    # Anchor the path so the test does not depend on where pytest is launched.
    # os.path.join drops the earlier components when ``filename`` is absolute,
    # so an explicit absolute path is used unchanged.
    file_path = os.path.join(DIRECTORY, "..", "..", filename)
    mocked_last_modification_date = "2029-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(file_path, "rb") as f:
        elements = partition_xml(
            file=f,
        )

    assert elements[0].metadata.date == mocked_last_modification_date
def test_partition_xml_from_file_with_custom_metadata_date(
    mocker,
    filename="example-docs/factbook.xml",
):
    """An explicit ``metadata_date`` wins over the patched file-based lookup.

    Args:
        mocker: fixture providing ``patch`` (presumably pytest-mock's ``mocker``).
        filename: sample document path. A relative path is resolved against the
            directory two levels above this test module, the same base the other
            tests in this file use, instead of the current working directory.
    """
    # Anchor the path so the test does not depend on where pytest is launched.
    # os.path.join drops the earlier components when ``filename`` is absolute,
    # so an explicit absolute path is used unchanged.
    file_path = os.path.join(DIRECTORY, "..", "..", filename)
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2020-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.xml.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(file_path, "rb") as f:
        elements = partition_xml(file=f, metadata_date=expected_last_modification_date)

    assert elements[0].metadata.date == expected_last_modification_date