226 lines
7.6 KiB
Python
Raw Normal View History

# pyright: reportPrivateUsage=false
"""Test suite for `unstructured.partition.odt` module."""
from __future__ import annotations
from typing import Any, Iterator
import pytest
from pytest_mock import MockFixture
from test_unstructured.unit_utils import (
ANY,
FixtureRequest,
assert_round_trips_through_JSON,
example_doc_path,
method_mock,
)
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import (
CompositeElement,
Element,
Table,
TableChunk,
Text,
Title,
)
from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
def test_partition_odt_matches_partition_docx():
odt_file_path = example_doc_path("simple.odt")
docx_file_path = example_doc_path("simple.docx")
assert partition_odt(odt_file_path) == partition_docx(docx_file_path)
# -- document-source (file or filename) ----------------------------------------------------------
def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt"))
assert elements == [
Title("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
" Time 1pm 2pm 3pm"
" Leader Sarah Mark Ryan"
),
]
assert all(e.metadata.filename == "fake.odt" for e in elements)
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
# -- document is ultimately partitioned by partition_docx() --
assert {e.metadata.detection_origin for e in elements} == {"docx"}
def test_partition_odt_from_file():
with open(example_doc_path("fake.odt"), "rb") as f:
elements = partition_odt(file=f)
assert elements == [
Title("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
" Time 1pm 2pm 3pm"
" Leader Sarah Mark Ryan"
),
]
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_odt_from_filename_gets_the_ODT_filename_in_metadata_not_the_DOCX_filename():
elements = partition_odt(example_doc_path("simple.odt"))
assert all(e.metadata.filename == "simple.odt" for e in elements), (
f"Expected all elements to have 'simple.odt' as their filename, but got:"
f" {repr(elements[0].metadata.filename)}"
)
def test_partition_odt_from_filename_with_metadata_filename():
elements = partition_odt(example_doc_path("fake.odt"), metadata_filename="test")
assert all(e.metadata.filename == "test" for e in elements)
def test_partition_odt_from_file_with_metadata_filename():
with open(example_doc_path("fake.odt"), "rb") as f:
elements = partition_odt(file=f, metadata_filename="test")
assert all(e.metadata.filename == "test" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_odt_gets_the_ODT_MIME_type_in_metadata_filetype():
ODT_MIME_TYPE = "application/vnd.oasis.opendocument.text"
elements = partition_odt(example_doc_path("simple.odt"))
assert all(e.metadata.filetype == ODT_MIME_TYPE for e in elements), (
f"Expected all elements to have '{ODT_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.text_as_html ----------------------------------------------------------------------
@pytest.mark.parametrize("kwargs", [{}, {"infer_table_structure": True}])
def test_partition_odt_adds_text_as_html_when_infer_table_structure_is_omitted_or_True(
kwargs: dict[str, Any],
):
with open(example_doc_path("fake.odt"), "rb") as f:
elements = partition_odt(file=f, **kwargs)
table = elements[1]
assert isinstance(table, Table)
assert table.metadata.text_as_html is not None
assert table.metadata.text_as_html.startswith("<table>")
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_odt_suppresses_text_as_html_when_infer_table_structure_is_False():
with open(example_doc_path("fake.odt"), "rb") as f:
elements = partition_odt(file=f, infer_table_structure=False)
table = elements[1]
assert isinstance(table, Table)
assert table.metadata.text_as_html is None
# -- .metadata.last_modified ---------------------------------------------------------------------
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_odt_pulls_last_modified_from_filesystem(mocker: MockFixture):
filesystem_last_modified = "2029-07-05T09:24:28"
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
mocker.patch(
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
"unstructured.partition.odt.get_last_modified_date", return_value=filesystem_last_modified
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
elements = partition_odt(example_doc_path("fake.odt"))
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
assert all(e.metadata.last_modified == filesystem_last_modified for e in elements)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
def test_partition_odt_prefers_metadata_last_modified_when_provided(mocker: MockFixture):
filesystem_last_modified = "2029-07-05T09:24:28"
metadata_last_modified = "2020-07-05T09:24:28"
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
mocker.patch(
rfctr(part): prepare for pluggable auto-partitioners 1 (#3655) **Summary** In preparation for pluggable auto-partitioners simplify metadata as discussed. **Additional Context** - Pluggable auto-partitioners requires partitioners to have a consistent call signature. An arbitrary partitioner provided at runtime needs to have a call signature that is known and consistent. Basically `partition_x(filename, *, file, **kwargs)`. - The current `auto.partition()` is highly coupled to each distinct file-type partitioner, deciding which arguments to forward to each. - This is driven by the existence of "delegating" partitioners, those that convert their file-type and then call a second partitioner to do the actual partitioning. Both the delegating and proxy partitioners are decorated with metadata-post-processing decorators and those decorators are not idempotent. We call the situation where those decorators would run twice "double-decorating". For example, EPUB converts to HTML and calls `partition_html()` and both `partition_epub()` and `partition_html()` are decorated. - The way double-decorating has been avoided in the past is to avoid sending the arguments the metadata decorators are sensitive to to the proxy partitioner. This is very obscure, complex to reason about, error-prone, and just overall not a viable strategy. The better solution is to not decorate delegating partitioners and let the proxy partitioner handle all the metadata. - This first step in preparation for that is part of simplifying the metadata processing by removing unused or unwanted legacy parameters. - `date_from_file_object` is a misnomer because a file-object never contains last-modified data. - It can never produce useful results in the API where last-modified information must be provided by `metadata_last_modified`. - It is an undocumented parameter so not in use. - Using it can produce incorrect metadata.
2024-09-23 15:23:10 -07:00
"unstructured.partition.odt.get_last_modified_date", return_value=filesystem_last_modified
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
elements = partition_odt(
example_doc_path("simple.odt"), metadata_last_modified=metadata_last_modified
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
)
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
feat: add document date for remaining file types (#930) (#969) * feat: add document date for remaining file types (#930) * feat: add functions for getting modification date * feat: add date field to metadata from csv file * feat: add tests for csv patition * feat: add date field to metadata from html file * feat: add tests for html partition * fix: return file name onlyif possible * feat: add csv tests * fix: renaming * feat: add filed metadata_date as date of last mod * feat: add tests for partition_docx * feat: add filed metadata_date to .doc file * feat: add tests for partition_doc * feat: add metadata_date to .epub file * feat: add tests for partition_epub * fix: fix test mocking * feat: add metadata_date for image partition * feat: add test for image partition * feat: add coorrdinate system argument * feat: add date to element metadata * feat: add metadata_date for JSON partition * feat: add test for JSON partition * fix: rename variable * feat: add metadata_date for md partition * feat: add test for md partition * feat: update doc string * feat: add metadata_date for .odt partition * feat: update .odt string * feat: add metadata_date for .org partition * feat: add tests for .org partition * feat: add metadata_date for .pdf partition * feat: add tests for .pdf partition * feat: add metadata_date for .pptx partition * feat: add metadata_date for .ppt partition * feat: add tests for .ppt partition * feat: add tests for .pptx partition * feat: add metadata_date for .rst partition * feat: add tests for .rst partition * fix: get modification date after file checking * feat: add tests for .rtf partition * feat: add tests for .rtf partition * feat: add metadata_date for .txt partition * fix: rename argument * feat: add tests for .txt partition * feat: update doc string rst patrition function * feat: add metadata_date for .tsv partition * feat: add tests for .tsv partition * feat: add metadata_date for .xlsx partition * feat: add tests for .xlsx partition * fix: clean up * feat: add tests for .xml partition * feat: add tests for .xml partition * fix: use `or ` instead of `if` * fix: fix epub tests * fix: remove not used code * fix: add try block for getting file name * fix: applying linter changes * fix: fix test_partition_file * feat: add metadata_date for email * feat: add test for email partition * feat: add metadata_date for msg * feat: add tests for msg partition * feat: update CHANGELOG file * fix: update partitions doc string * don't push * fix: clean up code * linting, linting, linting * remove unnecessary example doc * update version and changelog * ingest-test-fixtures-update * set metadata date in test --------- Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io> * ingest-test-fixtures-update * Update ingest test fixtures (#970) Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com> * Revert "Update ingest test fixtures (#970)" This reverts commit 1d182ae474b3545b15551fffc15977757d552cd2. * remove date from metadata in outputs * update docstring ordering * remove print * remove print * remove print * linting, linting, linting * fix version and test * fix changelog * fix changelog * update version --------- Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
2023-07-26 15:10:14 -04:00
# -- .metadata.languages -------------------------------------------------------------------------
def test_partition_odt_adds_languages_metadata():
elements = partition_odt(example_doc_path("simple.odt"))
assert all(e.metadata.languages == ["eng"] for e in elements)
def test_partition_odt_respects_detect_language_per_element_arg():
elements = partition_odt(
example_doc_path("language-docs/eng_spa_mult.odt"), detect_language_per_element=True
chore: Table chunking (#1540) This change is adding to our `add_chunking_strategy` logic so that we are able to chunk Table elements' `text` and `text_as_html` params. In order to keep the functionality under the same `by_title` chunking strategy we have renamed the `combine_under_n_chars` to `max_characters`. It functions the same way for the combining elements under Title's, as well as specifying a chunk size (in chars) for TableChunk elements. *renaming the variable to `max_characters` will also reflect the 'hard max' we will implement for large elements in followup PRs Additionally -> some lint changes snuck in when I ran `make tidy` hence the minor changes in unrelated files :) TODO: ✅ add unit tests --> note: added where I could to unit tests! Some unit tests I just clarified that the chunking strategy was now 'by_title' because we don't have a file example that has Table elements to test the 'by_num_characters' chunking strategy ✅ update changelog To manually test: ``` In [1]: filename="example-docs/example-10k.html" In [2]: from unstructured.chunking.title import chunk_table_element In [3]: from unstructured.partition.auto import partition In [4]: elements = partition(filename) # element at -2 happens to be a Table, and we'll get chunks of char size 4 here In [5]: chunks = chunk_table_element(elements[-2], 4) # examine text and text_as_html params ln [6]: for c in chunks: print(c.text) print(c.metadata.text_as_html) ``` --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2023-10-03 09:40:34 -07:00
)
assert [e.metadata.languages for e in elements] == [
["eng"],
["spa", "eng"],
["eng"],
["eng"],
["spa"],
]
# -- miscellaneous -------------------------------------------------------------------------------
@pytest.mark.parametrize(
("kwargs", "expected_value"),
[({}, "hi_res"), ({"strategy": None}, "hi_res"), ({"strategy": "auto"}, "auto")],
)
def test_partition_odt_forwards_strategy_arg_to_partition_docx(
request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
from unstructured.partition.docx import _DocxPartitioner
def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
yield Text(f"strategy == {self._opts.strategy}")
_iter_elements_ = method_mock(
request,
_DocxPartitioner,
"_iter_document_elements",
side_effect=fake_iter_document_elements,
)
(element,) = partition_odt(example_doc_path("simple.odt"), **kwargs)
_iter_elements_.assert_called_once_with(ANY)
assert element.text == f"strategy == {expected_value}"
def test_partition_odt_round_trips_through_json():
"""Elements produced can be serialized then deserialized without loss."""
assert_round_trips_through_JSON(partition_odt(example_doc_path("simple.odt")))
def test_partition_odt_chunks_elements_when_chunking_strategy_is_specified():
document_path = example_doc_path("simple.odt")
elements = partition_odt(document_path)
chunks = partition_odt(document_path, chunking_strategy="basic")
# -- all chunks are chunk element-types --
assert all(isinstance(c, (CompositeElement, Table, TableChunk)) for c in chunks)
# -- chunks from partitioning match those produced by chunking elements in separate step --
assert chunks == chunk_elements(elements)