diff --git a/CHANGELOG.md b/CHANGELOG.md index 629b02d8c..7590a3d82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev7 +## 0.15.14-dev8 ### Enhancements @@ -13,6 +13,7 @@ * **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which lead to error when mapping parent id. * **Allow empty text files.** Fixes an issue where text files with only white space would fail to be partitioned. * **Remove double-decoration for CSV, DOC, ODT partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (CSV and DOCX in this case); remove decoration from delegating partitioners. +* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. ## 0.15.13 diff --git a/test_unstructured/partition/test_ppt.py b/test_unstructured/partition/test_ppt.py index c2d8af3e5..09f8e51eb 100644 --- a/test_unstructured/partition/test_ppt.py +++ b/test_unstructured/partition/test_ppt.py @@ -30,11 +30,6 @@ def test_partition_ppt_from_filename(): assert {element.metadata.detection_origin for element in elements} == {"pptx"} -def test_partition_ppt_from_filename_with_metadata_filename(): - elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test") - assert all(element.metadata.filename == "test" for element in elements) - - def test_partition_ppt_raises_with_missing_file(): with pytest.raises(ValueError): partition_ppt(example_doc_path("doesnt-exist.ppt")) @@ -67,6 +62,38 @@ def test_partition_ppt_raises_when_neither_file_path_or_file_is_provided(): partition_ppt() +# -- .metadata.filename -------------------------------------------------------------------------- + + +def test_partition_ppt_from_filename_gets_filename_from_filename_arg(): + elements = partition_ppt(example_doc_path("fake-power-point.ppt")) + + assert len(elements) > 0 + assert all(e.metadata.filename == "fake-power-point.ppt" for e in elements) + + +def test_partition_ppt_from_file_gets_filename_None(): + with open(example_doc_path("fake-power-point.ppt"), "rb") as f: + elements = partition_ppt(file=f) + + assert len(elements) > 0 + assert all(e.metadata.filename is None for e in elements) + + +def test_partition_ppt_from_filename_prefers_metadata_filename(): + elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test") + + assert len(elements) > 0 + assert all(element.metadata.filename == "test" for element in elements) + + +def test_partition_ppt_from_file_prefers_metadata_filename(): + with open(example_doc_path("fake-power-point.ppt"), "rb") as f: + elements = partition_ppt(file=f, metadata_filename="test") + + assert all(e.metadata.filename == "test" for e in elements) + + # -- .metadata.last_modified --------------------------------------------------------------------- diff --git a/test_unstructured/partition/test_pptx.py b/test_unstructured/partition/test_pptx.py index 854262b26..43238a413 100644 --- a/test_unstructured/partition/test_pptx.py +++ b/test_unstructured/partition/test_pptx.py @@ -22,6 +22,7 @@ from test_unstructured.unit_utils import ( assert_round_trips_through_JSON, example_doc_path, function_mock, + property_mock, ) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( @@ -351,7 +352,7 @@ def test_partition_pptx_from_file_prefers_metadata_last_modified(): assert all(e.metadata.last_modified == metadata_last_modified for e in elements) -# ------------------------------------------------------------------------------------------------ +# -- .metadata.languages ------------------------------------------------------------------------- def test_partition_pptx_element_metadata_has_languages(): @@ -374,7 +375,7 @@ def test_partition_pptx_respects_detect_language_per_element(): def test_partition_pptx_raises_TypeError_for_invalid_languages(): with pytest.raises(TypeError): - partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore + partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # == downstream behaviors ======================================================================== @@ -492,7 +493,7 @@ def test_partition_pptx_hierarchy_sample_document(): test_cases = [ (0, None, "b2859226ba1f9243fb3f1b2ace889f43"), (1, "b2859226ba1f9243fb3f1b2ace889f43", "d13f8827e94541c8b818b0df8f942526"), - (None, None, "1ffd3151819e594553e6b540e19e6c36"), + (None, None, "cbb95b030de22979af6bfa42969c8202"), (0, None, "e535f799d1f0e79d6777efa873a16ce1"), (0, "e535f799d1f0e79d6777efa873a16ce1", "f02bbfb417ad60daa2ba35080e96262f"), (0, "e535f799d1f0e79d6777efa873a16ce1", "414dfce72ea53cd4649176af0d62a4c1"), @@ -500,7 +501,7 @@ def test_partition_pptx_hierarchy_sample_document(): (1, "414dfce72ea53cd4649176af0d62a4c1", "a33333f527851f700ca175acd04b8a2c"), (2, "a33333f527851f700ca175acd04b8a2c", "6f1b87689e4da2b0fb865bc5f92d5702"), (0, "e535f799d1f0e79d6777efa873a16ce1", "3f58e0be3b8e8b15cba7adc4eae68586"), - (None, None, "1ffd3151819e594553e6b540e19e6c36"), + (None, None, "e5de1b503e64da424fb7d8113371e16d"), (0, None, "8319096532fe2e55f66c491ea8313150"), (0, "8319096532fe2e55f66c491ea8313150", "17a7e78277ab131a627cb4538bab7390"), (0, "8319096532fe2e55f66c491ea8313150", "41a9e1d0390f4edd77181142ceae51bc"), @@ -514,7 +515,7 @@ def test_partition_pptx_hierarchy_sample_document(): (1, "7f647b1f0f20c3db40c36ab57d9a5550", "6ec455f5f19782facf184886876c9a66"), (2, "6ec455f5f19782facf184886876c9a66", "5614b00c3f6bff23ebba1360e10f6428"), (0, "8319096532fe2e55f66c491ea8313150", "2f57a8d4182e6fd5bd5842b0a2d9841b"), - (None, None, "1ffd3151819e594553e6b540e19e6c36"), + (None, None, "4120066d251ba675ade42e8a167ca61f"), (None, None, "2ed3bd10daace79ac129cbf8faf22bfc"), (0, None, "fd08cacbaddafee5cbacc02528536ee5"), ] @@ -545,8 +546,6 @@ def opts_args() -> dict[str, Any]: "include_page_breaks": True, "include_slide_notes": False, "infer_table_structure": True, - "metadata_file_path": None, - "metadata_last_modified": None, "strategy": "fast", } @@ -632,15 +631,7 @@ class DescribePptxPartitionerOptions: # -- .last_modified -------------------------- - def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( - self, opts_args: dict[str, Any] - ): - opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" - opts = PptxPartitionerOptions(**opts_args) - - assert opts.last_modified == "2024-03-05T17:02:53" - - def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided( self, opts_args: dict[str, Any], get_last_modified_date_: Mock ): opts_args["file_path"] = "a/b/spreadsheet.pptx" @@ -665,21 +656,11 @@ class DescribePptxPartitionerOptions: # -- .metadata_file_path --------------------- - def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( - self, opts_args: dict[str, Any] - ): - opts_args["file_path"] = "x/y/z.pptx" - opts_args["metadata_file_path"] = "a/b/c.pptx" - opts = PptxPartitionerOptions(**opts_args) - - assert opts.metadata_file_path == "a/b/c.pptx" - @pytest.mark.parametrize("file_path", ["u/v/w.pptx", None]) - def and_it_falls_back_to_the_document_file_path_otherwise( + def it_uses_the_filename_argument_when_provided( self, file_path: str | None, opts_args: dict[str, Any] ): opts_args["file_path"] = file_path - opts_args["metadata_file_path"] = None opts = PptxPartitionerOptions(**opts_args) assert opts.metadata_file_path == file_path @@ -769,9 +750,11 @@ class DescribePptxPartitionerOptions: # -- .table_metadata ------------------------- - def it_can_create_table_metadata(self, opts_args: dict[str, Any]): - opts_args["metadata_file_path"] = "d/e/f.pptx" - opts_args["metadata_last_modified"] = "2024-04-02T19:51:55" + def it_can_create_table_metadata( + self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any] + ): + metadata_file_path_prop_.return_value = "d/e/f.pptx" + last_modified_prop_.return_value = "2024-04-02T19:51:55" opts = PptxPartitionerOptions(**opts_args) # -- move to the first slide -- list(opts.increment_page_number()) @@ -786,9 +769,11 @@ class DescribePptxPartitionerOptions: # -- .text_metadata ------------------------- - def it_can_create_text_metadata(self, opts_args: dict[str, Any]): - opts_args["metadata_file_path"] = "d/e/f.pptx" - opts_args["metadata_last_modified"] = "2024-04-02T19:56:40" + def it_can_create_text_metadata( + self, last_modified_prop_: Mock, metadata_file_path_prop_: Mock, opts_args: dict[str, Any] + ): + metadata_file_path_prop_.return_value = "d/e/f.pptx" + last_modified_prop_.return_value = "2024-04-02T19:56:40" opts = PptxPartitionerOptions(**opts_args) # -- move to the first slide -- list(opts.increment_page_number()) @@ -806,3 +791,11 @@ class DescribePptxPartitionerOptions: @pytest.fixture() def get_last_modified_date_(self, request: FixtureRequest): return function_mock(request, "unstructured.partition.pptx.get_last_modified_date") + + @pytest.fixture() + def last_modified_prop_(self, request: FixtureRequest): + return property_mock(request, PptxPartitionerOptions, "last_modified") + + @pytest.fixture() + def metadata_file_path_prop_(self, request: FixtureRequest): + return property_mock(request, PptxPartitionerOptions, "metadata_file_path") diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index f60f16f8e..7a9d25baf 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -321,15 +321,6 @@ def test_partition_xlsx_with_more_than_1k_cells(): class Describe_XlsxPartitionerOptions: """Unit-test suite for `unstructured.partition.xlsx._XlsxPartitionerOptions` objects.""" - @pytest.mark.parametrize("arg_value", [True, False]) - def it_knows_whether_to_detect_language_for_each_element_individually( - self, arg_value: bool, opts_args: dict[str, Any] - ): - opts_args["detect_language_per_element"] = arg_value - opts = _XlsxPartitionerOptions(**opts_args) - - assert opts.detect_language_per_element is arg_value - @pytest.mark.parametrize("arg_value", [True, False]) def it_knows_whether_to_find_subtables_within_each_worksheet_or_return_table_per_worksheet( self, arg_value: bool, opts_args: dict[str, Any] @@ -366,37 +357,20 @@ class Describe_XlsxPartitionerOptions: assert opts.infer_table_structure is arg_value - @pytest.mark.parametrize( - ("arg_value", "expected_value"), - [(None, None), (["eng"], ["eng"]), (["eng", "spa"], ["eng", "spa"])], - ) - def it_knows_what_languages_the_caller_expects_to_appear_in_the_text( - self, arg_value: bool, expected_value: int | None, opts_args: dict[str, Any] - ): - opts_args["languages"] = arg_value - opts = _XlsxPartitionerOptions(**opts_args) + # -- .last_modified -------------------------------------------------------------------------- - assert opts.languages == expected_value - - def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( - self, opts_args: dict[str, Any] - ): - opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" - opts = _XlsxPartitionerOptions(**opts_args) - - assert opts.last_modified == "2024-03-05T17:02:53" - - def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + def it_gets_last_modified_from_the_filesystem_when_a_path_is_provided( self, opts_args: dict[str, Any], get_last_modified_date_: Mock ): + filesystem_last_modified = "2024-04-02T20:32:35" opts_args["file_path"] = "a/b/spreadsheet.xlsx" - get_last_modified_date_.return_value = "2024-04-02T20:32:35" + get_last_modified_date_.return_value = filesystem_last_modified opts = _XlsxPartitionerOptions(**opts_args) last_modified = opts.last_modified get_last_modified_date_.assert_called_once_with("a/b/spreadsheet.xlsx") - assert last_modified == "2024-04-02T20:32:35" + assert last_modified == filesystem_last_modified def but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided( self, opts_args: dict[str, Any] @@ -409,24 +383,13 @@ class Describe_XlsxPartitionerOptions: assert last_modified is None - def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( - self, opts_args: dict[str, Any] - ): + # -- .metadata_file_path --------------------------------------------------------------------- + + def it_uses_the_file_path_argument_when_provided(self, opts_args: dict[str, Any]): opts_args["file_path"] = "x/y/z.xlsx" - opts_args["metadata_file_path"] = "a/b/c.xlsx" opts = _XlsxPartitionerOptions(**opts_args) - assert opts.metadata_file_path == "a/b/c.xlsx" - - @pytest.mark.parametrize("file_path", ["u/v/w.xlsx", None]) - def and_it_falls_back_to_the_document_file_path_otherwise( - self, file_path: str | None, opts_args: dict[str, Any] - ): - opts_args["file_path"] = file_path - opts_args["metadata_file_path"] = None - opts = _XlsxPartitionerOptions(**opts_args) - - assert opts.metadata_file_path == file_path + assert opts.metadata_file_path == "x/y/z.xlsx" # -- fixtures -------------------------------------------------------------------------------- @@ -442,15 +405,11 @@ class Describe_XlsxPartitionerOptions: compact for testing purposes. """ return { - "detect_language_per_element": False, - "file": None, "file_path": None, + "file": None, "find_subtable": True, "include_header": False, "infer_table_structure": True, - "languages": ["auto"], - "metadata_file_path": None, - "metadata_last_modified": None, } diff --git a/test_unstructured/partition/test_xml.py b/test_unstructured/partition/test_xml.py index 1d5ac6d23..6bc183405 100644 --- a/test_unstructured/partition/test_xml.py +++ b/test_unstructured/partition/test_xml.py @@ -114,6 +114,18 @@ def test_partition_xml_from_file_rb_with_tags_raises_encoding_error(): ) +# -- .metadata.filetype -------------------------------------------------------------------------- + + +def test_partition_xml_gets_the_XML_mime_type_in_metadata_filetype(): + XML_MIME_TYPE = "application/xml" + elements = partition_xml(example_doc_path("factbook.xml")) + assert all(e.metadata.filetype == XML_MIME_TYPE for e in elements), ( + f"Expected all elements to have '{XML_MIME_TYPE}' as their filetype, but got:" + f" {repr(elements[0].metadata.filetype)}" + ) + + # -- .metadata.last_modified --------------------------------------------------------------------- diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9f01f933e..1fb34c386 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev7" # pragma: no cover +__version__ = "0.15.14-dev8" # pragma: no cover diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 7b604a28f..630471290 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -20,6 +20,7 @@ DETECTION_ORIGIN: str = "csv" @add_chunking_strategy def partition_csv( filename: str | None = None, + *, file: IO[bytes] | None = None, encoding: str | None = None, include_header: bool = False, diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index df24daac0..461c20836 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -8,7 +8,7 @@ from __future__ import annotations import io from tempfile import SpooledTemporaryFile -from typing import IO, Any, Iterator, Optional, Protocol, Sequence +from typing import IO, Any, Iterator, Protocol, Sequence import pptx from pptx.presentation import Presentation @@ -32,13 +32,10 @@ from unstructured.documents.elements import ( Table, Text, Title, - process_metadata, ) -from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.partition.common.common import convert_ms_office_table_to_text -from unstructured.partition.common.lang import apply_lang_metadata -from unstructured.partition.common.metadata import get_last_modified_date +from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date from unstructured.partition.text_type import ( is_email_address, is_possible_narrative_text, @@ -80,20 +77,15 @@ class AbstractPicturePartitioner(Protocol): # ================================================================================================ -@process_metadata() -@add_metadata_with_filetype(FileType.PPTX) +@apply_metadata(FileType.PPTX) @add_chunking_strategy def partition_pptx( - filename: Optional[str] = None, + filename: str | None = None, *, - file: Optional[IO[bytes]] = None, - detect_language_per_element: bool = False, + file: IO[bytes] | None = None, include_page_breaks: bool = True, - include_slide_notes: Optional[bool] = None, + include_slide_notes: bool | None = None, infer_table_structure: bool = True, - languages: Optional[list[str]] = ["auto"], - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, starting_page_number: int = 1, strategy: str = PartitionStrategy.FAST, **kwargs: Any, @@ -108,12 +100,6 @@ def partition_pptx( A file-like object using "rb" mode --> open(filename, "rb"). include_page_breaks If True, includes a PageBreak element between slides - metadata_filename - The filename to use for the metadata. Relevant because partition_ppt() converts its - (legacy) .ppt document to .pptx before partition. We want the filename of the original - .ppt source file in the metadata. - metadata_last_modified - The last modified date for the document. include_slide_notes If True, includes the slide notes as element infer_table_structure @@ -122,13 +108,6 @@ def partition_pptx( I.e., rows and cells are preserved. Whether True or False, the "text" field is always present in any Table element and is the text content of the table (no structure). - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. starting_page_number Indicates what page number should be assigned to the first slide in the presentation. This information will be reflected in elements' metadata and can be be especially @@ -140,19 +119,11 @@ def partition_pptx( include_page_breaks=include_page_breaks, include_slide_notes=include_slide_notes, infer_table_structure=infer_table_structure, - metadata_file_path=metadata_filename, - metadata_last_modified=metadata_last_modified, strategy=strategy, starting_page_number=starting_page_number, ) - elements = _PptxPartitioner.iter_presentation_elements(opts) - elements = apply_lang_metadata( - elements=elements, - languages=languages, - detect_language_per_element=detect_language_per_element, - ) - return list(elements) + return list(_PptxPartitioner.iter_presentation_elements(opts)) class _PptxPartitioner: @@ -321,7 +292,7 @@ class _PptxPartitioner: detection_origin=DETECTION_ORIGIN, ) - def _order_shapes(self, slide: Slide) -> tuple[Optional[Shape], Sequence[BaseShape]]: + def _order_shapes(self, slide: Slide) -> tuple[Shape | None, Sequence[BaseShape]]: """Orders the shapes on `slide` from top to bottom and left to right. Returns the title shape if it exists and the ordered shapes.""" @@ -365,13 +336,11 @@ class PptxPartitionerOptions: def __init__( self, *, - file: Optional[IO[bytes]], - file_path: Optional[str], + file: IO[bytes] | None, + file_path: str | None, include_page_breaks: bool, - include_slide_notes: Optional[bool], + include_slide_notes: bool | None, infer_table_structure: bool, - metadata_file_path: Optional[str], - metadata_last_modified: Optional[str], strategy: str, starting_page_number: int = 1, ): @@ -380,8 +349,6 @@ class PptxPartitionerOptions: self._include_page_breaks = include_page_breaks self._include_slide_notes = include_slide_notes self._infer_table_structure = infer_table_structure - self._metadata_file_path = metadata_file_path - self._metadata_last_modified = metadata_last_modified self._strategy = strategy # -- options object maintains page-number state -- self._page_counter = starting_page_number - 1 @@ -417,7 +384,9 @@ class PptxPartitionerOptions: yield PageBreak( "", detection_origin=DETECTION_ORIGIN, - metadata=ElementMetadata(last_modified=self.last_modified), + metadata=ElementMetadata( + last_modified=self.last_modified, page_number=self.page_number - 1 + ), ) @lazyproperty @@ -426,27 +395,19 @@ class PptxPartitionerOptions: return self._infer_table_structure @lazyproperty - def last_modified(self) -> Optional[str]: + def last_modified(self) -> str | None: """The best last-modified date available, None if no sources are available.""" - # -- Value explicitly specified by caller takes precedence. This is used for example when - # -- this file was converted from another format, and any last-modified date for the file - # -- would be just now. - if self._metadata_last_modified: - return self._metadata_last_modified + if not self._file_path: + return None - if self._file_path: - return ( - None - if is_temp_file_path(self._file_path) - else get_last_modified_date(self._file_path) - ) - - return None + return ( + None if is_temp_file_path(self._file_path) else get_last_modified_date(self._file_path) + ) @lazyproperty def metadata_file_path(self) -> str | None: """The best available file-path for this document or `None` if unavailable.""" - return self._metadata_file_path or self._file_path + return self._file_path @property def page_number(self) -> int: diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 72c0984e4..8bace2876 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -6,36 +6,24 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring from unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import ( - Element, - ElementMetadata, - Table, - process_metadata, -) -from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.documents.elements import Element, ElementMetadata, Table from unstructured.file_utils.model import FileType from unstructured.partition.common.common import ( exactly_one, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.common.lang import apply_lang_metadata -from unstructured.partition.common.metadata import get_last_modified_date +from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date DETECTION_ORIGIN: str = "tsv" -@process_metadata() -@add_metadata_with_filetype(FileType.TSV) +@apply_metadata(FileType.TSV) @add_chunking_strategy def partition_tsv( filename: Optional[str] = None, + *, file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, include_header: bool = False, - languages: Optional[list[str]] = ["auto"], - # NOTE (jennings) partition_tsv generates a single TableElement - # so detect_language_per_element is not included as a param **kwargs: Any, ) -> list[Element]: """Partitions TSV files into document elements. @@ -48,17 +36,9 @@ def partition_tsv( A file-like object using "rb" mode --> open(filename, "rb"). include_header Determines whether or not header info info is included in text and medatada.text_as_html. - metadata_last_modified - The day of the last modification. - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. """ exactly_one(filename=filename, file=file) - last_modified = get_last_modified_date(filename) if filename else None - header = 0 if include_header else None if filename: @@ -75,14 +55,9 @@ def partition_tsv( metadata = ElementMetadata( text_as_html=html_text, - filename=metadata_filename or filename, - last_modified=metadata_last_modified or last_modified, - languages=languages, + filename=filename, + last_modified=get_last_modified_date(filename) if filename else None, ) metadata.detection_origin = DETECTION_ORIGIN - elements = apply_lang_metadata( - [Table(text=text, metadata=metadata)], - languages=languages, - ) - return list(elements) + return [Table(text=text, metadata=metadata)] diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 7f6e93eca..caeea9053 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -22,12 +22,9 @@ from unstructured.documents.elements import ( Table, Text, Title, - process_metadata, ) -from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType -from unstructured.partition.common.lang import apply_lang_metadata -from unstructured.partition.common.metadata import get_last_modified_date +from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date from unstructured.partition.text_type import ( is_bulleted_text, is_possible_narrative_text, @@ -41,19 +38,15 @@ _CellCoordinate: TypeAlias = "tuple[int, int]" DETECTION_ORIGIN: str = "xlsx" -@process_metadata() -@add_metadata_with_filetype(FileType.XLSX) +@apply_metadata(FileType.XLSX) @add_chunking_strategy def partition_xlsx( filename: Optional[str] = None, + *, file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - infer_table_structure: bool = True, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - metadata_last_modified: Optional[str] = None, - include_header: bool = False, find_subtable: bool = True, + include_header: bool = False, + infer_table_structure: bool = True, starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: @@ -71,28 +64,15 @@ def partition_xlsx( I.e., rows and cells are preserved. Whether True or False, the "text" field is always present in any Table element and is the text content of the table (no structure). - languages - User defined value for metadata.languages if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. - metadata_last_modified - The day of the last modification include_header Determines whether or not header info is included in text and medatada.text_as_html """ opts = _XlsxPartitionerOptions( - detect_language_per_element=detect_language_per_element, - file=file, file_path=filename, + file=file, find_subtable=find_subtable, include_header=include_header, infer_table_structure=infer_table_structure, - languages=languages, - metadata_file_path=metadata_filename, - metadata_last_modified=metadata_last_modified, ) elements: list[Element] = [] @@ -151,13 +131,6 @@ def partition_xlsx( element.metadata = _get_metadata(sheet_name, page_number, opts) elements.append(element) - elements = list( - apply_lang_metadata( - elements=elements, - languages=opts.languages, - detect_language_per_element=opts.detect_language_per_element, - ), - ) return elements @@ -167,30 +140,17 @@ class _XlsxPartitionerOptions: def __init__( self, *, - detect_language_per_element: bool, - file: Optional[IO[bytes]], file_path: Optional[str], + file: Optional[IO[bytes]], find_subtable: bool, include_header: bool, infer_table_structure: bool, - languages: Optional[list[str]], - metadata_file_path: Optional[str], - metadata_last_modified: Optional[str], ): - self._detect_language_per_element = detect_language_per_element - self._file = file self._file_path = file_path + self._file = file self._find_subtable = find_subtable self._include_header = include_header self._infer_table_structure = infer_table_structure - self._languages = languages - self._metadata_file_path = metadata_file_path - self._metadata_last_modified = metadata_last_modified - - @lazyproperty - def detect_language_per_element(self) -> bool: - """When True, detect language on element-by-element basis instead of document level.""" - return self._detect_language_per_element @lazyproperty def find_subtable(self) -> bool: @@ -215,31 +175,15 @@ class _XlsxPartitionerOptions: """True when partitioner should compute and apply `text_as_html` metadata.""" return self._infer_table_structure - @lazyproperty - def languages(self) -> Optional[list[str]]: - """User-specified language(s) of this document. - - When `None`, language is detected using naive Bayesian filter via `langdetect`. Multiple - language codes indicate text could be in any of those languages. - """ - return self._languages - @lazyproperty def last_modified(self) -> Optional[str]: """The best last-modified date available, None if no sources are available.""" - # -- value explicitly specified by caller takes precedence -- - if self._metadata_last_modified: - return self._metadata_last_modified - - if self._file_path: - return get_last_modified_date(self._file_path) - - return None + return get_last_modified_date(self._file_path) if self._file_path else None @lazyproperty def metadata_file_path(self) -> str | None: """The best available file-path for this document or `None` if unavailable.""" - return self._metadata_file_path or self._file_path + return self._file_path @lazyproperty def sheets(self) -> dict[str, pd.DataFrame]: diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index 1ed30966b..79c991087 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -2,92 +2,34 @@ from __future__ import annotations import copy from io import BytesIO -from typing import IO, Any, Iterator, Optional, cast +from typing import IO, Any, Iterator, cast from lxml import etree from unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import ( - Element, - ElementMetadata, - Text, - process_metadata, -) +from unstructured.documents.elements import Element, ElementMetadata, Text from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from unstructured.partition.common.common import ( exactly_one, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.common.lang import apply_lang_metadata -from unstructured.partition.common.metadata import get_last_modified_date +from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date from unstructured.partition.text import element_from_text DETECTION_ORIGIN: str = "xml" -def get_leaf_elements( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - text: Optional[str] = None, - xml_path: Optional[str] = None, -) -> Iterator[Optional[str]]: - """Get leaf elements from the XML tree defined in filename, file, or text.""" - exactly_one(filename=filename, file=file, text=text) - if filename: - return _get_leaf_elements(filename, xml_path=xml_path) - elif file: - return _get_leaf_elements(file=spooled_to_bytes_io_if_needed(file), xml_path=xml_path) - else: - b = BytesIO(bytes(cast(str, text), encoding="utf-8")) - return _get_leaf_elements(b, xml_path=xml_path) - - -def _get_leaf_elements( - file: str | IO[bytes], - xml_path: Optional[str] = None, -) -> Iterator[Optional[str]]: - """Parse the XML tree in a memory efficient manner if possible.""" - element_stack: list[etree._Element] = [] # pyright: ignore[reportPrivateUsage] - - element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False) - # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream - # elements through in a memory efficient way, so we bite the bullet and load it all into - # memory. - if xml_path is not None: - _, element = next(element_iterator) - compiled_path = etree.XPath(xml_path) - element_iterator = (("end", el) for el in compiled_path(element)) - - for event, element in element_iterator: - if event == "start": - element_stack.append(element) - - if event == "end": - if element.text is not None and element.text.strip(): - yield element.text - - element.clear() - - while element_stack and element_stack[-1].getparent() is None: - element_stack.pop() - - -@process_metadata() -@add_metadata_with_filetype(FileType.XML) +@apply_metadata(FileType.XML) @add_chunking_strategy def partition_xml( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - text: Optional[str] = None, + filename: str | None = None, + *, + file: IO[bytes] | None = None, + text: str | None = None, + encoding: str | None = None, xml_keep_tags: bool = False, - xml_path: Optional[str] = None, - metadata_filename: Optional[str] = None, - encoding: Optional[str] = None, - metadata_last_modified: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, + xml_path: str | None = None, **kwargs: Any, ) -> list[Element]: """Partitions an XML document into its document elements. @@ -100,32 +42,20 @@ def partition_xml( A file-like object using "rb" mode --> open(filename, "rb"). text The text of the XML file. + encoding + The encoding method used to decode the text input. If None, utf-8 will be used. xml_keep_tags If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. xml_path The xml_path to use for extracting the text. Only used if xml_keep_tags=False. - encoding - The encoding method used to decode the text input. If None, utf-8 will be used. - metadata_last_modified - The day of the last modification. - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. """ exactly_one(filename=filename, file=file, text=text) elements: list[Element] = [] - last_modification_date = get_last_modified_date(filename) if filename else None - metadata = ElementMetadata( - filename=metadata_filename or filename, - last_modified=metadata_last_modified or last_modification_date, + filename=filename, last_modified=get_last_modified_date(filename) if filename else None ) metadata.detection_origin = DETECTION_ORIGIN @@ -153,11 +83,48 @@ def partition_xml( element.metadata = copy.deepcopy(metadata) elements.append(element) - elements = list( - apply_lang_metadata( - elements=elements, - languages=languages, - detect_language_per_element=detect_language_per_element, - ), - ) return elements + + +def get_leaf_elements( + filename: str | None, file: IO[bytes] | None, text: str | None, xml_path: str | None +) -> Iterator[str | None]: + """Get leaf elements from the XML tree defined in filename, file, or text.""" + exactly_one(filename=filename, file=file, text=text) + if filename: + return _get_leaf_elements(filename, xml_path=xml_path) + elif file: + return _get_leaf_elements(file=spooled_to_bytes_io_if_needed(file), xml_path=xml_path) + else: + b = BytesIO(bytes(cast(str, text), encoding="utf-8")) + return _get_leaf_elements(b, xml_path=xml_path) + + +def _get_leaf_elements( + file: str | IO[bytes], + xml_path: str | None, +) -> Iterator[str | None]: + """Parse the XML tree in a memory efficient manner if possible.""" + element_stack: list[etree._Element] = [] # pyright: ignore[reportPrivateUsage] + + element_iterator = etree.iterparse(file, events=("start", "end"), resolve_entities=False) + # NOTE(alan) If xml_path is used for filtering, I've yet to find a good way to stream + # elements through in a memory efficient way, so we bite the bullet and load it all into + # memory. + if xml_path is not None: + _, element = next(element_iterator) + compiled_path = etree.XPath(xml_path) + element_iterator = (("end", el) for el in compiled_path(element)) + + for event, element in element_iterator: + if event == "start": + element_stack.append(element) + + if event == "end": + if element.text is not None and element.text.strip(): + yield element.text + + element.clear() + + while element_stack and element_stack[-1].getparent() is None: + element_stack.pop()