From c78c5b6adfcbf4606f3fc645a34b1ba201605c0c Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Tue, 30 May 2023 15:47:55 -0500 Subject: [PATCH] fix: `page_number` appears in `partition_html` metadata if `include_metadata=False` (#658) * fix: page_number appears in partition_html metadata if include_metadata=False * Update common.py * Update CHANGELOG --------- Co-authored-by: Matt Robinson --- CHANGELOG.md | 1 + .../partition/test_html_partition.py | 8 +++++++ unstructured/file_utils/filetype.py | 10 +++++++-- unstructured/partition/common.py | 21 ++++++++++++++++++- 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7abb1afba..7195cc0d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * Adds `.log` as a file extension for TXT files. * Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding. 
* Allow passed encoding to be used in the `replace_mime_encodings` +* Fixes page metadata for `partition_html` when `include_metadata=False` ## 0.6.11 diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py index b2410ce86..4a56f793b 100644 --- a/test_unstructured/partition/test_html_partition.py +++ b/test_unstructured/partition/test_html_partition.py @@ -22,6 +22,14 @@ def test_partition_html_from_filename(): assert elements[0].metadata.file_directory == directory +def test_partition_html_from_filename_metadata_false(): + directory = os.path.join(DIRECTORY, "..", "..", "example-docs") + filename = os.path.join(directory, "example-10k.html") + elements = partition_html(filename=filename, include_metadata=False) + metadata_present = any(element.metadata.to_dict() for element in elements) + assert not metadata_present + + def test_partition_html_with_page_breaks(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html") elements = partition_html(filename=filename, include_page_breaks=True) diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index fa440afe1..7b6cd12f5 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -8,7 +8,11 @@ from typing import IO, Callable, List, Optional from unstructured.documents.elements import Element, PageBreak from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN -from unstructured.partition.common import _add_element_metadata, exactly_one +from unstructured.partition.common import ( + _add_element_metadata, + _remove_element_metadata, + exactly_one, +) try: import magic @@ -380,7 +384,9 @@ def add_metadata_with_filetype(filetype: FileType): **metadata_kwargs, # type: ignore ) else: - return elements + return _remove_element_metadata( + elements, + ) return wrapper diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 
488176d4a..e0cf1d295 100644
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@@ -84,7 +84,6 @@ def _add_element_metadata(
             text_as_html: Optional[str] = layout_element.text_as_html
         else:
             text_as_html = None
-
         # NOTE(robinson) - defer to the page number that's already in the metadata
         # if it's available
         if hasattr(element, "metadata"):
@@ -111,6 +110,26 @@ def _add_element_metadata(
     return elements
 
 
+def _remove_element_metadata(
+    layout_elements,
+) -> List[Element]:
+    """Removes document metadata (e.g. filename, source url, page number) from every element by
+    assigning each one its own empty ElementMetadata. Returns the elements as a flat list."""
+    elements: List[Element] = []
+    for layout_element in layout_elements:
+        element = normalize_layout_element(layout_element)
+        # normalize_layout_element may hand back a list of elements instead of a single one
+        if isinstance(element, list):
+            for _element in element:
+                # NOTE - fresh instance per element: a shared one would alias later mutations
+                _element.metadata = ElementMetadata()
+            elements.extend(element)
+        else:
+            element.metadata = ElementMetadata()
+            elements.append(element)
+    return elements
+
+
 def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
     """Converts a .doc file to a .docx file using the libreoffice CLI."""
     # NOTE(robinson) - In the future can also include win32com client as a fallback for windows