fix: page_number appears in partition_html metadata if include_metadata=False (#658)

* fix: page_number appears in partition_html metadata if include_metadata=False

* Update common.py

* Update CHANGELOG

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
John 2023-05-30 15:47:55 -05:00 committed by GitHub
parent f7cde5539a
commit c78c5b6adf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 37 additions and 3 deletions

View File

@ -11,6 +11,7 @@
* Adds `.log` as a file extension for TXT files.
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
* Allows a passed encoding to be used in the `replace_mime_encodings` function.
* Fixes `page_number` still appearing in the element metadata returned by `partition_html` when `include_metadata=False`.
## 0.6.11

View File

@ -22,6 +22,14 @@ def test_partition_html_from_filename():
assert elements[0].metadata.file_directory == directory
def test_partition_html_from_filename_metadata_false():
    """Partitioning with include_metadata=False must leave every element's metadata empty."""
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(example_docs, "example-10k.html")

    partitioned = partition_html(filename=html_path, include_metadata=False)

    # An element counts as metadata-free when its metadata serializes to an empty dict.
    assert all(not item.metadata.to_dict() for item in partitioned)
def test_partition_html_with_page_breaks():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
elements = partition_html(filename=filename, include_page_breaks=True)

View File

@ -8,7 +8,11 @@ from typing import IO, Callable, List, Optional
from unstructured.documents.elements import Element, PageBreak
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.partition.common import _add_element_metadata, exactly_one
from unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
exactly_one,
)
try:
import magic
@ -380,7 +384,9 @@ def add_metadata_with_filetype(filetype: FileType):
**metadata_kwargs, # type: ignore
)
else:
return elements
return _remove_element_metadata(
elements,
)
return wrapper

View File

@ -84,7 +84,6 @@ def _add_element_metadata(
text_as_html: Optional[str] = layout_element.text_as_html
else:
text_as_html = None
# NOTE(robinson) - defer to the page number that's already in the metadata
# if it's available
if hasattr(element, "metadata"):
@ -111,6 +110,26 @@ def _add_element_metadata(
return elements
def _remove_element_metadata(
    layout_elements,
) -> List[Element]:
    """Removes document metadata from the document elements by resetting each element's
    metadata to a blank ElementMetadata. Document metadata includes information like the
    filename, source url, and page number.

    Each item of layout_elements is passed through normalize_layout_element, which may
    return either a single element or a list of elements; the results are collected, in
    input order, into one flat list that is returned to the caller."""
    elements: List[Element] = []
    for layout_element in layout_elements:
        normalized = normalize_layout_element(layout_element)
        # Treat the single-element and list results uniformly.
        group = normalized if isinstance(normalized, list) else [normalized]
        for element in group:
            # NOTE: build a fresh metadata object per element. Assigning one shared
            # instance to every element would make them alias each other, so an in-place
            # change to one element's metadata would show up on all of them.
            element.metadata = ElementMetadata()
        elements.extend(group)
    return elements
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows