mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
fix: page_number appears in partition_html metadata if include_metadata=False (#658)
* fix: page_number appears in partition_html metadata if include_metadata=False
* Update common.py
* Update CHANGELOG

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
f7cde5539a
commit
c78c5b6adf
@ -11,6 +11,7 @@
|
||||
* Adds `.log` as a file extension for TXT files.
|
||||
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
|
||||
* Allows the passed encoding to be used in `replace_mime_encodings`.
|
||||
* Fixes page metadata for `partition_html` when `include_metadata=False`
|
||||
|
||||
## 0.6.11
|
||||
|
||||
|
@ -22,6 +22,14 @@ def test_partition_html_from_filename():
|
||||
assert elements[0].metadata.file_directory == directory
|
||||
|
||||
|
||||
def test_partition_html_from_filename_metadata_false():
    """With include_metadata=False, no partitioned element may report any metadata."""
    html_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
    elements = partition_html(filename=html_path, include_metadata=False)
    # An element "has metadata" when its metadata serializes to a non-empty dict.
    assert all(not element.metadata.to_dict() for element in elements)
|
||||
|
||||
|
||||
def test_partition_html_with_page_breaks():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
|
||||
elements = partition_html(filename=filename, include_page_breaks=True)
|
||||
|
@ -8,7 +8,11 @@ from typing import IO, Callable, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element, PageBreak
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.partition.common import _add_element_metadata, exactly_one
|
||||
from unstructured.partition.common import (
|
||||
_add_element_metadata,
|
||||
_remove_element_metadata,
|
||||
exactly_one,
|
||||
)
|
||||
|
||||
try:
|
||||
import magic
|
||||
@ -380,7 +384,9 @@ def add_metadata_with_filetype(filetype: FileType):
|
||||
**metadata_kwargs, # type: ignore
|
||||
)
|
||||
else:
|
||||
return elements
|
||||
return _remove_element_metadata(
|
||||
elements,
|
||||
)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
@ -84,7 +84,6 @@ def _add_element_metadata(
|
||||
text_as_html: Optional[str] = layout_element.text_as_html
|
||||
else:
|
||||
text_as_html = None
|
||||
|
||||
# NOTE(robinson) - defer to the page number that's already in the metadata
|
||||
# if it's available
|
||||
if hasattr(element, "metadata"):
|
||||
@ -111,6 +110,26 @@ def _add_element_metadata(
|
||||
return elements
|
||||
|
||||
|
||||
def _remove_element_metadata(
    layout_elements,
) -> List[Element]:
    """Removes document metadata from the document elements.

    Document metadata includes information like the filename, source url, and
    page number. Each layout element is passed through
    ``normalize_layout_element``; the result (a single element, or every
    element of the list when normalization returns a list) has its
    ``metadata`` attribute replaced with a new, empty ``ElementMetadata``.

    Parameters
    ----------
    layout_elements
        Iterable of elements accepted by ``normalize_layout_element``.

    Returns
    -------
    List[Element]
        The normalized elements, flattened into a single list, in input order.
    """
    elements: List[Element] = []
    for layout_element in layout_elements:
        normalized = normalize_layout_element(layout_element)
        # Normalization may yield one element or a list of elements; treat both uniformly.
        group = normalized if isinstance(normalized, list) else [normalized]
        for element in group:
            # Use a distinct empty metadata object per element. Sharing one instance
            # would let an in-place change on one element's metadata show up on all.
            element.metadata = ElementMetadata()
        elements.extend(group)
    return elements
|
||||
|
||||
|
||||
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
|
||||
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
|
||||
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
||||
|
Loading…
x
Reference in New Issue
Block a user