mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
fix: page_number appears in partition_html metadata if include_metadata=False (#658)
* fix: page_number appears in partition_html metadata if include_metadata=False
* Update common.py
* Update CHANGELOG

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
f7cde5539a
commit
c78c5b6adf
@ -11,6 +11,7 @@
|
|||||||
* Adds `.log` as a file extension for TXT files.
|
* Adds `.log` as a file extension for TXT files.
|
||||||
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
|
* Adds functionality to try other common encodings for email (`.eml`) files if an error related to the encoding is raised and the user has not specified an encoding.
|
||||||
* Allow passed encoding to be used in the `replace_mime_encodings`
|
* Allow passed encoding to be used in the `replace_mime_encodings`
|
||||||
|
* Fixes page metadata for `partition_html` when `include_metadata=False`
|
||||||
|
|
||||||
## 0.6.11
|
## 0.6.11
|
||||||
|
|
||||||
|
@ -22,6 +22,14 @@ def test_partition_html_from_filename():
|
|||||||
assert elements[0].metadata.file_directory == directory
|
assert elements[0].metadata.file_directory == directory
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_html_from_filename_metadata_false():
    """With ``include_metadata=False`` no returned element may carry metadata.

    Partitions the example 10-K HTML document by filename and checks that the
    dictionary form of every element's metadata is empty.
    """
    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
    html_path = os.path.join(example_docs, "example-10k.html")
    partitioned = partition_html(filename=html_path, include_metadata=False)
    # An empty dict is falsy, so a single populated field fails the check.
    assert all(not item.metadata.to_dict() for item in partitioned)
|
||||||
|
|
||||||
|
|
||||||
def test_partition_html_with_page_breaks():
|
def test_partition_html_with_page_breaks():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "example-10k.html")
|
||||||
elements = partition_html(filename=filename, include_page_breaks=True)
|
elements = partition_html(filename=filename, include_page_breaks=True)
|
||||||
|
@ -8,7 +8,11 @@ from typing import IO, Callable, List, Optional
|
|||||||
|
|
||||||
from unstructured.documents.elements import Element, PageBreak
|
from unstructured.documents.elements import Element, PageBreak
|
||||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||||
from unstructured.partition.common import _add_element_metadata, exactly_one
|
from unstructured.partition.common import (
|
||||||
|
_add_element_metadata,
|
||||||
|
_remove_element_metadata,
|
||||||
|
exactly_one,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import magic
|
import magic
|
||||||
@ -380,7 +384,9 @@ def add_metadata_with_filetype(filetype: FileType):
|
|||||||
**metadata_kwargs, # type: ignore
|
**metadata_kwargs, # type: ignore
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return elements
|
return _remove_element_metadata(
|
||||||
|
elements,
|
||||||
|
)
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
@ -84,7 +84,6 @@ def _add_element_metadata(
|
|||||||
text_as_html: Optional[str] = layout_element.text_as_html
|
text_as_html: Optional[str] = layout_element.text_as_html
|
||||||
else:
|
else:
|
||||||
text_as_html = None
|
text_as_html = None
|
||||||
|
|
||||||
# NOTE(robinson) - defer to the page number that's already in the metadata
|
# NOTE(robinson) - defer to the page number that's already in the metadata
|
||||||
# if it's available
|
# if it's available
|
||||||
if hasattr(element, "metadata"):
|
if hasattr(element, "metadata"):
|
||||||
@ -111,6 +110,26 @@ def _add_element_metadata(
|
|||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_element_metadata(
    layout_elements,
) -> List[Element]:
    """Removes document metadata from the document elements.

    Document metadata includes information like the filename, source url, and
    page number. Every layout element is first passed through
    ``normalize_layout_element``, which yields either a single element or a
    list of elements; each resulting element has its ``metadata`` attribute
    replaced with an empty ``ElementMetadata``.

    Parameters
    ----------
    layout_elements
        Iterable of layout elements to normalize and strip.

    Returns
    -------
    List[Element]
        The normalized elements, flattened into one list, each carrying its
        own empty metadata object.
    """
    elements: List[Element] = []
    for layout_element in layout_elements:
        normalized = normalize_layout_element(layout_element)
        # Treat the single-element and list results uniformly.
        group = normalized if isinstance(normalized, list) else [normalized]
        for element in group:
            # Give every element its OWN empty metadata object. Sharing one
            # instance across all elements would make a later in-place update
            # of one element's metadata show up on every other element.
            element.metadata = ElementMetadata()
        elements.extend(group)
    return elements
|
||||||
|
|
||||||
|
|
||||||
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
|
def convert_office_doc(input_filename: str, output_directory: str, target_format: str):
|
||||||
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
|
"""Converts a .doc file to a .docx file using the libreoffice CLI."""
|
||||||
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
# NOTE(robinson) - In the future can also include win32com client as a fallback for windows
|
||||||
|
Loading…
x
Reference in New Issue
Block a user