rfctr(part): add new decorator to replace four (#3650)

**Summary**
In preparation for pluggable auto-partitioners, add a new metadata
decorator to replace the four existing ones.

**Additional Context**
"Global" metadata items, those applied to every element by all partitioners, are applied
using a decorator.

Currently there are four decorators where there only needs to be one.
Consolidate those into a single metadata decorator.
One or two additional behaviors of the new decorator will allow us to remove decorators
from delegating partitioners, which is a prerequisite for pluggable auto-partitioners.
Authored by Steve Canny on 2024-09-25 16:15:50 -07:00; committed by GitHub.
parent 44bad216f3 · commit 50d75c47d3
19 changed files with 388 additions and 69 deletions
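
For orientation, here is a minimal sketch of how the consolidated decorator is intended to be applied. The partitioner below is hypothetical; `apply_metadata`, `FileType`, and the element classes are the ones added or referenced in the diff, and the keyword arguments shown are ones the decorator reads from the call.

```python
from typing import Any

from unstructured.documents.elements import Element, NarrativeText, Title
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata


@apply_metadata(file_type=FileType.DOCX)
def partition_fake_docx(**kwargs: Any) -> list[Element]:
    """Hypothetical partitioner; a real one would read and parse the source document."""
    return [Title("Introduction"), NarrativeText("Body text.")]


# One decorator now applies the "global" metadata: hashed (or UUID) element ids, parent_id
# hierarchy, detected languages, and the filetype/filename/url fields.
elements = partition_fake_docx(metadata_filename="a/b/c.docx", unique_element_ids=False)
```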


@@ -1,9 +1,11 @@
-## 0.15.14-dev3
+## 0.15.14-dev4

 ### Enhancements

 ### Features

+* **Add (but do not install) a new post-partitioning decorator to handle metadata added for all file-types, like `.filename`, `.filetype` and `.languages`.** This will be installed in a closely following PR to replace the four currently being used for this purpose.
+
 ### Fixes

 * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK.


@@ -6,15 +6,15 @@ from __future__ import annotations
 import os
 import pathlib
-from typing import Union
 import pytest
+from test_unstructured.unit_utils import LogCaptureFixture
 from unstructured.documents.elements import (
     NarrativeText,
     PageBreak,
 )
-from unstructured.partition.lang import (
+from unstructured.partition.common.lang import (
     _clean_ocr_languages_arg,
     _convert_language_code_to_pytesseract_lang_code,
     apply_lang_metadata,
@@ -61,13 +61,13 @@ def test_prepare_languages_for_tesseract_with_multiple_languages():
     assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ"


-def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
+def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog: LogCaptureFixture):
     languages = ["zzz", "chi"]
     assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
     assert "not a valid standard language code" in caplog.text


-def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
+def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog: LogCaptureFixture):
     languages = ["kbd", "eng"]
     assert prepare_languages_for_tesseract(languages) == "eng"
     assert "not a language supported by Tesseract" in caplog.text
@@ -79,7 +79,7 @@ def test_prepare_languages_for_tesseract_None_languages():
         prepare_languages_for_tesseract(languages)


-def test_prepare_languages_for_tesseract_no_valid_languages(caplog):
+def test_prepare_languages_for_tesseract_no_valid_languages(caplog: LogCaptureFixture):
     languages = [""]
     assert prepare_languages_for_tesseract(languages) == "eng"
     assert "Failed to find any valid standard language code from languages" in caplog.text
@@ -96,11 +96,11 @@ def test_prepare_languages_for_tesseract_no_valid_languages(caplog):
         ("kor", "korean"),
     ],
 )
-def test_tesseract_to_paddle_language_valid_codes(tesseract_lang, expected_lang):
+def test_tesseract_to_paddle_language_valid_codes(tesseract_lang: str, expected_lang: str):
     assert expected_lang == tesseract_to_paddle_language(tesseract_lang)


-def test_tesseract_to_paddle_language_invalid_codes(caplog):
+def test_tesseract_to_paddle_language_invalid_codes(caplog: LogCaptureFixture):
     tesseract_lang = "unsupported_lang"
     assert tesseract_to_paddle_language(tesseract_lang) == "en"
     assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text
@@ -114,7 +114,7 @@ def test_tesseract_to_paddle_language_invalid_codes(caplog):
         ("DEU", "german"),
     ],
 )
-def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang, expected_lang):
+def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang: str, expected_lang: str):
     assert expected_lang == tesseract_to_paddle_language(tesseract_lang)
@@ -139,7 +139,7 @@ def test_detect_languages_gets_multiple_languages():
     assert detect_languages(text) == ["ces", "pol", "slk"]


-def test_detect_languages_warns_for_auto_and_other_input(caplog):
+def test_detect_languages_warns_for_auto_and_other_input(caplog: LogCaptureFixture):
     text = "This is another short sentence."
     languages = ["en", "auto", "rus"]
     assert detect_languages(text, languages) == ["eng"]
@@ -149,10 +149,10 @@ def test_detect_languages_warns_for_auto_and_other_input(caplog):
 def test_detect_languages_raises_TypeError_for_invalid_languages():
     with pytest.raises(TypeError):
         text = "This is a short sentence."
-        detect_languages(text, languages="eng") == ["eng"]
+        detect_languages(text, languages="eng") == ["eng"]  # type: ignore


-def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog):
+def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog: LogCaptureFixture):
     elements = [NarrativeText("Sample text."), PageBreak("")]
     elements = list(
         apply_lang_metadata(
@@ -171,7 +171,7 @@ def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog):
         ("fr", "fra"),
     ],
 )
-def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
+def test_convert_language_code_to_pytesseract_lang_code(lang_in: str, expected_lang: str):
     assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)
@@ -187,7 +187,7 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
         ("deu+spa", "deu+spa"),  # correct input
     ],
 )
-def test_clean_ocr_languages_arg(input_ocr_langs, expected):
+def test_clean_ocr_languages_arg(input_ocr_langs: str, expected: str):
     assert _clean_ocr_languages_arg(input_ocr_langs) == expected
@@ -209,12 +209,15 @@ def test_detect_languages_handles_spelled_out_languages():
     ],
 )
 def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined(
-    languages: Union[list[str], str],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: list[str] | str,
     expected_langs: list[str],
-    caplog,
+    caplog: LogCaptureFixture,
 ):
-    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
+    returned_langs = check_language_args(
+        languages=languages,
+        ocr_languages=ocr_languages,
+    )
     for lang in returned_langs:  # type: ignore
         assert lang in expected_langs
     assert "ocr_languages" in caplog.text
@@ -231,10 +234,10 @@ def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are
     ],
 )
 def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None(
-    languages: Union[list[str], str],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: str,
     expected_langs: list[str],
-    caplog,
+    caplog: LogCaptureFixture,
 ):
     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
     for lang in returned_langs:  # type: ignore
@@ -250,19 +253,15 @@ def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None(
     ],
 )
 def test_check_language_args_returns_None(
-    languages: Union[list[str], str, None],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: None,
 ):
     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
     assert returned_langs is None


-def test_check_language_args_returns_auto(
-    languages=["eng", "spa", "auto"],
-    ocr_languages=None,
-):
-    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
-    assert returned_langs == ["auto"]
+def test_check_language_args_returns_auto():
+    assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"]


 @pytest.mark.parametrize(
@@ -273,8 +272,11 @@ def test_check_language_args_returns_auto(
     ],
 )
 def test_check_language_args_raises_error_when_ocr_languages_contains_auto(
-    languages: Union[list[str], str, None],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: str | list[str],
 ):
     with pytest.raises(ValueError):
-        check_language_args(languages=languages, ocr_languages=ocr_languages)
+        check_language_args(
+            languages=languages,
+            ocr_languages=ocr_languages,
+        )


@@ -5,9 +5,13 @@ from __future__ import annotations
 import datetime as dt
 import os
 import pathlib
+from typing import Any, Callable

+import pytest
+
 from unstructured.documents.elements import (
     CheckBox,
+    Element,
     ElementMetadata,
     FigureCaption,
     Header,
@@ -16,7 +20,9 @@ from unstructured.documents.elements import (
     Text,
     Title,
 )
+from unstructured.file_utils.model import FileType
 from unstructured.partition.common.metadata import (
+    apply_metadata,
     get_last_modified_date,
     set_element_hierarchy,
 )
@@ -119,3 +125,193 @@ def test_set_element_hierarchy_custom_rule_set():
     assert (
         elements[5].metadata.parent_id == elements[4].id
     ), "FigureCaption should be child of Title 2"
+
+
+class Describe_apply_metadata:
+    """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator."""
+
+    # -- unique-ids -------------------------------------------------------
+
+    def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition()
+        elements_2 = partition()
+        # -- SHA1 hash is 32 characters long, no hyphens --
+        assert all(len(e.id) == 32 for e in elements)
+        assert all("-" not in e.id for e in elements)
+        # -- SHA1 hashes are deterministic --
+        assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))

+    def it_assigns_hash_element_ids_when_unique_ids_arg_is_False(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(unique_element_ids=False)
+        elements_2 = partition(unique_element_ids=False)
+        # -- SHA1 hash is 32 characters long, no hyphens --
+        assert all(len(e.id) == 32 for e in elements)
+        assert all("-" not in e.id for e in elements)
+        # -- SHA1 hashes are deterministic --
+        assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))

+    def it_leaves_UUID_element_ids_when_unique_ids_arg_is_True(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(unique_element_ids=True)
+        elements_2 = partition(unique_element_ids=True)
+        # -- UUID is 36 characters long with four hyphens --
+        assert all(len(e.id) == 36 for e in elements)
+        assert all(e.id.count("-") == 4 for e in elements)
+        # -- UUIDs are non-deterministic, different every time --
+        assert all(e.id != e2.id for e, e2 in zip(elements, elements_2))

+    # -- parent-id --------------------------------------------------------

+    def it_computes_and_assigns_parent_id(self, fake_partitioner: Callable[..., list[Element]]):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition()
+        title = elements[0]
+        assert title.metadata.category_depth == 1
+        narr_text = elements[1]
+        assert narr_text.metadata.parent_id == title.id

+    # -- languages --------------------------------------------------------

+    def it_applies_language_metadata(self, fake_partitioner: Callable[..., list[Element]]):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(languages=["auto"], detect_language_per_element=True)
+        assert all(e.metadata.languages == ["eng"] for e in elements)

+    # -- filetype (MIME-type) ---------------------------------------------

+    def it_assigns_the_value_of_a_metadata_file_type_arg_when_there_is_one(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A `metadata_file_type` arg overrides the file-type specified in the decorator.
+
+        This is used for example by a delegating partitioner to preserve the original file-type in
+        the metadata, like EPUB instead of the HTML that partitioner converts the .epub file to.
+        """
+        partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner)
+        elements = partition(metadata_file_type=FileType.ODT)
+        assert all(
+            e.metadata.filetype == "application/vnd.oasis.opendocument.text" for e in elements
+        )

+    def and_it_assigns_the_decorator_file_type_when_the_metadata_file_type_arg_is_omitted(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """The `file_type=...` decorator arg is the "normal" way to specify the file-type.
+
+        This is used for principal (non-delegating) partitioners.
+        """
+        partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner)
+        elements = partition()
+        DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        assert all(e.metadata.filetype == DOCX_MIME_TYPE for e in elements)

+    def and_it_does_not_assign_file_type_metadata_when_both_are_omitted(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A partitioner can elect to assign `.metadata.filetype` for itself.
+
+        This is done in `partition_image()` for example where the same partitioner is used for
+        multiple file-types.
+        """
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition()
+        assert all(e.metadata.filetype == "image/jpeg" for e in elements)

+    # -- filename ---------------------------------------------------------

+    def it_uses_metadata_filename_arg_value_when_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A `metadata_filename` arg overrides all other sources."""
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(metadata_filename="a/b/c.xyz")
+        assert all(e.metadata.filename == "c.xyz" for e in elements)
+        assert all(e.metadata.file_directory == "a/b" for e in elements)

+    def and_it_uses_filename_arg_value_when_metadata_filename_arg_not_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(filename="a/b/c.xyz")
+        assert all(e.metadata.filename == "c.xyz" for e in elements)
+        assert all(e.metadata.file_directory == "a/b" for e in elements)

+    def and_it_does_not_assign_filename_metadata_when_neither_are_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition()
+        assert all(e.metadata.filename == "image.jpeg" for e in elements)
+        assert all(e.metadata.file_directory == "x/y/images" for e in elements)

+    # -- url --------------------------------------------------------------

+    def it_assigns_url_metadata_field_when_url_arg_is_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition(url="https://adobe.com/stock/54321")
+        assert all(e.metadata.url == "https://adobe.com/stock/54321" for e in elements)

+    def and_it_does_not_assign_url_metadata_when_url_arg_is_not_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+        elements = partition()
+        assert all(e.metadata.url == "http://images.com" for e in elements)

+    # -- fixtures --------------------------------------------------------------------------------

+    @pytest.fixture
+    def fake_partitioner(self) -> Callable[..., list[Element]]:
+        def fake_partitioner(**kwargs: Any) -> list[Element]:
+            title = Title("Introduction")
+            title.metadata.category_depth = 1
+            title.metadata.file_directory = "x/y/images"
+            title.metadata.filename = "image.jpeg"
+            title.metadata.filetype = "image/jpeg"
+            title.metadata.url = "http://images.com"
+            narr_text = NarrativeText("To understand bar you must first understand foo.")
+            narr_text.metadata.file_directory = "x/y/images"
+            narr_text.metadata.filename = "image.jpeg"
+            narr_text.metadata.filetype = "image/jpeg"
+            narr_text.metadata.url = "http://images.com"
+            return [title, narr_text]

+        return fake_partitioner


@@ -1 +1 @@
-__version__ = "0.15.14-dev3"  # pragma: no cover
+__version__ = "0.15.14-dev4"  # pragma: no cover


@@ -14,7 +14,7 @@ from unstructured.file_utils.filetype import detect_filetype, is_json_processabl
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.partition.common.common import exactly_one
-from unstructured.partition.lang import check_language_args
+from unstructured.partition.common.lang import check_language_args
 from unstructured.partition.utils.constants import PartitionStrategy
 from unstructured.utils import dependency_exists


@@ -3,8 +3,12 @@ from __future__ import annotations
 import re
 from typing import Iterable, Iterator, Optional

-import iso639
-from langdetect import DetectorFactory, detect_langs, lang_detect_exception
+import iso639  # pyright: ignore[reportMissingTypeStubs]
+from langdetect import (  # pyright: ignore[reportMissingTypeStubs]
+    DetectorFactory,
+    detect_langs,  # pyright: ignore[reportUnknownVariableType]
+    lang_detect_exception,
+)

 from unstructured.documents.elements import Element
 from unstructured.logger import logger
@@ -208,12 +212,13 @@ def prepare_languages_for_tesseract(languages: Optional[list[str]] = ["eng"]) ->
     """
     if languages is None:
         raise ValueError("`languages` can not be `None`")
-    converted_languages = list(
-        filter(
-            lambda x: x is not None and x != "",
-            [_convert_language_code_to_pytesseract_lang_code(lang) for lang in languages],
-        ),
-    )
+    converted_languages = [
+        lang_code
+        for lang_code in (
+            _convert_language_code_to_pytesseract_lang_code(lang) for lang in languages
+        )
+        if lang_code
+    ]
     # Remove duplicates from the list but keep the original order
     converted_languages = list(dict.fromkeys(converted_languages))
     if len(converted_languages) == 0:
@@ -245,13 +250,17 @@ def tesseract_to_paddle_language(tesseract_language: str) -> str:
     return lang


-def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> Optional[list[str]]:
-    """Handle users defining both `ocr_languages` and `languages`, giving preference to `languages`
-    and converting `ocr_languages` if needed, but defaulting to `None.
+def check_language_args(
+    languages: list[str], ocr_languages: str | list[str] | None
+) -> list[str] | None:
+    """Handle users defining both `ocr_languages` and `languages`.
+
+    Give preference to `languages` and convert `ocr_languages` if needed, but default to `None`.

     `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`.
     `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection
-    which is not supported by `partition_image` or `partition_pdf`."""
+    which is not supported by `partition_image` or `partition_pdf`.
+    """
     # --- Clean and update defaults
     if ocr_languages:
         ocr_languages = _clean_ocr_languages_arg(ocr_languages)
@@ -259,6 +268,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O
             "The ocr_languages kwarg will be deprecated in a future version of unstructured. "
             "Please use languages instead.",
         )
+    assert ocr_languages is None or isinstance(ocr_languages, str)

     if ocr_languages and "auto" in ocr_languages:
         raise ValueError(
@@ -268,7 +278,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O
             " Language detection is not currently supported in pdfs or images."
         )

-    if not isinstance(languages, list):
+    if not isinstance(languages, list):  # pyright: ignore[reportUnnecessaryIsInstance]
         raise TypeError(
             "The language parameter must be a list of language codes as strings, ex. ['eng']",
         )
@@ -354,7 +364,7 @@ def _convert_language_code_to_pytesseract_lang_code(lang: str) -> str:
 def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]:
     try:
-        return iso639.Language.match(lang.lower())
+        return iso639.Language.match(lang.lower())  # pyright: ignore[reportUnknownMemberType]
     except iso639.LanguageNotFoundError:
         logger.warning(f"{lang} is not a valid standard language code.")
         return None
@@ -431,10 +441,10 @@ def detect_languages(
             # machine translation
             # TODO(shreya): decide how to maintain nonstandard chinese script information
             for langobj in langdetect_result:
-                if str(langobj.lang).startswith("zh"):
+                if str(langobj.lang).startswith("zh"):  # pyright: ignore
                     langdetect_langs.append("zho")
                 else:
-                    language = _get_iso639_language_object(langobj.lang[:3])
+                    language = _get_iso639_language_object(langobj.lang[:3])  # pyright: ignore
                     if language:
                         langdetect_langs.append(language.part3)
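
Two behaviors of the relocated `check_language_args` helper, as pinned down by the updated tests earlier in this diff. The module path `unstructured.partition.common.lang` is the new one introduced here; the specific argument values below are illustrative stand-ins for the parametrized test cases.

```python
import pytest

from unstructured.partition.common.lang import check_language_args

# "auto" anywhere in `languages` collapses the result to ["auto"] (request language detection).
assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"]

# "auto" is rejected for `ocr_languages`; detection is not supported for PDFs or images.
with pytest.raises(ValueError):
    check_language_args(languages=["eng"], ocr_languages="auto")
```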


@@ -3,13 +3,21 @@
 from __future__ import annotations

 import datetime as dt
+import functools
 import os
-from typing import Optional, Sequence
+from typing import Any, Callable, Sequence

-from unstructured.documents.elements import Element
+from typing_extensions import ParamSpec
+
+from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids
+from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
+from unstructured.utils import get_call_args_applying_defaults
+
+_P = ParamSpec("_P")


-def get_last_modified_date(filename: str) -> Optional[str]:
+def get_last_modified_date(filename: str) -> str | None:
     """Modification time of file at path `filename`, if it exists.

     Returns `None` when `filename` is not a path to a file on the local filesystem.
@@ -54,9 +62,9 @@ HIERARCHY_RULE_SET = {
 def set_element_hierarchy(
     elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
 ) -> list[Element]:
-    """Sets the parent_id for each element in the list of elements
-    based on the element's category, depth and a ruleset
+    """Sets `.metadata.parent_id` for each element it applies to.
+
+    `parent_id` assignment is based on the element's category, depth and a ruleset.
     """
     stack: list[Element] = []
     for element in elements:
@@ -97,3 +105,104 @@ def set_element_hierarchy(
         stack.append(element)

     return list(elements)
+
+
+# ================================================================================================
+# METADATA POST-PARTITIONING PROCESSING DECORATOR
+# ================================================================================================
+
+
+def apply_metadata(
+    file_type: FileType | None = None,
+) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
+    """Post-process element-metadata for this document.
+
+    This decorator adds a post-processing step to a partitioner, primarily to apply metadata that
+    is common to all partitioners. It assumes the following responsibilities:
+
+      - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids`
+        argument is False.
+      - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth`
+        etc. added by partitioner.
+      - Language metadata. Computes and applies `language` metadata based on a language detection
+        model.
+      - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that
+        applies is used:
+          - `metadata_file_type` argument is present in call, use that.
+          - `file_type` decorator argument is populated, use that.
+          - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype`
+            (assume the partitioner will do that for itself, like `partition_image()`.
+      - Replace `filename` with `metadata_filename` when present.
+      - Apply `url` metadata when present.
+    """
+
+    def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
+        """The decorator function itself.
+
+        This function is returned by the `apply_metadata()` function and is the actual decorator.
+        Think of `apply_metadata()` as a factory function that configures this decorator, in
+        particular by setting its `file_type` value.
+        """

+        @functools.wraps(func)
+        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
+            elements = func(*args, **kwargs)
+            call_args = get_call_args_applying_defaults(func, *args, **kwargs)

+            # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the
+            # -- elements themselves, not the metadata.
+            unique_element_ids: bool = call_args.get("unique_element_ids", False)
+            if unique_element_ids is False:
+                elements = assign_and_map_hash_ids(elements)

+            # -- `parent_id` - process category-level etc. to assign parent-id --
+            elements = set_element_hierarchy(elements)

+            # -- `language` - auto-detect language (e.g. eng, spa) --
+            languages = call_args.get("languages")
+            detect_language_per_element = call_args.get("detect_language_per_element", False)
+            elements = list(
+                apply_lang_metadata(
+                    elements=elements,
+                    languages=languages,
+                    detect_language_per_element=detect_language_per_element,
+                )
+            )

+            # == apply filetype, filename, and url metadata =========================
+            metadata_kwargs: dict[str, Any] = {}

+            # -- `filetype` (MIME-type) metadata --
+            metadata_file_type = call_args.get("metadata_file_type") or file_type
+            if metadata_file_type is not None:
+                metadata_kwargs["filetype"] = metadata_file_type.mime_type

+            # -- `filename` metadata - override with metadata_filename when it's present --
+            filename = call_args.get("metadata_filename") or call_args.get("filename")
+            if filename:
+                metadata_kwargs["filename"] = filename

+            # -- `url` metadata - record url when present --
+            url = call_args.get("url")
+            if url:
+                metadata_kwargs["url"] = url

+            # -- update element.metadata in single pass --
+            for element in elements:
+                # NOTE(robinson) - Attached files have already run through this logic in their own
+                # partitioning function
+                if element.metadata.attached_to_filename:
+                    continue
+                element.metadata.update(ElementMetadata(**metadata_kwargs))

+            return elements

+        return wrapper

+    return decorator
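
One behavior worth calling out from the decorator above: a delegating partitioner can pass `metadata_file_type` so elements report the original file-type rather than the intermediate format it converts to (the EPUB-to-HTML case mentioned in the tests). A minimal sketch, assuming the hypothetical `partition_html_sketch` below stands in for a decorated principal partitioner:

```python
from typing import Any

from unstructured.documents.elements import Element, NarrativeText
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata


@apply_metadata(file_type=FileType.HTML)
def partition_html_sketch(**kwargs: Any) -> list[Element]:
    """Hypothetical principal partitioner that yields elements parsed from HTML."""
    return [NarrativeText("Content converted from the original document.")]


# A delegating EPUB partitioner would convert .epub to HTML, call the HTML partitioner, and
# pass `metadata_file_type=FileType.EPUB` so the elements keep the EPUB MIME-type.
elements = partition_html_sketch(metadata_file_type=FileType.EPUB)
assert all(e.metadata.filetype == FileType.EPUB.mime_type for e in elements)
```

This is one of the behaviors the commit message refers to: it lets a delegating partitioner rely on the principal partitioner's decorator instead of carrying its own.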


@@ -16,8 +16,8 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.utils import is_temp_file_path, lazyproperty

 DETECTION_ORIGIN: str = "csv"


@@ -46,8 +46,8 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,


@@ -47,9 +47,9 @@ from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.common.common import convert_to_bytes, exactly_one
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html import partition_html
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import partition_text

 VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]
@@ -101,7 +101,7 @@ def partition_email_header(msg: EmailMessage) -> list[Element]:
         for addr in header.addresses:
             elements.append(
                 element_type(
-                    name=addr.display_name or addr.username,
+                    name=addr.display_name or addr.username,  # type: ignore
                     text=addr.addr_spec,  # type: ignore
                 )
             )


@@ -14,9 +14,9 @@ from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html.parser import Flow, html_parser
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.utils import is_temp_file_path, lazyproperty


@@ -6,7 +6,7 @@ from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata
 from unstructured.partition.common.common import exactly_one
-from unstructured.partition.lang import check_language_args
+from unstructured.partition.common.lang import check_language_args
 from unstructured.partition.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy


@@ -14,9 +14,9 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html import partition_html
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import partition_text
 from unstructured.utils import is_temp_file_path, lazyproperty


@@ -48,12 +48,12 @@ from unstructured.partition.common.common import (
     ocr_data_to_elements,
     spooled_to_bytes_io_if_needed,
 )
-from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import (
+from unstructured.partition.common.lang import (
     check_language_args,
     prepare_languages_for_tesseract,
     tesseract_to_paddle_language,
 )
+from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.pdf_image.analysis.layout_dump import (
     ExtractedLayoutDumper,
     FinalLayoutDumper,


@@ -37,8 +37,8 @@ from unstructured.documents.elements import (
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.partition.common.common import convert_ms_office_table_to_text
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_email_address,
     is_possible_narrative_text,


@@ -30,8 +30,8 @@ from unstructured.file_utils.model import FileType
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
 from unstructured.nlp.tokenize import sent_tokenize
 from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_email_address,


@@ -18,8 +18,8 @@ from unstructured.partition.common.common import (
     exactly_one,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata

 DETECTION_ORIGIN: str = "tsv"


@@ -26,8 +26,8 @@ from unstructured.documents.elements import (
 )
 from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
     is_possible_narrative_text,


@@ -20,8 +20,8 @@ from unstructured.partition.common.common import (
     exactly_one,
     spooled_to_bytes_io_if_needed,
 )
+from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text import element_from_text

 DETECTION_ORIGIN: str = "xml"