Mirror of https://github.com/Unstructured-IO/unstructured.git
Synced 2025-09-09 08:39:57 +00:00
rfctr(part): add new decorator to replace four (#3650)
**Summary**
In preparation for pluggable auto-partitioners, add a new metadata decorator to replace the four existing ones.

**Additional Context**
"Global" metadata items, those applied to all elements by all partitioners, are applied using a decorator. Currently there are four decorators where only one is needed. Consolidate those into a single metadata decorator. One or two additional behaviors of the new decorator will allow us to remove decorators from delegating partitioners, which is a prerequisite for pluggable auto-partitioners.
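For orientation, here is a minimal sketch of how the new decorator is meant to wrap a partitioner once it is installed, based on the `apply_metadata()` implementation and tests added in this diff. `partition_fake` is a hypothetical stand-in, not a partitioner touched by this PR:

```python
from __future__ import annotations

from unstructured.documents.elements import Element, NarrativeText, Title
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata


# Hypothetical partitioner, used only to illustrate the decorator's post-processing step.
@apply_metadata(file_type=FileType.DOCX)
def partition_fake(filename: str | None = None, **kwargs) -> list[Element]:
    # A real partitioner would parse the file; two bare elements are enough here.
    title = Title("Introduction")
    title.metadata.category_depth = 1
    return [title, NarrativeText("Body text.")]


elements = partition_fake(filename="a/b/c.docx")
# The decorator then assigns SHA1 element ids (unless unique_element_ids=True), parent_id,
# detected languages, the DOCX MIME-type, and filename/file_directory metadata.
```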
This commit is contained in:
parent 44bad216f3
commit 50d75c47d3
@@ -1,9 +1,11 @@
-## 0.15.14-dev3
+## 0.15.14-dev4

### Enhancements

### Features

+* **Add (but do not install) a new post-partitioning decorator to handle metadata added for all file-types, like `.filename`, `.filetype` and `.languages`.** This will be installed in a closely following PR to replace the four currently being used for this purpose.
+
### Fixes

* **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK.
@@ -6,15 +6,15 @@ from __future__ import annotations

import os
import pathlib
-from typing import Union

import pytest

+from test_unstructured.unit_utils import LogCaptureFixture
from unstructured.documents.elements import (
    NarrativeText,
    PageBreak,
)
-from unstructured.partition.lang import (
+from unstructured.partition.common.lang import (
    _clean_ocr_languages_arg,
    _convert_language_code_to_pytesseract_lang_code,
    apply_lang_metadata,
@@ -61,13 +61,13 @@ def test_prepare_languages_for_tesseract_with_multiple_languages():
    assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ"


-def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog):
+def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog: LogCaptureFixture):
    languages = ["zzz", "chi"]
    assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert"
    assert "not a valid standard language code" in caplog.text


-def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog):
+def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog: LogCaptureFixture):
    languages = ["kbd", "eng"]
    assert prepare_languages_for_tesseract(languages) == "eng"
    assert "not a language supported by Tesseract" in caplog.text
@@ -79,7 +79,7 @@ def test_prepare_languages_for_tesseract_None_languages():
        prepare_languages_for_tesseract(languages)


-def test_prepare_languages_for_tesseract_no_valid_languages(caplog):
+def test_prepare_languages_for_tesseract_no_valid_languages(caplog: LogCaptureFixture):
    languages = [""]
    assert prepare_languages_for_tesseract(languages) == "eng"
    assert "Failed to find any valid standard language code from languages" in caplog.text
@@ -96,11 +96,11 @@ def test_prepare_languages_for_tesseract_no_valid_languages(caplog):
        ("kor", "korean"),
    ],
)
-def test_tesseract_to_paddle_language_valid_codes(tesseract_lang, expected_lang):
+def test_tesseract_to_paddle_language_valid_codes(tesseract_lang: str, expected_lang: str):
    assert expected_lang == tesseract_to_paddle_language(tesseract_lang)


-def test_tesseract_to_paddle_language_invalid_codes(caplog):
+def test_tesseract_to_paddle_language_invalid_codes(caplog: LogCaptureFixture):
    tesseract_lang = "unsupported_lang"
    assert tesseract_to_paddle_language(tesseract_lang) == "en"
    assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text
@@ -114,7 +114,7 @@ def test_tesseract_to_paddle_language_invalid_codes(caplog):
        ("DEU", "german"),
    ],
)
-def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang, expected_lang):
+def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang: str, expected_lang: str):
    assert expected_lang == tesseract_to_paddle_language(tesseract_lang)


@@ -139,7 +139,7 @@ def test_detect_languages_gets_multiple_languages():
    assert detect_languages(text) == ["ces", "pol", "slk"]


-def test_detect_languages_warns_for_auto_and_other_input(caplog):
+def test_detect_languages_warns_for_auto_and_other_input(caplog: LogCaptureFixture):
    text = "This is another short sentence."
    languages = ["en", "auto", "rus"]
    assert detect_languages(text, languages) == ["eng"]
@@ -149,10 +149,10 @@ def test_detect_languages_warns_for_auto_and_other_input(caplog):
def test_detect_languages_raises_TypeError_for_invalid_languages():
    with pytest.raises(TypeError):
        text = "This is a short sentence."
-        detect_languages(text, languages="eng") == ["eng"]
+        detect_languages(text, languages="eng") == ["eng"]  # type: ignore


-def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog):
+def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog: LogCaptureFixture):
    elements = [NarrativeText("Sample text."), PageBreak("")]
    elements = list(
        apply_lang_metadata(
@@ -171,7 +171,7 @@ def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog):
        ("fr", "fra"),
    ],
)
-def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
+def test_convert_language_code_to_pytesseract_lang_code(lang_in: str, expected_lang: str):
    assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in)


@@ -187,7 +187,7 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang):
        ("deu+spa", "deu+spa"),  # correct input
    ],
)
-def test_clean_ocr_languages_arg(input_ocr_langs, expected):
+def test_clean_ocr_languages_arg(input_ocr_langs: str, expected: str):
    assert _clean_ocr_languages_arg(input_ocr_langs) == expected

@@ -209,12 +209,15 @@ def test_detect_languages_handles_spelled_out_languages():
    ],
)
def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined(
-    languages: Union[list[str], str],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: list[str] | str,
    expected_langs: list[str],
-    caplog,
+    caplog: LogCaptureFixture,
):
-    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
+    returned_langs = check_language_args(
+        languages=languages,
+        ocr_languages=ocr_languages,
+    )
    for lang in returned_langs:  # type: ignore
        assert lang in expected_langs
    assert "ocr_languages" in caplog.text
@@ -231,10 +234,10 @@ def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are
    ],
)
def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None(
-    languages: Union[list[str], str],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: str,
    expected_langs: list[str],
-    caplog,
+    caplog: LogCaptureFixture,
):
    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
    for lang in returned_langs:  # type: ignore
@@ -250,19 +253,15 @@ def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None(
    ],
)
def test_check_language_args_returns_None(
-    languages: Union[list[str], str, None],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: None,
):
    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
    assert returned_langs is None


-def test_check_language_args_returns_auto(
-    languages=["eng", "spa", "auto"],
-    ocr_languages=None,
-):
-    returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages)
-    assert returned_langs == ["auto"]
+def test_check_language_args_returns_auto():
+    assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"]


@pytest.mark.parametrize(
@@ -273,8 +272,11 @@ def test_check_language_args_returns_auto(
    ],
)
def test_check_language_args_raises_error_when_ocr_languages_contains_auto(
-    languages: Union[list[str], str, None],
-    ocr_languages: Union[list[str], str, None],
+    languages: list[str],
+    ocr_languages: str | list[str],
):
    with pytest.raises(ValueError):
-        check_language_args(languages=languages, ocr_languages=ocr_languages)
+        check_language_args(
+            languages=languages,
+            ocr_languages=ocr_languages,
+        )
@@ -5,9 +5,13 @@ from __future__ import annotations
import datetime as dt
import os
import pathlib
+from typing import Any, Callable

import pytest

from unstructured.documents.elements import (
    CheckBox,
    Element,
    ElementMetadata,
    FigureCaption,
    Header,
@@ -16,7 +20,9 @@ from unstructured.documents.elements import (
    Text,
    Title,
)
+from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import (
+    apply_metadata,
    get_last_modified_date,
    set_element_hierarchy,
)
@@ -119,3 +125,193 @@ def test_set_element_hierarchy_custom_rule_set():
    assert (
        elements[5].metadata.parent_id == elements[4].id
    ), "FigureCaption should be child of Title 2"
+
+
+class Describe_apply_metadata:
+    """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator."""
+
+    # -- unique-ids -------------------------------------------------------
+
+    def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition()
+        elements_2 = partition()
+
+        # -- SHA1 hash is 32 characters long, no hyphens --
+        assert all(len(e.id) == 32 for e in elements)
+        assert all("-" not in e.id for e in elements)
+        # -- SHA1 hashes are deterministic --
+        assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))
+
+    def it_assigns_hash_element_ids_when_unique_ids_arg_is_False(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(unique_element_ids=False)
+        elements_2 = partition(unique_element_ids=False)
+
+        # -- SHA1 hash is 32 characters long, no hyphens --
+        assert all(len(e.id) == 32 for e in elements)
+        assert all("-" not in e.id for e in elements)
+        # -- SHA1 hashes are deterministic --
+        assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))
+
+    def it_leaves_UUID_element_ids_when_unique_ids_arg_is_True(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(unique_element_ids=True)
+        elements_2 = partition(unique_element_ids=True)
+
+        # -- UUID is 36 characters long with four hyphens --
+        assert all(len(e.id) == 36 for e in elements)
+        assert all(e.id.count("-") == 4 for e in elements)
+        # -- UUIDs are non-deterministic, different every time --
+        assert all(e.id != e2.id for e, e2 in zip(elements, elements_2))
+
+    # -- parent-id --------------------------------------------------------
+
+    def it_computes_and_assigns_parent_id(self, fake_partitioner: Callable[..., list[Element]]):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition()
+
+        title = elements[0]
+        assert title.metadata.category_depth == 1
+        narr_text = elements[1]
+        assert narr_text.metadata.parent_id == title.id
+
+    # -- languages --------------------------------------------------------
+
+    def it_applies_language_metadata(self, fake_partitioner: Callable[..., list[Element]]):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(languages=["auto"], detect_language_per_element=True)
+
+        assert all(e.metadata.languages == ["eng"] for e in elements)
+
+    # -- filetype (MIME-type) ---------------------------------------------
+
+    def it_assigns_the_value_of_a_metadata_file_type_arg_when_there_is_one(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A `metadata_file_type` arg overrides the file-type specified in the decorator.
+
+        This is used for example by a delegating partitioner to preserve the original file-type in
+        the metadata, like EPUB instead of the HTML that partitioner converts the .epub file to.
+        """
+        partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner)
+
+        elements = partition(metadata_file_type=FileType.ODT)
+
+        assert all(
+            e.metadata.filetype == "application/vnd.oasis.opendocument.text" for e in elements
+        )
+
+    def and_it_assigns_the_decorator_file_type_when_the_metadata_file_type_arg_is_omitted(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """The `file_type=...` decorator arg is the "normal" way to specify the file-type.
+
+        This is used for principal (non-delegating) partitioners.
+        """
+        partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner)
+
+        elements = partition()
+
+        DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        assert all(e.metadata.filetype == DOCX_MIME_TYPE for e in elements)
+
+    def and_it_does_not_assign_file_type_metadata_when_both_are_omitted(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A partitioner can elect to assign `.metadata.filetype` for itself.
+
+        This is done in `partition_image()` for example where the same partitioner is used for
+        multiple file-types.
+        """
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition()
+
+        assert all(e.metadata.filetype == "image/jpeg" for e in elements)
+
+    # -- filename ---------------------------------------------------------
+
+    def it_uses_metadata_filename_arg_value_when_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        """A `metadata_filename` arg overrides all other sources."""
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(metadata_filename="a/b/c.xyz")
+
+        assert all(e.metadata.filename == "c.xyz" for e in elements)
+        assert all(e.metadata.file_directory == "a/b" for e in elements)
+
+    def and_it_uses_filename_arg_value_when_metadata_filename_arg_not_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(filename="a/b/c.xyz")
+
+        assert all(e.metadata.filename == "c.xyz" for e in elements)
+        assert all(e.metadata.file_directory == "a/b" for e in elements)
+
+    def and_it_does_not_assign_filename_metadata_when_neither_are_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition()
+
+        assert all(e.metadata.filename == "image.jpeg" for e in elements)
+        assert all(e.metadata.file_directory == "x/y/images" for e in elements)
+
+    # -- url --------------------------------------------------------------
+
+    def it_assigns_url_metadata_field_when_url_arg_is_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition(url="https://adobe.com/stock/54321")
+
+        assert all(e.metadata.url == "https://adobe.com/stock/54321" for e in elements)
+
+    def and_it_does_not_assign_url_metadata_when_url_arg_is_not_present(
+        self, fake_partitioner: Callable[..., list[Element]]
+    ):
+        partition = apply_metadata()(fake_partitioner)
+
+        elements = partition()
+
+        assert all(e.metadata.url == "http://images.com" for e in elements)
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture
+    def fake_partitioner(self) -> Callable[..., list[Element]]:
+        def fake_partitioner(**kwargs: Any) -> list[Element]:
+            title = Title("Introduction")
+            title.metadata.category_depth = 1
+            title.metadata.file_directory = "x/y/images"
+            title.metadata.filename = "image.jpeg"
+            title.metadata.filetype = "image/jpeg"
+            title.metadata.url = "http://images.com"
+
+            narr_text = NarrativeText("To understand bar you must first understand foo.")
+            narr_text.metadata.file_directory = "x/y/images"
+            narr_text.metadata.filename = "image.jpeg"
+            narr_text.metadata.filetype = "image/jpeg"
+            narr_text.metadata.url = "http://images.com"
+
+            return [title, narr_text]
+
+        return fake_partitioner
@@ -1 +1 @@
-__version__ = "0.15.14-dev3"  # pragma: no cover
+__version__ = "0.15.14-dev4"  # pragma: no cover
@@ -14,7 +14,7 @@ from unstructured.file_utils.filetype import detect_filetype, is_json_processabl
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common.common import exactly_one
-from unstructured.partition.lang import check_language_args
+from unstructured.partition.common.lang import check_language_args
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import dependency_exists

@@ -3,8 +3,12 @@ from __future__ import annotations
import re
from typing import Iterable, Iterator, Optional

-import iso639
-from langdetect import DetectorFactory, detect_langs, lang_detect_exception
+import iso639  # pyright: ignore[reportMissingTypeStubs]
+from langdetect import (  # pyright: ignore[reportMissingTypeStubs]
+    DetectorFactory,
+    detect_langs,  # pyright: ignore[reportUnknownVariableType]
+    lang_detect_exception,
+)

from unstructured.documents.elements import Element
from unstructured.logger import logger
@@ -208,12 +212,13 @@ def prepare_languages_for_tesseract(languages: Optional[list[str]] = ["eng"]) ->
    """
    if languages is None:
        raise ValueError("`languages` can not be `None`")
-    converted_languages = list(
-        filter(
-            lambda x: x is not None and x != "",
-            [_convert_language_code_to_pytesseract_lang_code(lang) for lang in languages],
-        ),
-    )
+    converted_languages = [
+        lang_code
+        for lang_code in (
+            _convert_language_code_to_pytesseract_lang_code(lang) for lang in languages
+        )
+        if lang_code
+    ]
    # Remove duplicates from the list but keep the original order
    converted_languages = list(dict.fromkeys(converted_languages))
    if len(converted_languages) == 0:
@@ -245,13 +250,17 @@ def tesseract_to_paddle_language(tesseract_language: str) -> str:
        return lang


-def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> Optional[list[str]]:
-    """Handle users defining both `ocr_languages` and `languages`, giving preference to `languages`
-    and converting `ocr_languages` if needed, but defaulting to `None.
+def check_language_args(
+    languages: list[str], ocr_languages: str | list[str] | None
+) -> list[str] | None:
+    """Handle users defining both `ocr_languages` and `languages`.
+
+    Give preference to `languages` and convert `ocr_languages` if needed, but default to `None`.

    `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`.
    `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection
-    which is not supported by `partition_image` or `partition_pdf`."""
+    which is not supported by `partition_image` or `partition_pdf`.
+    """
    # --- Clean and update defaults
    if ocr_languages:
        ocr_languages = _clean_ocr_languages_arg(ocr_languages)
@@ -259,6 +268,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O
            "The ocr_languages kwarg will be deprecated in a future version of unstructured. "
            "Please use languages instead.",
        )
+        assert ocr_languages is None or isinstance(ocr_languages, str)

    if ocr_languages and "auto" in ocr_languages:
        raise ValueError(
@@ -268,7 +278,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O
            " Language detection is not currently supported in pdfs or images."
        )

-    if not isinstance(languages, list):
+    if not isinstance(languages, list):  # pyright: ignore[reportUnnecessaryIsInstance]
        raise TypeError(
            "The language parameter must be a list of language codes as strings, ex. ['eng']",
        )
@@ -354,7 +364,7 @@ def _convert_language_code_to_pytesseract_lang_code(lang: str) -> str:

def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]:
    try:
-        return iso639.Language.match(lang.lower())
+        return iso639.Language.match(lang.lower())  # pyright: ignore[reportUnknownMemberType]
    except iso639.LanguageNotFoundError:
        logger.warning(f"{lang} is not a valid standard language code.")
        return None
@@ -431,10 +441,10 @@ def detect_languages(
        # machine translation
        # TODO(shreya): decide how to maintain nonstandard chinese script information
        for langobj in langdetect_result:
-            if str(langobj.lang).startswith("zh"):
+            if str(langobj.lang).startswith("zh"):  # pyright: ignore
                langdetect_langs.append("zho")
            else:
-                language = _get_iso639_language_object(langobj.lang[:3])
+                language = _get_iso639_language_object(langobj.lang[:3])  # pyright: ignore
                if language:
                    langdetect_langs.append(language.part3)

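As a quick illustration of the precedence described in the revised `check_language_args()` docstring (this snippet is not part of the diff; the second assertion mirrors the test shown earlier):

```python
from unstructured.partition.common.lang import check_language_args

# `languages` wins when both are supplied; a deprecation warning mentions `ocr_languages`.
check_language_args(languages=["eng"], ocr_languages="spa")

# "auto" short-circuits the other entries so downstream language detection takes over.
assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"]
```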
@@ -3,13 +3,21 @@
from __future__ import annotations

import datetime as dt
+import functools
import os
-from typing import Optional, Sequence
+from typing import Any, Callable, Sequence

-from unstructured.documents.elements import Element
+from typing_extensions import ParamSpec
+
+from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids
+from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
+from unstructured.utils import get_call_args_applying_defaults
+
+_P = ParamSpec("_P")


-def get_last_modified_date(filename: str) -> Optional[str]:
+def get_last_modified_date(filename: str) -> str | None:
    """Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem.
@@ -54,9 +62,9 @@ HIERARCHY_RULE_SET = {
def set_element_hierarchy(
    elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET
) -> list[Element]:
-    """Sets the parent_id for each element in the list of elements
-    based on the element's category, depth and a ruleset
+    """Sets `.metadata.parent_id` for each element it applies to.
+
+    `parent_id` assignment is based on the element's category, depth and a ruleset.
    """
    stack: list[Element] = []
    for element in elements:
@@ -97,3 +105,104 @@ def set_element_hierarchy(
        stack.append(element)

    return list(elements)
+
+
+# ================================================================================================
+# METADATA POST-PARTITIONING PROCESSING DECORATOR
+# ================================================================================================
+
+
+def apply_metadata(
+    file_type: FileType | None = None,
+) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
+    """Post-process element-metadata for this document.
+
+    This decorator adds a post-processing step to a partitioner, primarily to apply metadata that
+    is common to all partitioners. It assumes the following responsibilities:
+
+    - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids`
+      argument is False.
+
+    - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth`
+      etc. added by partitioner.
+
+    - Language metadata. Computes and applies `language` metadata based on a language detection
+      model.
+
+    - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that
+      applies is used:
+
+        - `metadata_file_type` argument is present in call, use that.
+        - `file_type` decorator argument is populated, use that.
+        - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype`
+          (assume the partitioner will do that for itself, like `partition_image()`.
+
+    - Replace `filename` with `metadata_filename` when present.
+
+    - Apply `url` metadata when present.
+    """

+    def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
+        """The decorator function itself.
+
+        This function is returned by the `apply_metadata()` function and is the actual decorator.
+        Think of `apply_metadata()` as a factory function that configures this decorator, in
+        particular by setting its `file_type` value.
+        """
+
+        @functools.wraps(func)
+        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
+            elements = func(*args, **kwargs)
+            call_args = get_call_args_applying_defaults(func, *args, **kwargs)
+
+            # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the
+            # -- elements themselves, not the metadata.
+            unique_element_ids: bool = call_args.get("unique_element_ids", False)
+            if unique_element_ids is False:
+                elements = assign_and_map_hash_ids(elements)
+
+            # -- `parent_id` - process category-level etc. to assign parent-id --
+            elements = set_element_hierarchy(elements)
+
+            # -- `language` - auto-detect language (e.g. eng, spa) --
+            languages = call_args.get("languages")
+            detect_language_per_element = call_args.get("detect_language_per_element", False)
+            elements = list(
+                apply_lang_metadata(
+                    elements=elements,
+                    languages=languages,
+                    detect_language_per_element=detect_language_per_element,
+                )
+            )
+
+            # == apply filetype, filename, and url metadata =========================
+            metadata_kwargs: dict[str, Any] = {}
+
+            # -- `filetype` (MIME-type) metadata --
+            metadata_file_type = call_args.get("metadata_file_type") or file_type
+            if metadata_file_type is not None:
+                metadata_kwargs["filetype"] = metadata_file_type.mime_type
+
+            # -- `filename` metadata - override with metadata_filename when it's present --
+            filename = call_args.get("metadata_filename") or call_args.get("filename")
+            if filename:
+                metadata_kwargs["filename"] = filename
+
+            # -- `url` metadata - record url when present --
+            url = call_args.get("url")
+            if url:
+                metadata_kwargs["url"] = url
+
+            # -- update element.metadata in single pass --
+            for element in elements:
+                # NOTE(robinson) - Attached files have already run through this logic in their own
+                # partitioning function
+                if element.metadata.attached_to_filename:
+                    continue
+                element.metadata.update(ElementMetadata(**metadata_kwargs))
+
+            return elements
+
+        return wrapper
+
+    return decorator
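To make the `metadata_file_type` override concrete, here is a hedged sketch of a delegating partitioner in the spirit of the docstring above. The names `partition_fake_html` and `partition_fake_epub` are illustrative only; this PR adds the decorator but does not install it on any real partitioner:

```python
from unstructured.documents.elements import Element, NarrativeText
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata


@apply_metadata(file_type=FileType.HTML)
def partition_fake_html(text: str = "", **kwargs) -> list[Element]:
    # Stand-in for an HTML partitioner; a real one would parse the markup.
    return [NarrativeText(text)]


def partition_fake_epub(filename: str, **kwargs) -> list[Element]:
    # A delegating partitioner converts the .epub to HTML and forwards the call, passing
    # metadata_file_type so elements report EPUB rather than HTML in `.metadata.filetype`.
    html_text = f"<p>converted from {filename}</p>"  # placeholder for a real conversion
    return partition_fake_html(text=html_text, metadata_file_type=FileType.EPUB, **kwargs)


elements = partition_fake_epub("book.epub")
assert all(e.metadata.filetype == FileType.EPUB.mime_type for e in elements)
```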
@@ -16,8 +16,8 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty

DETECTION_ORIGIN: str = "csv"
@@ -46,8 +46,8 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
@@ -47,9 +47,9 @@ from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.common.common import convert_to_bytes, exactly_one
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import partition_text

VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]
@@ -101,7 +101,7 @@ def partition_email_header(msg: EmailMessage) -> list[Element]:
        for addr in header.addresses:
            elements.append(
                element_type(
-                    name=addr.display_name or addr.username,
+                    name=addr.display_name or addr.username,  # type: ignore
                    text=addr.addr_spec,  # type: ignore
                )
            )
@@ -14,9 +14,9 @@ from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html.parser import Flow, html_parser
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.utils import is_temp_file_path, lazyproperty


@@ -6,7 +6,7 @@ from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common.common import exactly_one
-from unstructured.partition.lang import check_language_args
+from unstructured.partition.common.lang import check_language_args
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy

@@ -14,9 +14,9 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import partition_text
from unstructured.utils import is_temp_file_path, lazyproperty

@@ -48,12 +48,12 @@ from unstructured.partition.common.common import (
    ocr_data_to_elements,
    spooled_to_bytes_io_if_needed,
)
-from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import (
+from unstructured.partition.common.lang import (
    check_language_args,
    prepare_languages_for_tesseract,
    tesseract_to_paddle_language,
)
+from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.pdf_image.analysis.layout_dump import (
    ExtractedLayoutDumper,
    FinalLayoutDumper,
@@ -37,8 +37,8 @@ from unstructured.documents.elements import (
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import convert_ms_office_table_to_text
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
    is_email_address,
    is_possible_narrative_text,
@@ -30,8 +30,8 @@ from unstructured.file_utils.model import FileType
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
from unstructured.nlp.tokenize import sent_tokenize
from unstructured.partition.common.common import exactly_one
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
@@ -18,8 +18,8 @@ from unstructured.partition.common.common import (
    exactly_one,
    spooled_to_bytes_io_if_needed,
)
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata

DETECTION_ORIGIN: str = "tsv"

@@ -26,8 +26,8 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_possible_narrative_text,
@@ -20,8 +20,8 @@ from unstructured.partition.common.common import (
    exactly_one,
    spooled_to_bytes_io_if_needed,
)
+from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
-from unstructured.partition.lang import apply_lang_metadata
from unstructured.partition.text import element_from_text

DETECTION_ORIGIN: str = "xml"