mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	rfctr(part): add new decorator to replace four (#3650)
**Summary** In preparation for pluggable auto-partitioners, add a new metadata decorator to replace the four existing ones. **Additional Context** "Global" metadata items, those applied to all elements by all partitioners, are applied using a decorator. Currently there are four decorators where only one is needed. Consolidate those into a single metadata decorator. One or two additional behaviors of the new decorator will allow us to remove decorators from delegating partitioners, which is a prerequisite for pluggable auto-partitioners.
This commit is contained in:
		
							parent
							
								
									44bad216f3
								
							
						
					
					
						commit
						50d75c47d3
					
				| @ -1,9 +1,11 @@ | |||||||
| ## 0.15.14-dev3 | ## 0.15.14-dev4 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
| 
 | 
 | ||||||
| ### Features | ### Features | ||||||
| 
 | 
 | ||||||
|  | * **Add (but do not install) a new post-partitioning decorator to handle metadata added for all file-types, like `.filename`, `.filetype` and `.languages`.** This will be installed in a closely following PR to replace the four currently being used for this purpose. | ||||||
|  | 
 | ||||||
| ### Fixes | ### Fixes | ||||||
| 
 | 
 | ||||||
| * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK. | * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK. | ||||||
|  | |||||||
| @ -6,15 +6,15 @@ from __future__ import annotations | |||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
| import pathlib | import pathlib | ||||||
| from typing import Union |  | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
|  | from test_unstructured.unit_utils import LogCaptureFixture | ||||||
| from unstructured.documents.elements import ( | from unstructured.documents.elements import ( | ||||||
|     NarrativeText, |     NarrativeText, | ||||||
|     PageBreak, |     PageBreak, | ||||||
| ) | ) | ||||||
| from unstructured.partition.lang import ( | from unstructured.partition.common.lang import ( | ||||||
|     _clean_ocr_languages_arg, |     _clean_ocr_languages_arg, | ||||||
|     _convert_language_code_to_pytesseract_lang_code, |     _convert_language_code_to_pytesseract_lang_code, | ||||||
|     apply_lang_metadata, |     apply_lang_metadata, | ||||||
| @ -61,13 +61,13 @@ def test_prepare_languages_for_tesseract_with_multiple_languages(): | |||||||
|     assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ" |     assert prepare_languages_for_tesseract(languages) == "jpn+jpn_vert+afr+eng+equ" | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog): | def test_prepare_languages_for_tesseract_warns_nonstandard_language(caplog: LogCaptureFixture): | ||||||
|     languages = ["zzz", "chi"] |     languages = ["zzz", "chi"] | ||||||
|     assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" |     assert prepare_languages_for_tesseract(languages) == "chi_sim+chi_sim_vert+chi_tra+chi_tra_vert" | ||||||
|     assert "not a valid standard language code" in caplog.text |     assert "not a valid standard language code" in caplog.text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog): | def test_prepare_languages_for_tesseract_warns_non_tesseract_language(caplog: LogCaptureFixture): | ||||||
|     languages = ["kbd", "eng"] |     languages = ["kbd", "eng"] | ||||||
|     assert prepare_languages_for_tesseract(languages) == "eng" |     assert prepare_languages_for_tesseract(languages) == "eng" | ||||||
|     assert "not a language supported by Tesseract" in caplog.text |     assert "not a language supported by Tesseract" in caplog.text | ||||||
| @ -79,7 +79,7 @@ def test_prepare_languages_for_tesseract_None_languages(): | |||||||
|         prepare_languages_for_tesseract(languages) |         prepare_languages_for_tesseract(languages) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_prepare_languages_for_tesseract_no_valid_languages(caplog): | def test_prepare_languages_for_tesseract_no_valid_languages(caplog: LogCaptureFixture): | ||||||
|     languages = [""] |     languages = [""] | ||||||
|     assert prepare_languages_for_tesseract(languages) == "eng" |     assert prepare_languages_for_tesseract(languages) == "eng" | ||||||
|     assert "Failed to find any valid standard language code from languages" in caplog.text |     assert "Failed to find any valid standard language code from languages" in caplog.text | ||||||
| @ -96,11 +96,11 @@ def test_prepare_languages_for_tesseract_no_valid_languages(caplog): | |||||||
|         ("kor", "korean"), |         ("kor", "korean"), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_tesseract_to_paddle_language_valid_codes(tesseract_lang, expected_lang): | def test_tesseract_to_paddle_language_valid_codes(tesseract_lang: str, expected_lang: str): | ||||||
|     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) |     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_tesseract_to_paddle_language_invalid_codes(caplog): | def test_tesseract_to_paddle_language_invalid_codes(caplog: LogCaptureFixture): | ||||||
|     tesseract_lang = "unsupported_lang" |     tesseract_lang = "unsupported_lang" | ||||||
|     assert tesseract_to_paddle_language(tesseract_lang) == "en" |     assert tesseract_to_paddle_language(tesseract_lang) == "en" | ||||||
|     assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text |     assert "unsupported_lang is not a language code supported by PaddleOCR," in caplog.text | ||||||
| @ -114,7 +114,7 @@ def test_tesseract_to_paddle_language_invalid_codes(caplog): | |||||||
|         ("DEU", "german"), |         ("DEU", "german"), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang, expected_lang): | def test_tesseract_to_paddle_language_case_sensitivity(tesseract_lang: str, expected_lang: str): | ||||||
|     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) |     assert expected_lang == tesseract_to_paddle_language(tesseract_lang) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -139,7 +139,7 @@ def test_detect_languages_gets_multiple_languages(): | |||||||
|     assert detect_languages(text) == ["ces", "pol", "slk"] |     assert detect_languages(text) == ["ces", "pol", "slk"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_detect_languages_warns_for_auto_and_other_input(caplog): | def test_detect_languages_warns_for_auto_and_other_input(caplog: LogCaptureFixture): | ||||||
|     text = "This is another short sentence." |     text = "This is another short sentence." | ||||||
|     languages = ["en", "auto", "rus"] |     languages = ["en", "auto", "rus"] | ||||||
|     assert detect_languages(text, languages) == ["eng"] |     assert detect_languages(text, languages) == ["eng"] | ||||||
| @ -149,10 +149,10 @@ def test_detect_languages_warns_for_auto_and_other_input(caplog): | |||||||
| def test_detect_languages_raises_TypeError_for_invalid_languages(): | def test_detect_languages_raises_TypeError_for_invalid_languages(): | ||||||
|     with pytest.raises(TypeError): |     with pytest.raises(TypeError): | ||||||
|         text = "This is a short sentence." |         text = "This is a short sentence." | ||||||
|         detect_languages(text, languages="eng") == ["eng"] |         detect_languages(text, languages="eng") == ["eng"]  # type: ignore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog): | def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog: LogCaptureFixture): | ||||||
|     elements = [NarrativeText("Sample text."), PageBreak("")] |     elements = [NarrativeText("Sample text."), PageBreak("")] | ||||||
|     elements = list( |     elements = list( | ||||||
|         apply_lang_metadata( |         apply_lang_metadata( | ||||||
| @ -171,7 +171,7 @@ def test_apply_lang_metadata_has_no_warning_for_PageBreak(caplog): | |||||||
|         ("fr", "fra"), |         ("fr", "fra"), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang): | def test_convert_language_code_to_pytesseract_lang_code(lang_in: str, expected_lang: str): | ||||||
|     assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in) |     assert expected_lang == _convert_language_code_to_pytesseract_lang_code(lang_in) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -187,7 +187,7 @@ def test_convert_language_code_to_pytesseract_lang_code(lang_in, expected_lang): | |||||||
|         ("deu+spa", "deu+spa"),  # correct input |         ("deu+spa", "deu+spa"),  # correct input | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_clean_ocr_languages_arg(input_ocr_langs, expected): | def test_clean_ocr_languages_arg(input_ocr_langs: str, expected: str): | ||||||
|     assert _clean_ocr_languages_arg(input_ocr_langs) == expected |     assert _clean_ocr_languages_arg(input_ocr_langs) == expected | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -209,12 +209,15 @@ def test_detect_languages_handles_spelled_out_languages(): | |||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined( | def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_defined( | ||||||
|     languages: Union[list[str], str], |     languages: list[str], | ||||||
|     ocr_languages: Union[list[str], str, None], |     ocr_languages: list[str] | str, | ||||||
|     expected_langs: list[str], |     expected_langs: list[str], | ||||||
|     caplog, |     caplog: LogCaptureFixture, | ||||||
| ): | ): | ||||||
|     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) |     returned_langs = check_language_args( | ||||||
|  |         languages=languages, | ||||||
|  |         ocr_languages=ocr_languages, | ||||||
|  |     ) | ||||||
|     for lang in returned_langs:  # type: ignore |     for lang in returned_langs:  # type: ignore | ||||||
|         assert lang in expected_langs |         assert lang in expected_langs | ||||||
|         assert "ocr_languages" in caplog.text |         assert "ocr_languages" in caplog.text | ||||||
| @ -231,10 +234,10 @@ def test_check_language_args_uses_languages_when_ocr_languages_and_languages_are | |||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( | def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( | ||||||
|     languages: Union[list[str], str], |     languages: list[str], | ||||||
|     ocr_languages: Union[list[str], str, None], |     ocr_languages: str, | ||||||
|     expected_langs: list[str], |     expected_langs: list[str], | ||||||
|     caplog, |     caplog: LogCaptureFixture, | ||||||
| ): | ): | ||||||
|     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) |     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) | ||||||
|     for lang in returned_langs:  # type: ignore |     for lang in returned_langs:  # type: ignore | ||||||
| @ -250,19 +253,15 @@ def test_check_language_args_uses_ocr_languages_when_languages_is_empty_or_None( | |||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_check_language_args_returns_None( | def test_check_language_args_returns_None( | ||||||
|     languages: Union[list[str], str, None], |     languages: list[str], | ||||||
|     ocr_languages: Union[list[str], str, None], |     ocr_languages: None, | ||||||
| ): | ): | ||||||
|     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) |     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) | ||||||
|     assert returned_langs is None |     assert returned_langs is None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_check_language_args_returns_auto( | def test_check_language_args_returns_auto(): | ||||||
|     languages=["eng", "spa", "auto"], |     assert check_language_args(languages=["eng", "spa", "auto"], ocr_languages=None) == ["auto"] | ||||||
|     ocr_languages=None, |  | ||||||
| ): |  | ||||||
|     returned_langs = check_language_args(languages=languages, ocr_languages=ocr_languages) |  | ||||||
|     assert returned_langs == ["auto"] |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
| @ -273,8 +272,11 @@ def test_check_language_args_returns_auto( | |||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_check_language_args_raises_error_when_ocr_languages_contains_auto( | def test_check_language_args_raises_error_when_ocr_languages_contains_auto( | ||||||
|     languages: Union[list[str], str, None], |     languages: list[str], | ||||||
|     ocr_languages: Union[list[str], str, None], |     ocr_languages: str | list[str], | ||||||
| ): | ): | ||||||
|     with pytest.raises(ValueError): |     with pytest.raises(ValueError): | ||||||
|         check_language_args(languages=languages, ocr_languages=ocr_languages) |         check_language_args( | ||||||
|  |             languages=languages, | ||||||
|  |             ocr_languages=ocr_languages, | ||||||
|  |         ) | ||||||
| @ -5,9 +5,13 @@ from __future__ import annotations | |||||||
| import datetime as dt | import datetime as dt | ||||||
| import os | import os | ||||||
| import pathlib | import pathlib | ||||||
|  | from typing import Any, Callable | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
| 
 | 
 | ||||||
| from unstructured.documents.elements import ( | from unstructured.documents.elements import ( | ||||||
|     CheckBox, |     CheckBox, | ||||||
|  |     Element, | ||||||
|     ElementMetadata, |     ElementMetadata, | ||||||
|     FigureCaption, |     FigureCaption, | ||||||
|     Header, |     Header, | ||||||
| @ -16,7 +20,9 @@ from unstructured.documents.elements import ( | |||||||
|     Text, |     Text, | ||||||
|     Title, |     Title, | ||||||
| ) | ) | ||||||
|  | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.metadata import ( | from unstructured.partition.common.metadata import ( | ||||||
|  |     apply_metadata, | ||||||
|     get_last_modified_date, |     get_last_modified_date, | ||||||
|     set_element_hierarchy, |     set_element_hierarchy, | ||||||
| ) | ) | ||||||
| @ -119,3 +125,193 @@ def test_set_element_hierarchy_custom_rule_set(): | |||||||
|     assert ( |     assert ( | ||||||
|         elements[5].metadata.parent_id == elements[4].id |         elements[5].metadata.parent_id == elements[4].id | ||||||
|     ), "FigureCaption should be child of Title 2" |     ), "FigureCaption should be child of Title 2" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Describe_apply_metadata: | ||||||
|  |     """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator.""" | ||||||
|  | 
 | ||||||
|  |     # -- unique-ids ------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  |         elements_2 = partition() | ||||||
|  | 
 | ||||||
|  |         # -- SHA1 hash is 32 characters long, no hyphens -- | ||||||
|  |         assert all(len(e.id) == 32 for e in elements) | ||||||
|  |         assert all("-" not in e.id for e in elements) | ||||||
|  |         # -- SHA1 hashes are deterministic -- | ||||||
|  |         assert all(e.id == e2.id for e, e2 in zip(elements, elements_2)) | ||||||
|  | 
 | ||||||
|  |     def it_assigns_hash_element_ids_when_unique_ids_arg_is_False( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(unique_element_ids=False) | ||||||
|  |         elements_2 = partition(unique_element_ids=False) | ||||||
|  | 
 | ||||||
|  |         # -- SHA1 hash is 32 characters long, no hyphens -- | ||||||
|  |         assert all(len(e.id) == 32 for e in elements) | ||||||
|  |         assert all("-" not in e.id for e in elements) | ||||||
|  |         # -- SHA1 hashes are deterministic -- | ||||||
|  |         assert all(e.id == e2.id for e, e2 in zip(elements, elements_2)) | ||||||
|  | 
 | ||||||
|  |     def it_leaves_UUID_element_ids_when_unique_ids_arg_is_True( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(unique_element_ids=True) | ||||||
|  |         elements_2 = partition(unique_element_ids=True) | ||||||
|  | 
 | ||||||
|  |         # -- UUID is 36 characters long with four hyphens -- | ||||||
|  |         assert all(len(e.id) == 36 for e in elements) | ||||||
|  |         assert all(e.id.count("-") == 4 for e in elements) | ||||||
|  |         # -- UUIDs are non-deterministic, different every time -- | ||||||
|  |         assert all(e.id != e2.id for e, e2 in zip(elements, elements_2)) | ||||||
|  | 
 | ||||||
|  |     # -- parent-id -------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_computes_and_assigns_parent_id(self, fake_partitioner: Callable[..., list[Element]]): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  | 
 | ||||||
|  |         title = elements[0] | ||||||
|  |         assert title.metadata.category_depth == 1 | ||||||
|  |         narr_text = elements[1] | ||||||
|  |         assert narr_text.metadata.parent_id == title.id | ||||||
|  | 
 | ||||||
|  |     # -- languages -------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_applies_language_metadata(self, fake_partitioner: Callable[..., list[Element]]): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(languages=["auto"], detect_language_per_element=True) | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.languages == ["eng"] for e in elements) | ||||||
|  | 
 | ||||||
|  |     # -- filetype (MIME-type) --------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_assigns_the_value_of_a_metadata_file_type_arg_when_there_is_one( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         """A `metadata_file_type` arg overrides the file-type specified in the decorator. | ||||||
|  | 
 | ||||||
|  |         This is used for example by a delegating partitioner to preserve the original file-type in | ||||||
|  |         the metadata, like EPUB instead of the HTML that partitioner converts the .epub file to. | ||||||
|  |         """ | ||||||
|  |         partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(metadata_file_type=FileType.ODT) | ||||||
|  | 
 | ||||||
|  |         assert all( | ||||||
|  |             e.metadata.filetype == "application/vnd.oasis.opendocument.text" for e in elements | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     def and_it_assigns_the_decorator_file_type_when_the_metadata_file_type_arg_is_omitted( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         """The `file_type=...` decorator arg is the "normal" way to specify the file-type. | ||||||
|  | 
 | ||||||
|  |         This is used for principal (non-delegating) partitioners. | ||||||
|  |         """ | ||||||
|  |         partition = apply_metadata(file_type=FileType.DOCX)(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  | 
 | ||||||
|  |         DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||||
|  |         assert all(e.metadata.filetype == DOCX_MIME_TYPE for e in elements) | ||||||
|  | 
 | ||||||
|  |     def and_it_does_not_assign_file_type_metadata_when_both_are_omitted( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         """A partitioner can elect to assign `.metadata.filetype` for itself. | ||||||
|  | 
 | ||||||
|  |         This is done in `partition_image()` for example where the same partitioner is used for | ||||||
|  |         multiple file-types. | ||||||
|  |         """ | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.filetype == "image/jpeg" for e in elements) | ||||||
|  | 
 | ||||||
|  |     # -- filename --------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_uses_metadata_filename_arg_value_when_present( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         """A `metadata_filename` arg overrides all other sources.""" | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(metadata_filename="a/b/c.xyz") | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.filename == "c.xyz" for e in elements) | ||||||
|  |         assert all(e.metadata.file_directory == "a/b" for e in elements) | ||||||
|  | 
 | ||||||
|  |     def and_it_uses_filename_arg_value_when_metadata_filename_arg_not_present( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(filename="a/b/c.xyz") | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.filename == "c.xyz" for e in elements) | ||||||
|  |         assert all(e.metadata.file_directory == "a/b" for e in elements) | ||||||
|  | 
 | ||||||
|  |     def and_it_does_not_assign_filename_metadata_when_neither_are_present( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.filename == "image.jpeg" for e in elements) | ||||||
|  |         assert all(e.metadata.file_directory == "x/y/images" for e in elements) | ||||||
|  | 
 | ||||||
|  |     # -- url -------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     def it_assigns_url_metadata_field_when_url_arg_is_present( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition(url="https://adobe.com/stock/54321") | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.url == "https://adobe.com/stock/54321" for e in elements) | ||||||
|  | 
 | ||||||
|  |     def and_it_does_not_assign_url_metadata_when_url_arg_is_not_present( | ||||||
|  |         self, fake_partitioner: Callable[..., list[Element]] | ||||||
|  |     ): | ||||||
|  |         partition = apply_metadata()(fake_partitioner) | ||||||
|  | 
 | ||||||
|  |         elements = partition() | ||||||
|  | 
 | ||||||
|  |         assert all(e.metadata.url == "http://images.com" for e in elements) | ||||||
|  | 
 | ||||||
|  |     # -- fixtures -------------------------------------------------------------------------------- | ||||||
|  | 
 | ||||||
|  |     @pytest.fixture | ||||||
|  |     def fake_partitioner(self) -> Callable[..., list[Element]]: | ||||||
|  |         def fake_partitioner(**kwargs: Any) -> list[Element]: | ||||||
|  |             title = Title("Introduction") | ||||||
|  |             title.metadata.category_depth = 1 | ||||||
|  |             title.metadata.file_directory = "x/y/images" | ||||||
|  |             title.metadata.filename = "image.jpeg" | ||||||
|  |             title.metadata.filetype = "image/jpeg" | ||||||
|  |             title.metadata.url = "http://images.com" | ||||||
|  | 
 | ||||||
|  |             narr_text = NarrativeText("To understand bar you must first understand foo.") | ||||||
|  |             narr_text.metadata.file_directory = "x/y/images" | ||||||
|  |             narr_text.metadata.filename = "image.jpeg" | ||||||
|  |             narr_text.metadata.filetype = "image/jpeg" | ||||||
|  |             narr_text.metadata.url = "http://images.com" | ||||||
|  | 
 | ||||||
|  |             return [title, narr_text] | ||||||
|  | 
 | ||||||
|  |         return fake_partitioner | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.15.14-dev3"  # pragma: no cover | __version__ = "0.15.14-dev4"  # pragma: no cover | ||||||
|  | |||||||
| @ -14,7 +14,7 @@ from unstructured.file_utils.filetype import detect_filetype, is_json_processabl | |||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.logger import logger | from unstructured.logger import logger | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.lang import check_language_args | from unstructured.partition.common.lang import check_language_args | ||||||
| from unstructured.partition.utils.constants import PartitionStrategy | from unstructured.partition.utils.constants import PartitionStrategy | ||||||
| from unstructured.utils import dependency_exists | from unstructured.utils import dependency_exists | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -3,8 +3,12 @@ from __future__ import annotations | |||||||
| import re | import re | ||||||
| from typing import Iterable, Iterator, Optional | from typing import Iterable, Iterator, Optional | ||||||
| 
 | 
 | ||||||
| import iso639 | import iso639  # pyright: ignore[reportMissingTypeStubs] | ||||||
| from langdetect import DetectorFactory, detect_langs, lang_detect_exception | from langdetect import (  # pyright: ignore[reportMissingTypeStubs] | ||||||
|  |     DetectorFactory, | ||||||
|  |     detect_langs,  # pyright: ignore[reportUnknownVariableType] | ||||||
|  |     lang_detect_exception, | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| from unstructured.documents.elements import Element | from unstructured.documents.elements import Element | ||||||
| from unstructured.logger import logger | from unstructured.logger import logger | ||||||
| @ -208,12 +212,13 @@ def prepare_languages_for_tesseract(languages: Optional[list[str]] = ["eng"]) -> | |||||||
|     """ |     """ | ||||||
|     if languages is None: |     if languages is None: | ||||||
|         raise ValueError("`languages` can not be `None`") |         raise ValueError("`languages` can not be `None`") | ||||||
|     converted_languages = list( |     converted_languages = [ | ||||||
|         filter( |         lang_code | ||||||
|             lambda x: x is not None and x != "", |         for lang_code in ( | ||||||
|             [_convert_language_code_to_pytesseract_lang_code(lang) for lang in languages], |             _convert_language_code_to_pytesseract_lang_code(lang) for lang in languages | ||||||
|         ), |  | ||||||
|         ) |         ) | ||||||
|  |         if lang_code | ||||||
|  |     ] | ||||||
|     # Remove duplicates from the list but keep the original order |     # Remove duplicates from the list but keep the original order | ||||||
|     converted_languages = list(dict.fromkeys(converted_languages)) |     converted_languages = list(dict.fromkeys(converted_languages)) | ||||||
|     if len(converted_languages) == 0: |     if len(converted_languages) == 0: | ||||||
| @ -245,13 +250,17 @@ def tesseract_to_paddle_language(tesseract_language: str) -> str: | |||||||
|     return lang |     return lang | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> Optional[list[str]]: | def check_language_args( | ||||||
|     """Handle users defining both `ocr_languages` and `languages`, giving preference to `languages` |     languages: list[str], ocr_languages: str | list[str] | None | ||||||
|     and converting `ocr_languages` if needed, but defaulting to `None. | ) -> list[str] | None: | ||||||
|  |     """Handle users defining both `ocr_languages` and `languages`. | ||||||
|  | 
 | ||||||
|  |     Give preference to `languages` and convert `ocr_languages` if needed, but default to `None`. | ||||||
| 
 | 
 | ||||||
|     `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`. |     `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`. | ||||||
|     `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection |     `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection | ||||||
|     which is not supported by `partition_image` or `partition_pdf`.""" |     which is not supported by `partition_image` or `partition_pdf`. | ||||||
|  |     """ | ||||||
|     # --- Clean and update defaults |     # --- Clean and update defaults | ||||||
|     if ocr_languages: |     if ocr_languages: | ||||||
|         ocr_languages = _clean_ocr_languages_arg(ocr_languages) |         ocr_languages = _clean_ocr_languages_arg(ocr_languages) | ||||||
| @ -259,6 +268,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O | |||||||
|             "The ocr_languages kwarg will be deprecated in a future version of unstructured. " |             "The ocr_languages kwarg will be deprecated in a future version of unstructured. " | ||||||
|             "Please use languages instead.", |             "Please use languages instead.", | ||||||
|         ) |         ) | ||||||
|  |     assert ocr_languages is None or isinstance(ocr_languages, str) | ||||||
| 
 | 
 | ||||||
|     if ocr_languages and "auto" in ocr_languages: |     if ocr_languages and "auto" in ocr_languages: | ||||||
|         raise ValueError( |         raise ValueError( | ||||||
| @ -268,7 +278,7 @@ def check_language_args(languages: list[str], ocr_languages: Optional[str]) -> O | |||||||
|             " Language detection is not currently supported in pdfs or images." |             " Language detection is not currently supported in pdfs or images." | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|     if not isinstance(languages, list): |     if not isinstance(languages, list):  # pyright: ignore[reportUnnecessaryIsInstance] | ||||||
|         raise TypeError( |         raise TypeError( | ||||||
|             "The language parameter must be a list of language codes as strings, ex. ['eng']", |             "The language parameter must be a list of language codes as strings, ex. ['eng']", | ||||||
|         ) |         ) | ||||||
| @ -354,7 +364,7 @@ def _convert_language_code_to_pytesseract_lang_code(lang: str) -> str: | |||||||
| 
 | 
 | ||||||
| def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]: | def _get_iso639_language_object(lang: str) -> Optional[iso639.Language]: | ||||||
|     try: |     try: | ||||||
|         return iso639.Language.match(lang.lower()) |         return iso639.Language.match(lang.lower())  # pyright: ignore[reportUnknownMemberType] | ||||||
|     except iso639.LanguageNotFoundError: |     except iso639.LanguageNotFoundError: | ||||||
|         logger.warning(f"{lang} is not a valid standard language code.") |         logger.warning(f"{lang} is not a valid standard language code.") | ||||||
|         return None |         return None | ||||||
| @ -431,10 +441,10 @@ def detect_languages( | |||||||
|         # machine translation |         # machine translation | ||||||
|         # TODO(shreya): decide how to maintain nonstandard chinese script information |         # TODO(shreya): decide how to maintain nonstandard chinese script information | ||||||
|         for langobj in langdetect_result: |         for langobj in langdetect_result: | ||||||
|             if str(langobj.lang).startswith("zh"): |             if str(langobj.lang).startswith("zh"):  # pyright: ignore | ||||||
|                 langdetect_langs.append("zho") |                 langdetect_langs.append("zho") | ||||||
|             else: |             else: | ||||||
|                 language = _get_iso639_language_object(langobj.lang[:3]) |                 language = _get_iso639_language_object(langobj.lang[:3])  # pyright: ignore | ||||||
|                 if language: |                 if language: | ||||||
|                     langdetect_langs.append(language.part3) |                     langdetect_langs.append(language.part3) | ||||||
| 
 | 
 | ||||||
| @ -3,13 +3,21 @@ | |||||||
| from __future__ import annotations | from __future__ import annotations | ||||||
| 
 | 
 | ||||||
| import datetime as dt | import datetime as dt | ||||||
|  | import functools | ||||||
| import os | import os | ||||||
| from typing import Optional, Sequence | from typing import Any, Callable, Sequence | ||||||
| 
 | 
 | ||||||
| from unstructured.documents.elements import Element | from typing_extensions import ParamSpec | ||||||
|  | 
 | ||||||
|  | from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids | ||||||
|  | from unstructured.file_utils.model import FileType | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
|  | from unstructured.utils import get_call_args_applying_defaults | ||||||
|  | 
 | ||||||
|  | _P = ParamSpec("_P") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_last_modified_date(filename: str) -> Optional[str]: | def get_last_modified_date(filename: str) -> str | None: | ||||||
|     """Modification time of file at path `filename`, if it exists. |     """Modification time of file at path `filename`, if it exists. | ||||||
| 
 | 
 | ||||||
|     Returns `None` when `filename` is not a path to a file on the local filesystem. |     Returns `None` when `filename` is not a path to a file on the local filesystem. | ||||||
| @ -54,9 +62,9 @@ HIERARCHY_RULE_SET = { | |||||||
| def set_element_hierarchy( | def set_element_hierarchy( | ||||||
|     elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET |     elements: Sequence[Element], ruleset: dict[str, list[str]] = HIERARCHY_RULE_SET | ||||||
| ) -> list[Element]: | ) -> list[Element]: | ||||||
|     """Sets the parent_id for each element in the list of elements |     """Sets `.metadata.parent_id` for each element it applies to. | ||||||
|     based on the element's category, depth and a ruleset |  | ||||||
| 
 | 
 | ||||||
|  |     `parent_id` assignment is based on the element's category, depth and a ruleset. | ||||||
|     """ |     """ | ||||||
|     stack: list[Element] = [] |     stack: list[Element] = [] | ||||||
|     for element in elements: |     for element in elements: | ||||||
| @ -97,3 +105,104 @@ def set_element_hierarchy( | |||||||
|         stack.append(element) |         stack.append(element) | ||||||
| 
 | 
 | ||||||
|     return list(elements) |     return list(elements) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # ================================================================================================ | ||||||
|  | # METADATA POST-PARTITIONING PROCESSING DECORATOR | ||||||
|  | # ================================================================================================ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def apply_metadata( | ||||||
|  |     file_type: FileType | None = None, | ||||||
|  | ) -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]: | ||||||
|  |     """Post-process element-metadata for this document. | ||||||
|  | 
 | ||||||
|  |     This decorator adds a post-processing step to a partitioner, primarily to apply metadata that | ||||||
|  |     is common to all partitioners. It assumes the following responsibilities: | ||||||
|  | 
 | ||||||
|  |       - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids` | ||||||
|  |         argument is False. | ||||||
|  | 
 | ||||||
|  |       - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth` | ||||||
|  |         etc. added by partitioner. | ||||||
|  | 
 | ||||||
|  |       - Language metadata. Computes and applies `language` metadata based on a language detection | ||||||
|  |         model. | ||||||
|  | 
 | ||||||
|  |       - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that | ||||||
|  |         applies is used: | ||||||
|  | 
 | ||||||
|  |           - `metadata_file_type` argument is present in call, use that. | ||||||
|  |           - `file_type` decorator argument is populated, use that. | ||||||
|  |           - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype` | ||||||
|  |             (assume the partitioner will do that for itself, like `partition_image()`. | ||||||
|  | 
 | ||||||
|  |       - Replace `filename` with `metadata_filename` when present. | ||||||
|  | 
 | ||||||
|  |       - Apply `url` metadata when present. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]: | ||||||
|  |         """The decorator function itself. | ||||||
|  | 
 | ||||||
|  |         This function is returned by the `apply_metadata()` function and is the actual decorator. | ||||||
|  |         Think of `apply_metadata()` as a factory function that configures this decorator, in | ||||||
|  |         particular by setting its `file_type` value. | ||||||
|  |         """ | ||||||
|  | 
 | ||||||
|  |         @functools.wraps(func) | ||||||
|  |         def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]: | ||||||
|  |             elements = func(*args, **kwargs) | ||||||
|  |             call_args = get_call_args_applying_defaults(func, *args, **kwargs) | ||||||
|  | 
 | ||||||
|  |             # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the | ||||||
|  |             # -- elements themselves, not the metadata. | ||||||
|  |             unique_element_ids: bool = call_args.get("unique_element_ids", False) | ||||||
|  |             if unique_element_ids is False: | ||||||
|  |                 elements = assign_and_map_hash_ids(elements) | ||||||
|  | 
 | ||||||
|  |             # -- `parent_id` - process category-level etc. to assign parent-id -- | ||||||
|  |             elements = set_element_hierarchy(elements) | ||||||
|  | 
 | ||||||
|  |             # -- `language` - auto-detect language (e.g. eng, spa) -- | ||||||
|  |             languages = call_args.get("languages") | ||||||
|  |             detect_language_per_element = call_args.get("detect_language_per_element", False) | ||||||
|  |             elements = list( | ||||||
|  |                 apply_lang_metadata( | ||||||
|  |                     elements=elements, | ||||||
|  |                     languages=languages, | ||||||
|  |                     detect_language_per_element=detect_language_per_element, | ||||||
|  |                 ) | ||||||
|  |             ) | ||||||
|  | 
 | ||||||
|  |             # == apply filetype, filename, and url metadata ========================= | ||||||
|  |             metadata_kwargs: dict[str, Any] = {} | ||||||
|  | 
 | ||||||
|  |             # -- `filetype` (MIME-type) metadata -- | ||||||
|  |             metadata_file_type = call_args.get("metadata_file_type") or file_type | ||||||
|  |             if metadata_file_type is not None: | ||||||
|  |                 metadata_kwargs["filetype"] = metadata_file_type.mime_type | ||||||
|  | 
 | ||||||
|  |             # -- `filename` metadata - override with metadata_filename when it's present -- | ||||||
|  |             filename = call_args.get("metadata_filename") or call_args.get("filename") | ||||||
|  |             if filename: | ||||||
|  |                 metadata_kwargs["filename"] = filename | ||||||
|  | 
 | ||||||
|  |             # -- `url` metadata - record url when present -- | ||||||
|  |             url = call_args.get("url") | ||||||
|  |             if url: | ||||||
|  |                 metadata_kwargs["url"] = url | ||||||
|  | 
 | ||||||
|  |             # -- update element.metadata in single pass -- | ||||||
|  |             for element in elements: | ||||||
|  |                 # NOTE(robinson) - Attached files have already run through this logic in their own | ||||||
|  |                 # partitioning function | ||||||
|  |                 if element.metadata.attached_to_filename: | ||||||
|  |                     continue | ||||||
|  |                 element.metadata.update(ElementMetadata(**metadata_kwargs)) | ||||||
|  | 
 | ||||||
|  |             return elements | ||||||
|  | 
 | ||||||
|  |         return wrapper | ||||||
|  | 
 | ||||||
|  |     return decorator | ||||||
|  | |||||||
| @ -16,8 +16,8 @@ from unstructured.documents.elements import ( | |||||||
| ) | ) | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.utils import is_temp_file_path, lazyproperty | from unstructured.utils import is_temp_file_path, lazyproperty | ||||||
| 
 | 
 | ||||||
| DETECTION_ORIGIN: str = "csv" | DETECTION_ORIGIN: str = "csv" | ||||||
|  | |||||||
| @ -46,8 +46,8 @@ from unstructured.documents.elements import ( | |||||||
| ) | ) | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text_type import ( | from unstructured.partition.text_type import ( | ||||||
|     is_bulleted_text, |     is_bulleted_text, | ||||||
|     is_email_address, |     is_email_address, | ||||||
|  | |||||||
| @ -47,9 +47,9 @@ from unstructured.file_utils.model import FileType | |||||||
| from unstructured.logger import logger | from unstructured.logger import logger | ||||||
| from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE | from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE | ||||||
| from unstructured.partition.common.common import convert_to_bytes, exactly_one | from unstructured.partition.common.common import convert_to_bytes, exactly_one | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.html import partition_html | from unstructured.partition.html import partition_html | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text import partition_text | from unstructured.partition.text import partition_text | ||||||
| 
 | 
 | ||||||
| VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] | VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] | ||||||
| @ -101,7 +101,7 @@ def partition_email_header(msg: EmailMessage) -> list[Element]: | |||||||
|         for addr in header.addresses: |         for addr in header.addresses: | ||||||
|             elements.append( |             elements.append( | ||||||
|                 element_type( |                 element_type( | ||||||
|                     name=addr.display_name or addr.username, |                     name=addr.display_name or addr.username,  # type: ignore | ||||||
|                     text=addr.addr_spec,  # type: ignore |                     text=addr.addr_spec,  # type: ignore | ||||||
|                 ) |                 ) | ||||||
|             ) |             ) | ||||||
|  | |||||||
| @ -14,9 +14,9 @@ from unstructured.documents.elements import Element, process_metadata | |||||||
| from unstructured.file_utils.encoding import read_txt_file | from unstructured.file_utils.encoding import read_txt_file | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.html.parser import Flow, html_parser | from unstructured.partition.html.parser import Flow, html_parser | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.utils import is_temp_file_path, lazyproperty | from unstructured.utils import is_temp_file_path, lazyproperty | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -6,7 +6,7 @@ from unstructured.chunking import add_chunking_strategy | |||||||
| from unstructured.documents.elements import Element, process_metadata | from unstructured.documents.elements import Element, process_metadata | ||||||
| from unstructured.file_utils.filetype import add_metadata | from unstructured.file_utils.filetype import add_metadata | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
| from unstructured.partition.lang import check_language_args | from unstructured.partition.common.lang import check_language_args | ||||||
| from unstructured.partition.pdf import partition_pdf_or_image | from unstructured.partition.pdf import partition_pdf_or_image | ||||||
| from unstructured.partition.utils.constants import PartitionStrategy | from unstructured.partition.utils.constants import PartitionStrategy | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -14,9 +14,9 @@ from unstructured.documents.elements import Element, ElementMetadata, process_me | |||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.logger import logger | from unstructured.logger import logger | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.html import partition_html | from unstructured.partition.html import partition_html | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text import partition_text | from unstructured.partition.text import partition_text | ||||||
| from unstructured.utils import is_temp_file_path, lazyproperty | from unstructured.utils import is_temp_file_path, lazyproperty | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -48,12 +48,12 @@ from unstructured.partition.common.common import ( | |||||||
|     ocr_data_to_elements, |     ocr_data_to_elements, | ||||||
|     spooled_to_bytes_io_if_needed, |     spooled_to_bytes_io_if_needed, | ||||||
| ) | ) | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.lang import ( | ||||||
| from unstructured.partition.lang import ( |  | ||||||
|     check_language_args, |     check_language_args, | ||||||
|     prepare_languages_for_tesseract, |     prepare_languages_for_tesseract, | ||||||
|     tesseract_to_paddle_language, |     tesseract_to_paddle_language, | ||||||
| ) | ) | ||||||
|  | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.pdf_image.analysis.layout_dump import ( | from unstructured.partition.pdf_image.analysis.layout_dump import ( | ||||||
|     ExtractedLayoutDumper, |     ExtractedLayoutDumper, | ||||||
|     FinalLayoutDumper, |     FinalLayoutDumper, | ||||||
|  | |||||||
| @ -37,8 +37,8 @@ from unstructured.documents.elements import ( | |||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
| from unstructured.partition.common.common import convert_ms_office_table_to_text | from unstructured.partition.common.common import convert_ms_office_table_to_text | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text_type import ( | from unstructured.partition.text_type import ( | ||||||
|     is_email_address, |     is_email_address, | ||||||
|     is_possible_narrative_text, |     is_possible_narrative_text, | ||||||
|  | |||||||
| @ -30,8 +30,8 @@ from unstructured.file_utils.model import FileType | |||||||
| from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE | from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE | ||||||
| from unstructured.nlp.tokenize import sent_tokenize | from unstructured.nlp.tokenize import sent_tokenize | ||||||
| from unstructured.partition.common.common import exactly_one | from unstructured.partition.common.common import exactly_one | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text_type import ( | from unstructured.partition.text_type import ( | ||||||
|     is_bulleted_text, |     is_bulleted_text, | ||||||
|     is_email_address, |     is_email_address, | ||||||
|  | |||||||
| @ -18,8 +18,8 @@ from unstructured.partition.common.common import ( | |||||||
|     exactly_one, |     exactly_one, | ||||||
|     spooled_to_bytes_io_if_needed, |     spooled_to_bytes_io_if_needed, | ||||||
| ) | ) | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| 
 | 
 | ||||||
| DETECTION_ORIGIN: str = "tsv" | DETECTION_ORIGIN: str = "tsv" | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -26,8 +26,8 @@ from unstructured.documents.elements import ( | |||||||
| ) | ) | ||||||
| from unstructured.file_utils.filetype import add_metadata_with_filetype | from unstructured.file_utils.filetype import add_metadata_with_filetype | ||||||
| from unstructured.file_utils.model import FileType | from unstructured.file_utils.model import FileType | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text_type import ( | from unstructured.partition.text_type import ( | ||||||
|     is_bulleted_text, |     is_bulleted_text, | ||||||
|     is_possible_narrative_text, |     is_possible_narrative_text, | ||||||
|  | |||||||
| @ -20,8 +20,8 @@ from unstructured.partition.common.common import ( | |||||||
|     exactly_one, |     exactly_one, | ||||||
|     spooled_to_bytes_io_if_needed, |     spooled_to_bytes_io_if_needed, | ||||||
| ) | ) | ||||||
|  | from unstructured.partition.common.lang import apply_lang_metadata | ||||||
| from unstructured.partition.common.metadata import get_last_modified_date | from unstructured.partition.common.metadata import get_last_modified_date | ||||||
| from unstructured.partition.lang import apply_lang_metadata |  | ||||||
| from unstructured.partition.text import element_from_text | from unstructured.partition.text import element_from_text | ||||||
| 
 | 
 | ||||||
| DETECTION_ORIGIN: str = "xml" | DETECTION_ORIGIN: str = "xml" | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Steve Canny
						Steve Canny