@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Check that `partition` handles a TSV file given only its filename.

    The first returned element must carry the expected plain text (after
    whitespace clean-up), the expected HTML rendering of the table, and the
    "text/tsv" filetype in its metadata.
    """
    first_element = partition(filename=filename)[0]

    cleaned_text = clean_extra_whitespace(first_element.text)
    assert cleaned_text == EXPECTED_TEXT

    metadata = first_element.metadata
    assert metadata.text_as_html == EXPECTED_TABLE
    assert metadata.filetype == "text/tsv"
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Check that hi_res image partitioning emits HTML for exactly one table.

    With `infer_table_structure=True`, exactly one returned element should
    have a non-empty `text_as_html` metadata field, and that HTML should
    contain a known phrase from the sample document's table.
    """
    partitioned = image.partition_image(
        filename=filename,
        strategy="hi_res",
        infer_table_structure=True,
    )

    # Collect the HTML rendering of every element that has one.
    html_tables = []
    for element in partitioned:
        html = element.metadata.text_as_html
        if html:
            html_tables.append(html)

    assert len(html_tables) == 1
    assert "Layouts of history Japanese documents" in html_tables[0]
def decide_table_extraction(
    filetype: Optional[FileType],
    skip_infer_table_types: List[str],
    pdf_infer_table_structure: bool,
) -> bool:
    """Decide whether table structure should be inferred for a document.

    Parameters
    ----------
    filetype
        The detected file type, or None. Its lower-cased ``name`` is the key
        that is looked up in ``skip_infer_table_types``.
    skip_infer_table_types
        Lower-case document type names (e.g. "pdf", "jpg") for which table
        extraction should be skipped.
    pdf_infer_table_structure
        PDF-only flag. When True it turns table extraction on for PDFs even
        if "pdf" is listed in ``skip_infer_table_types``; a warning is logged
        about the conflicting settings in that case.

    Returns
    -------
    bool
        True if table structure should be inferred, False otherwise.
    """
    doc_type = filetype.name.lower() if filetype else None
    extraction_enabled = doc_type not in skip_infer_table_types

    if doc_type != "pdf":
        return extraction_enabled

    if not extraction_enabled and pdf_infer_table_structure:
        # The two settings disagree; the PDF-specific flag wins, but tell the
        # caller how to make the configuration consistent.
        logger.warning(
            f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
            f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
            "please reset skip_infer_table_types to turn on table extraction for PDFs.",
        )
    return extraction_enabled or pdf_infer_table_structure