diff --git a/CHANGELOG.md b/CHANGELOG.md index f25be82e4..e00a61449 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.6.1 + +### Enhancements + +* Renamed the PDF table extraction parameter to be more descriptive: `extract_tables` is now `infer_table_structure` in `partition_pdf`, and `pdf_extract_tables` is now `pdf_infer_table_structure` in `partition` + +### Features + +### Fixes + ## 0.6.0 ### Enhancements diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 48bc241cc..1c2312c3b 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction(): with patch( "unstructured_inference.inference.layout.process_file_with_model", ) as mock_process_file_with_model: - partition(filename, pdf_extract_tables=True) + partition(filename, pdf_infer_table_structure=True) assert mock_process_file_with_model.call_args[1]["extract_tables"] @@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy(): url=None, include_page_breaks=False, encoding="utf-8", - extract_tables=False, + infer_table_structure=False, strategy="fast", ocr_languages="eng", ) diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py index 3a63ec60e..ba6e6e12f 100644 --- a/test_unstructured/partition/test_pdf.py +++ b/test_unstructured/partition/test_pdf.py @@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction(): with mock.patch( "unstructured_inference.inference.layout.process_file_with_model", ) as mock_process_file_with_model: - pdf.partition_pdf(filename, extract_tables=True) + pdf.partition_pdf(filename, infer_table_structure=True) assert mock_process_file_with_model.call_args[1]["extract_tables"] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3b3ca6c45..a2b973a84 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.0" # pragma: no cover +__version__ = "0.6.1" # pragma: no cover diff --git a/unstructured/partition/auto.py 
b/unstructured/partition/auto.py index e1f170b36..6b79eb758 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -35,7 +35,7 @@ def partition( headers: Dict[str, str] = {}, ssl_verify: bool = True, ocr_languages: str = "eng", - pdf_extract_tables: bool = False, + pdf_infer_table_structure: bool = False, ): """Partitions a document into its constituent elements. Will use libmagic to determine the file's type and route it to the appropriate partitioning function. Applies the default @@ -71,9 +71,11 @@ def partition( ocr_languages The languages to use for the Tesseract agent. To use a language, you'll first need to isntall the appropriate Tesseract language pack. - pdf_extract_tables - If True, in the case that the file to be processed is detected to be a PDF, any tables that - are detected will be extracted. + pdf_infer_table_structure + If True and strategy=hi_res, any Table Elements extracted from a PDF will include an + additional metadata field, "text_as_html", where the value (string) is just a + transformation of the data into an HTML <table>. + The "text" field for a partitioned Table Element is always present, whether True or False. 
""" exactly_one(file=file, filename=filename, url=url) @@ -134,7 +136,7 @@ def partition( url=None, include_page_breaks=include_page_breaks, encoding=encoding, - extract_tables=pdf_extract_tables, + infer_table_structure=pdf_infer_table_structure, strategy=strategy, ocr_languages=ocr_languages, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 89103918d..35ae8b532 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -22,7 +22,7 @@ def partition_pdf( token: Optional[str] = None, include_page_breaks: bool = False, strategy: str = "hi_res", - extract_tables: bool = False, + infer_table_structure: bool = False, encoding: str = "utf-8", ocr_languages: str = "eng", ) -> List[Element]: @@ -45,12 +45,13 @@ def partition_pdf( The strategy to use for partitioning the PDF. Uses a layout detection model if set to 'hi_res', otherwise partition_pdf simply extracts the text from the document and processes it. - extract_tables - If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this - is True or False, the partitioning process will attempt to identify any tables in the - document. This parameter indicates that the partitioning process will attempt to extract the - structure of any identified tables. The table structure and cell contents will be stored as - HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html + infer_table_structure + Only applicable if `strategy=hi_res`. + If True, any Table elements that are extracted will also have a metadata field + named "text_as_html" where the table's text content is rendered into an html string. + I.e., rows and cells are preserved. + Whether True or False, the "text" field is always present in any Table element + and is the text content of the table (no structure). encoding The encoding method used to decode the text input. If None, utf-8 will be used. 
ocr_languages @@ -66,7 +67,7 @@ def partition_pdf( token=token, include_page_breaks=include_page_breaks, strategy=strategy, - extract_tables=extract_tables, + infer_table_structure=infer_table_structure, encoding=encoding, ocr_languages=ocr_languages, ) @@ -81,7 +82,7 @@ def partition_pdf_or_image( is_image: bool = False, include_page_breaks: bool = False, strategy: str = "hi_res", - extract_tables: bool = False, + infer_table_structure: bool = False, encoding: str = "utf-8", ocr_languages: str = "eng", ) -> List[Element]: @@ -117,7 +118,7 @@ def partition_pdf_or_image( file=file, template=out_template, is_image=is_image, - extract_tables=extract_tables, + infer_table_structure=infer_table_structure, include_page_breaks=True, ocr_languages=ocr_languages, ) @@ -128,7 +129,7 @@ def partition_pdf_or_image( "detectron2 is not installed. Cannot use the hi_res partitioning " "strategy. Falling back to partitioning with the fast strategy.", ) - if extract_tables: + if infer_table_structure: logger.warning( "Table extraction was selected, but is being ignored while using the fast " "strategy.", @@ -173,7 +174,7 @@ def _partition_pdf_or_image_local( file: Optional[bytes] = None, template: Optional[str] = None, is_image: bool = False, - extract_tables: bool = False, + infer_table_structure: bool = False, include_page_breaks: bool = False, ocr_languages: str = "eng", ) -> List[Element]: @@ -204,7 +205,7 @@ def _partition_pdf_or_image_local( template, is_image=is_image, ocr_languages=ocr_languages, - extract_tables=extract_tables, + extract_tables=infer_table_structure, ) else: layout = process_data_with_model( @@ -212,7 +213,7 @@ def _partition_pdf_or_image_local( template, is_image=is_image, ocr_languages=ocr_languages, - extract_tables=extract_tables, + extract_tables=infer_table_structure, ) return document_to_element_list(layout, include_page_breaks=include_page_breaks)