diff --git a/CHANGELOG.md b/CHANGELOG.md
index f25be82e4..e00a61449 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.6.1
+
+### Enhancements
+
+* Updated the table extraction parameter name to be more descriptive
+
+### Features
+
+### Fixes
+
## 0.6.0
### Enhancements
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 48bc241cc..1c2312c3b 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
with patch(
"unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model:
- partition(filename, pdf_extract_tables=True)
+ partition(filename, pdf_infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"]
@@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
url=None,
include_page_breaks=False,
encoding="utf-8",
- extract_tables=False,
+ infer_table_structure=False,
strategy="fast",
ocr_languages="eng",
)
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
index 3a63ec60e..ba6e6e12f 100644
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
with mock.patch(
"unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model:
- pdf.partition_pdf(filename, extract_tables=True)
+ pdf.partition_pdf(filename, infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 3b3ca6c45..a2b973a84 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.0" # pragma: no cover
+__version__ = "0.6.1" # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index e1f170b36..6b79eb758 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -35,7 +35,7 @@ def partition(
headers: Dict[str, str] = {},
ssl_verify: bool = True,
ocr_languages: str = "eng",
- pdf_extract_tables: bool = False,
+ pdf_infer_table_structure: bool = False,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default
@@ -71,9 +71,11 @@ def partition(
ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need
to isntall the appropriate Tesseract language pack.
- pdf_extract_tables
- If True, in the case that the file to be processed is detected to be a PDF, any tables that
- are detected will be extracted.
+ pdf_infer_table_structure
+ If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
+ additional metadata field, "text_as_html," where the value (string) is just a
+ transformation of the data into an HTML <table>.
+ The "text" field for a partitioned Table Element is always present, whether True or False.
"""
exactly_one(file=file, filename=filename, url=url)
@@ -134,7 +136,7 @@ def partition(
url=None,
include_page_breaks=include_page_breaks,
encoding=encoding,
- extract_tables=pdf_extract_tables,
+ infer_table_structure=pdf_infer_table_structure,
strategy=strategy,
ocr_languages=ocr_languages,
)
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 89103918d..35ae8b532 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -22,7 +22,7 @@ def partition_pdf(
token: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = "hi_res",
- extract_tables: bool = False,
+ infer_table_structure: bool = False,
encoding: str = "utf-8",
ocr_languages: str = "eng",
) -> List[Element]:
@@ -45,12 +45,13 @@ def partition_pdf(
The strategy to use for partitioning the PDF. Uses a layout detection model if set
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
and processes it.
- extract_tables
- If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
- is True or False, the partitioning process will attempt to identify any tables in the
- document. This parameter indicates that the partitioning process will attempt to extract the
- structure of any identified tables. The table structure and cell contents will be stored as
- HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
+ infer_table_structure
+ Only applicable if `strategy=hi_res`.
+ If True, any Table elements that are extracted will also have a metadata field
+ named "text_as_html" where the table's text content is rendered into an html string.
+ I.e., rows and cells are preserved.
+ Whether True or False, the "text" field is always present in any Table element
+ and is the text content of the table (no structure).
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
ocr_languages
@@ -66,7 +67,7 @@ def partition_pdf(
token=token,
include_page_breaks=include_page_breaks,
strategy=strategy,
- extract_tables=extract_tables,
+ infer_table_structure=infer_table_structure,
encoding=encoding,
ocr_languages=ocr_languages,
)
@@ -81,7 +82,7 @@ def partition_pdf_or_image(
is_image: bool = False,
include_page_breaks: bool = False,
strategy: str = "hi_res",
- extract_tables: bool = False,
+ infer_table_structure: bool = False,
encoding: str = "utf-8",
ocr_languages: str = "eng",
) -> List[Element]:
@@ -117,7 +118,7 @@ def partition_pdf_or_image(
file=file,
template=out_template,
is_image=is_image,
- extract_tables=extract_tables,
+ infer_table_structure=infer_table_structure,
include_page_breaks=True,
ocr_languages=ocr_languages,
)
@@ -128,7 +129,7 @@ def partition_pdf_or_image(
"detectron2 is not installed. Cannot use the hi_res partitioning "
"strategy. Falling back to partitioning with the fast strategy.",
)
- if extract_tables:
+ if infer_table_structure:
logger.warning(
"Table extraction was selected, but is being ignored while using the fast "
"strategy.",
@@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
file: Optional[bytes] = None,
template: Optional[str] = None,
is_image: bool = False,
- extract_tables: bool = False,
+ infer_table_structure: bool = False,
include_page_breaks: bool = False,
ocr_languages: str = "eng",
) -> List[Element]:
@@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
template,
is_image=is_image,
ocr_languages=ocr_languages,
- extract_tables=extract_tables,
+ extract_tables=infer_table_structure,
)
else:
layout = process_data_with_model(
@@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
template,
is_image=is_image,
ocr_languages=ocr_languages,
- extract_tables=extract_tables,
+ extract_tables=infer_table_structure,
)
return document_to_element_list(layout, include_page_breaks=include_page_breaks)