chore: change table param name (#513)

Updated the parameter names that control whether we try to infer table structure.
This commit is contained in:
qued 2023-04-21 13:48:19 -05:00 committed by GitHub
parent ba59ad6b3a
commit 5b6640a55a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 36 additions and 23 deletions

View File

@ -1,3 +1,13 @@
## 0.6.1
### Enhancements
* Updated the table extraction parameter name to be more descriptive
### Features
### Fixes
## 0.6.0
### Enhancements

View File

@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
with patch(
"unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model:
partition(filename, pdf_extract_tables=True)
partition(filename, pdf_infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"]
@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
url=None,
include_page_breaks=False,
encoding="utf-8",
extract_tables=False,
infer_table_structure=False,
strategy="fast",
ocr_languages="eng",
)

View File

@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
with mock.patch(
"unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model:
pdf.partition_pdf(filename, extract_tables=True)
pdf.partition_pdf(filename, infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"]

View File

@ -1 +1 @@
__version__ = "0.6.0" # pragma: no cover
__version__ = "0.6.1" # pragma: no cover

View File

@ -35,7 +35,7 @@ def partition(
headers: Dict[str, str] = {},
ssl_verify: bool = True,
ocr_languages: str = "eng",
pdf_extract_tables: bool = False,
pdf_infer_table_structure: bool = False,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default
@ -71,9 +71,11 @@ def partition(
ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need
to install the appropriate Tesseract language pack.
pdf_extract_tables
If True, in the case that the file to be processed is detected to be a PDF, any tables that
are detected will be extracted.
pdf_infer_table_structure
If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
"""
exactly_one(file=file, filename=filename, url=url)
@ -134,7 +136,7 @@ def partition(
url=None,
include_page_breaks=include_page_breaks,
encoding=encoding,
extract_tables=pdf_extract_tables,
infer_table_structure=pdf_infer_table_structure,
strategy=strategy,
ocr_languages=ocr_languages,
)

View File

@ -22,7 +22,7 @@ def partition_pdf(
token: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = "hi_res",
extract_tables: bool = False,
infer_table_structure: bool = False,
encoding: str = "utf-8",
ocr_languages: str = "eng",
) -> List[Element]:
@ -45,12 +45,13 @@ def partition_pdf(
The strategy to use for partitioning the PDF. Uses a layout detection model if set
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
and processes it.
extract_tables
If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
is True or False, the partitioning process will attempt to identify any tables in the
document. This parameter indicates that the partitioning process will attempt to extract the
structure of any identified tables. The table structure and cell contents will be stored as
HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
infer_table_structure
Only applicable if `strategy=hi_res`.
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
ocr_languages
@ -66,7 +67,7 @@ def partition_pdf(
token=token,
include_page_breaks=include_page_breaks,
strategy=strategy,
extract_tables=extract_tables,
infer_table_structure=infer_table_structure,
encoding=encoding,
ocr_languages=ocr_languages,
)
@ -81,7 +82,7 @@ def partition_pdf_or_image(
is_image: bool = False,
include_page_breaks: bool = False,
strategy: str = "hi_res",
extract_tables: bool = False,
infer_table_structure: bool = False,
encoding: str = "utf-8",
ocr_languages: str = "eng",
) -> List[Element]:
@ -117,7 +118,7 @@ def partition_pdf_or_image(
file=file,
template=out_template,
is_image=is_image,
extract_tables=extract_tables,
infer_table_structure=infer_table_structure,
include_page_breaks=True,
ocr_languages=ocr_languages,
)
@ -128,7 +129,7 @@ def partition_pdf_or_image(
"detectron2 is not installed. Cannot use the hi_res partitioning "
"strategy. Falling back to partitioning with the fast strategy.",
)
if extract_tables:
if infer_table_structure:
logger.warning(
"Table extraction was selected, but is being ignored while using the fast "
"strategy.",
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
file: Optional[bytes] = None,
template: Optional[str] = None,
is_image: bool = False,
extract_tables: bool = False,
infer_table_structure: bool = False,
include_page_breaks: bool = False,
ocr_languages: str = "eng",
) -> List[Element]:
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
template,
is_image=is_image,
ocr_languages=ocr_languages,
extract_tables=extract_tables,
extract_tables=infer_table_structure,
)
else:
layout = process_data_with_model(
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
template,
is_image=is_image,
ocr_languages=ocr_languages,
extract_tables=extract_tables,
extract_tables=infer_table_structure,
)
return document_to_element_list(layout, include_page_breaks=include_page_breaks)