chore: change table param name (#513)

Updated the parameter names that control whether we try to infer table structure.
This commit is contained in:
qued 2023-04-21 13:48:19 -05:00 committed by GitHub
parent ba59ad6b3a
commit 5b6640a55a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 36 additions and 23 deletions

View File

@ -1,3 +1,13 @@
## 0.6.1
### Enhancements
* Updated the table extraction parameter name to be more descriptive
### Features
### Fixes
## 0.6.0 ## 0.6.0
### Enhancements ### Enhancements

View File

@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
with patch( with patch(
"unstructured_inference.inference.layout.process_file_with_model", "unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model: ) as mock_process_file_with_model:
partition(filename, pdf_extract_tables=True) partition(filename, pdf_infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"] assert mock_process_file_with_model.call_args[1]["extract_tables"]
@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
url=None, url=None,
include_page_breaks=False, include_page_breaks=False,
encoding="utf-8", encoding="utf-8",
extract_tables=False, infer_table_structure=False,
strategy="fast", strategy="fast",
ocr_languages="eng", ocr_languages="eng",
) )

View File

@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
with mock.patch( with mock.patch(
"unstructured_inference.inference.layout.process_file_with_model", "unstructured_inference.inference.layout.process_file_with_model",
) as mock_process_file_with_model: ) as mock_process_file_with_model:
pdf.partition_pdf(filename, extract_tables=True) pdf.partition_pdf(filename, infer_table_structure=True)
assert mock_process_file_with_model.call_args[1]["extract_tables"] assert mock_process_file_with_model.call_args[1]["extract_tables"]

View File

@ -1 +1 @@
__version__ = "0.6.0" # pragma: no cover __version__ = "0.6.1" # pragma: no cover

View File

@ -35,7 +35,7 @@ def partition(
headers: Dict[str, str] = {}, headers: Dict[str, str] = {},
ssl_verify: bool = True, ssl_verify: bool = True,
ocr_languages: str = "eng", ocr_languages: str = "eng",
pdf_extract_tables: bool = False, pdf_infer_table_structure: bool = False,
): ):
"""Partitions a document into its constituent elements. Will use libmagic to determine """Partitions a document into its constituent elements. Will use libmagic to determine
the file's type and route it to the appropriate partitioning function. Applies the default the file's type and route it to the appropriate partitioning function. Applies the default
@ -71,9 +71,11 @@ def partition(
ocr_languages ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need The languages to use for the Tesseract agent. To use a language, you'll first need
to install the appropriate Tesseract language pack. to install the appropriate Tesseract language pack.
pdf_extract_tables pdf_infer_table_structure
If True, in the case that the file to be processed is detected to be a PDF, any tables that If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
are detected will be extracted. additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
""" """
exactly_one(file=file, filename=filename, url=url) exactly_one(file=file, filename=filename, url=url)
@ -134,7 +136,7 @@ def partition(
url=None, url=None,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
encoding=encoding, encoding=encoding,
extract_tables=pdf_extract_tables, infer_table_structure=pdf_infer_table_structure,
strategy=strategy, strategy=strategy,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
) )

View File

@ -22,7 +22,7 @@ def partition_pdf(
token: Optional[str] = None, token: Optional[str] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = "hi_res", strategy: str = "hi_res",
extract_tables: bool = False, infer_table_structure: bool = False,
encoding: str = "utf-8", encoding: str = "utf-8",
ocr_languages: str = "eng", ocr_languages: str = "eng",
) -> List[Element]: ) -> List[Element]:
@ -45,12 +45,13 @@ def partition_pdf(
The strategy to use for partitioning the PDF. Uses a layout detection model if set The strategy to use for partitioning the PDF. Uses a layout detection model if set
to 'hi_res', otherwise partition_pdf simply extracts the text from the document to 'hi_res', otherwise partition_pdf simply extracts the text from the document
and processes it. and processes it.
extract_tables infer_table_structure
If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this Only applicable if `strategy=hi_res`.
is True or False, the partitioning process will attempt to identify any tables in the If True, any Table elements that are extracted will also have a metadata field
document. This parameter indicates that the partitioning process will attempt to extract the named "text_as_html" where the table's text content is rendered into an html string.
structure of any identified tables. The table structure and cell contents will be stored as I.e., rows and cells are preserved.
HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
encoding encoding
The encoding method used to decode the text input. If None, utf-8 will be used. The encoding method used to decode the text input. If None, utf-8 will be used.
ocr_languages ocr_languages
@ -66,7 +67,7 @@ def partition_pdf(
token=token, token=token,
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
strategy=strategy, strategy=strategy,
extract_tables=extract_tables, infer_table_structure=infer_table_structure,
encoding=encoding, encoding=encoding,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
) )
@ -81,7 +82,7 @@ def partition_pdf_or_image(
is_image: bool = False, is_image: bool = False,
include_page_breaks: bool = False, include_page_breaks: bool = False,
strategy: str = "hi_res", strategy: str = "hi_res",
extract_tables: bool = False, infer_table_structure: bool = False,
encoding: str = "utf-8", encoding: str = "utf-8",
ocr_languages: str = "eng", ocr_languages: str = "eng",
) -> List[Element]: ) -> List[Element]:
@ -117,7 +118,7 @@ def partition_pdf_or_image(
file=file, file=file,
template=out_template, template=out_template,
is_image=is_image, is_image=is_image,
extract_tables=extract_tables, infer_table_structure=infer_table_structure,
include_page_breaks=True, include_page_breaks=True,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
) )
@ -128,7 +129,7 @@ def partition_pdf_or_image(
"detectron2 is not installed. Cannot use the hi_res partitioning " "detectron2 is not installed. Cannot use the hi_res partitioning "
"strategy. Falling back to partitioning with the fast strategy.", "strategy. Falling back to partitioning with the fast strategy.",
) )
if extract_tables: if infer_table_structure:
logger.warning( logger.warning(
"Table extraction was selected, but is being ignored while using the fast " "Table extraction was selected, but is being ignored while using the fast "
"strategy.", "strategy.",
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
file: Optional[bytes] = None, file: Optional[bytes] = None,
template: Optional[str] = None, template: Optional[str] = None,
is_image: bool = False, is_image: bool = False,
extract_tables: bool = False, infer_table_structure: bool = False,
include_page_breaks: bool = False, include_page_breaks: bool = False,
ocr_languages: str = "eng", ocr_languages: str = "eng",
) -> List[Element]: ) -> List[Element]:
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
template, template,
is_image=is_image, is_image=is_image,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
extract_tables=extract_tables, extract_tables=infer_table_structure,
) )
else: else:
layout = process_data_with_model( layout = process_data_with_model(
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
template, template,
is_image=is_image, is_image=is_image,
ocr_languages=ocr_languages, ocr_languages=ocr_languages,
extract_tables=extract_tables, extract_tables=infer_table_structure,
) )
return document_to_element_list(layout, include_page_breaks=include_page_breaks) return document_to_element_list(layout, include_page_breaks=include_page_breaks)