mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 12:49:12 +00:00
chore: change table param name (#513)
Updated the parameter names that control whether we try to infer table structure.
This commit is contained in:
parent
ba59ad6b3a
commit
5b6640a55a
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.6.1
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Updated the table extraction parameter name to be more descriptive
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.6.0
|
||||
|
||||
### Enhancements
|
||||
|
@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
|
||||
with patch(
|
||||
"unstructured_inference.inference.layout.process_file_with_model",
|
||||
) as mock_process_file_with_model:
|
||||
partition(filename, pdf_extract_tables=True)
|
||||
partition(filename, pdf_infer_table_structure=True)
|
||||
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
||||
|
||||
|
||||
@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
|
||||
url=None,
|
||||
include_page_breaks=False,
|
||||
encoding="utf-8",
|
||||
extract_tables=False,
|
||||
infer_table_structure=False,
|
||||
strategy="fast",
|
||||
ocr_languages="eng",
|
||||
)
|
||||
|
@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
|
||||
with mock.patch(
|
||||
"unstructured_inference.inference.layout.process_file_with_model",
|
||||
) as mock_process_file_with_model:
|
||||
pdf.partition_pdf(filename, extract_tables=True)
|
||||
pdf.partition_pdf(filename, infer_table_structure=True)
|
||||
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.6.0" # pragma: no cover
|
||||
__version__ = "0.6.1" # pragma: no cover
|
||||
|
@ -35,7 +35,7 @@ def partition(
|
||||
headers: Dict[str, str] = {},
|
||||
ssl_verify: bool = True,
|
||||
ocr_languages: str = "eng",
|
||||
pdf_extract_tables: bool = False,
|
||||
pdf_infer_table_structure: bool = False,
|
||||
):
|
||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||
@ -71,9 +71,11 @@ def partition(
|
||||
ocr_languages
|
||||
The languages to use for the Tesseract agent. To use a language, you'll first need
|
||||
to install the appropriate Tesseract language pack.
|
||||
pdf_extract_tables
|
||||
If True, in the case that the file to be processed is detected to be a PDF, any tables that
|
||||
are detected will be extracted.
|
||||
pdf_infer_table_structure
|
||||
If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
|
||||
additional metadata field, "text_as_html", where the value (string) is just a
|
||||
transformation of the data into an HTML <table>.
|
||||
The "text" field for a partitioned Table Element is always present, whether True or False.
|
||||
"""
|
||||
exactly_one(file=file, filename=filename, url=url)
|
||||
|
||||
@ -134,7 +136,7 @@ def partition(
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
encoding=encoding,
|
||||
extract_tables=pdf_extract_tables,
|
||||
infer_table_structure=pdf_infer_table_structure,
|
||||
strategy=strategy,
|
||||
ocr_languages=ocr_languages,
|
||||
)
|
||||
|
@ -22,7 +22,7 @@ def partition_pdf(
|
||||
token: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
strategy: str = "hi_res",
|
||||
extract_tables: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
encoding: str = "utf-8",
|
||||
ocr_languages: str = "eng",
|
||||
) -> List[Element]:
|
||||
@ -45,12 +45,13 @@ def partition_pdf(
|
||||
The strategy to use for partitioning the PDF. Uses a layout detection model if set
|
||||
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
|
||||
and processes it.
|
||||
extract_tables
|
||||
If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
|
||||
is True or False, the partitioning process will attempt to identify any tables in the
|
||||
document. This parameter indicates that the partitioning process will attempt to extract the
|
||||
structure of any identified tables. The table structure and cell contents will be stored as
|
||||
HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
|
||||
infer_table_structure
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any Table elements that are extracted will also have a metadata field
|
||||
named "text_as_html" where the table's text content is rendered into an html string.
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
ocr_languages
|
||||
@ -66,7 +67,7 @@ def partition_pdf(
|
||||
token=token,
|
||||
include_page_breaks=include_page_breaks,
|
||||
strategy=strategy,
|
||||
extract_tables=extract_tables,
|
||||
infer_table_structure=infer_table_structure,
|
||||
encoding=encoding,
|
||||
ocr_languages=ocr_languages,
|
||||
)
|
||||
@ -81,7 +82,7 @@ def partition_pdf_or_image(
|
||||
is_image: bool = False,
|
||||
include_page_breaks: bool = False,
|
||||
strategy: str = "hi_res",
|
||||
extract_tables: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
encoding: str = "utf-8",
|
||||
ocr_languages: str = "eng",
|
||||
) -> List[Element]:
|
||||
@ -117,7 +118,7 @@ def partition_pdf_or_image(
|
||||
file=file,
|
||||
template=out_template,
|
||||
is_image=is_image,
|
||||
extract_tables=extract_tables,
|
||||
infer_table_structure=infer_table_structure,
|
||||
include_page_breaks=True,
|
||||
ocr_languages=ocr_languages,
|
||||
)
|
||||
@ -128,7 +129,7 @@ def partition_pdf_or_image(
|
||||
"detectron2 is not installed. Cannot use the hi_res partitioning "
|
||||
"strategy. Falling back to partitioning with the fast strategy.",
|
||||
)
|
||||
if extract_tables:
|
||||
if infer_table_structure:
|
||||
logger.warning(
|
||||
"Table extraction was selected, but is being ignored while using the fast "
|
||||
"strategy.",
|
||||
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
|
||||
file: Optional[bytes] = None,
|
||||
template: Optional[str] = None,
|
||||
is_image: bool = False,
|
||||
extract_tables: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
include_page_breaks: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
) -> List[Element]:
|
||||
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
|
||||
template,
|
||||
is_image=is_image,
|
||||
ocr_languages=ocr_languages,
|
||||
extract_tables=extract_tables,
|
||||
extract_tables=infer_table_structure,
|
||||
)
|
||||
else:
|
||||
layout = process_data_with_model(
|
||||
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
|
||||
template,
|
||||
is_image=is_image,
|
||||
ocr_languages=ocr_languages,
|
||||
extract_tables=extract_tables,
|
||||
extract_tables=infer_table_structure,
|
||||
)
|
||||
|
||||
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
||||
|
Loading…
x
Reference in New Issue
Block a user