mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 20:57:50 +00:00
chore: change table param name (#513)
Updated the parameter names that control whether we try to infer table structure.
This commit is contained in:
parent
ba59ad6b3a
commit
5b6640a55a
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.6.1
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
* Updated the table extraction parameter name to be more descriptive
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.6.0
|
## 0.6.0
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
|
|||||||
with patch(
|
with patch(
|
||||||
"unstructured_inference.inference.layout.process_file_with_model",
|
"unstructured_inference.inference.layout.process_file_with_model",
|
||||||
) as mock_process_file_with_model:
|
) as mock_process_file_with_model:
|
||||||
partition(filename, pdf_extract_tables=True)
|
partition(filename, pdf_infer_table_structure=True)
|
||||||
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
||||||
|
|
||||||
|
|
||||||
@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
|
|||||||
url=None,
|
url=None,
|
||||||
include_page_breaks=False,
|
include_page_breaks=False,
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
extract_tables=False,
|
infer_table_structure=False,
|
||||||
strategy="fast",
|
strategy="fast",
|
||||||
ocr_languages="eng",
|
ocr_languages="eng",
|
||||||
)
|
)
|
||||||
|
@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
|
|||||||
with mock.patch(
|
with mock.patch(
|
||||||
"unstructured_inference.inference.layout.process_file_with_model",
|
"unstructured_inference.inference.layout.process_file_with_model",
|
||||||
) as mock_process_file_with_model:
|
) as mock_process_file_with_model:
|
||||||
pdf.partition_pdf(filename, extract_tables=True)
|
pdf.partition_pdf(filename, infer_table_structure=True)
|
||||||
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
assert mock_process_file_with_model.call_args[1]["extract_tables"]
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.6.0" # pragma: no cover
|
__version__ = "0.6.1" # pragma: no cover
|
||||||
|
@ -35,7 +35,7 @@ def partition(
|
|||||||
headers: Dict[str, str] = {},
|
headers: Dict[str, str] = {},
|
||||||
ssl_verify: bool = True,
|
ssl_verify: bool = True,
|
||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
pdf_extract_tables: bool = False,
|
pdf_infer_table_structure: bool = False,
|
||||||
):
|
):
|
||||||
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
"""Partitions a document into its constituent elements. Will use libmagic to determine
|
||||||
the file's type and route it to the appropriate partitioning function. Applies the default
|
the file's type and route it to the appropriate partitioning function. Applies the default
|
||||||
@ -71,9 +71,11 @@ def partition(
|
|||||||
ocr_languages
|
ocr_languages
|
||||||
The languages to use for the Tesseract agent. To use a language, you'll first need
|
The languages to use for the Tesseract agent. To use a language, you'll first need
|
||||||
to install the appropriate Tesseract language pack.
|
to install the appropriate Tesseract language pack.
|
||||||
pdf_extract_tables
|
pdf_infer_table_structure
|
||||||
If True, in the case that the file to be processed is detected to be a PDF, any tables that
|
If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
|
||||||
are detected will be extracted.
|
additional metadata field, "text_as_html," where the value (string) is just a
|
||||||
|
transformation of the data into an HTML <table>.
|
||||||
|
The "text" field for a partitioned Table Element is always present, whether True or False.
|
||||||
"""
|
"""
|
||||||
exactly_one(file=file, filename=filename, url=url)
|
exactly_one(file=file, filename=filename, url=url)
|
||||||
|
|
||||||
@ -134,7 +136,7 @@ def partition(
|
|||||||
url=None,
|
url=None,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
extract_tables=pdf_extract_tables,
|
infer_table_structure=pdf_infer_table_structure,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
)
|
)
|
||||||
|
@ -22,7 +22,7 @@ def partition_pdf(
|
|||||||
token: Optional[str] = None,
|
token: Optional[str] = None,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
strategy: str = "hi_res",
|
strategy: str = "hi_res",
|
||||||
extract_tables: bool = False,
|
infer_table_structure: bool = False,
|
||||||
encoding: str = "utf-8",
|
encoding: str = "utf-8",
|
||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
@ -45,12 +45,13 @@ def partition_pdf(
|
|||||||
The strategy to use for partitioning the PDF. Uses a layout detection model if set
|
The strategy to use for partitioning the PDF. Uses a layout detection model if set
|
||||||
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
|
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
|
||||||
and processes it.
|
and processes it.
|
||||||
extract_tables
|
infer_table_structure
|
||||||
If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
|
Only applicable if `strategy=hi_res`.
|
||||||
is True or False, the partitioning process will attempt to identify any tables in the
|
If True, any Table elements that are extracted will also have a metadata field
|
||||||
document. This parameter indicates that the partitioning process will attempt to extract the
|
named "text_as_html" where the table's text content is rendered into an html string.
|
||||||
structure of any identified tables. The table structure and cell contents will be stored as
|
I.e., rows and cells are preserved.
|
||||||
HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
|
Whether True or False, the "text" field is always present in any Table element
|
||||||
|
and is the text content of the table (no structure).
|
||||||
encoding
|
encoding
|
||||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||||
ocr_languages
|
ocr_languages
|
||||||
@ -66,7 +67,7 @@ def partition_pdf(
|
|||||||
token=token,
|
token=token,
|
||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
extract_tables=extract_tables,
|
infer_table_structure=infer_table_structure,
|
||||||
encoding=encoding,
|
encoding=encoding,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
)
|
)
|
||||||
@ -81,7 +82,7 @@ def partition_pdf_or_image(
|
|||||||
is_image: bool = False,
|
is_image: bool = False,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
strategy: str = "hi_res",
|
strategy: str = "hi_res",
|
||||||
extract_tables: bool = False,
|
infer_table_structure: bool = False,
|
||||||
encoding: str = "utf-8",
|
encoding: str = "utf-8",
|
||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
@ -117,7 +118,7 @@ def partition_pdf_or_image(
|
|||||||
file=file,
|
file=file,
|
||||||
template=out_template,
|
template=out_template,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
extract_tables=extract_tables,
|
infer_table_structure=infer_table_structure,
|
||||||
include_page_breaks=True,
|
include_page_breaks=True,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
)
|
)
|
||||||
@ -128,7 +129,7 @@ def partition_pdf_or_image(
|
|||||||
"detectron2 is not installed. Cannot use the hi_res partitioning "
|
"detectron2 is not installed. Cannot use the hi_res partitioning "
|
||||||
"strategy. Falling back to partitioning with the fast strategy.",
|
"strategy. Falling back to partitioning with the fast strategy.",
|
||||||
)
|
)
|
||||||
if extract_tables:
|
if infer_table_structure:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Table extraction was selected, but is being ignored while using the fast "
|
"Table extraction was selected, but is being ignored while using the fast "
|
||||||
"strategy.",
|
"strategy.",
|
||||||
@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
|
|||||||
file: Optional[bytes] = None,
|
file: Optional[bytes] = None,
|
||||||
template: Optional[str] = None,
|
template: Optional[str] = None,
|
||||||
is_image: bool = False,
|
is_image: bool = False,
|
||||||
extract_tables: bool = False,
|
infer_table_structure: bool = False,
|
||||||
include_page_breaks: bool = False,
|
include_page_breaks: bool = False,
|
||||||
ocr_languages: str = "eng",
|
ocr_languages: str = "eng",
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
@ -204,7 +205,7 @@ def _partition_pdf_or_image_local(
|
|||||||
template,
|
template,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
extract_tables=extract_tables,
|
extract_tables=infer_table_structure,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
layout = process_data_with_model(
|
layout = process_data_with_model(
|
||||||
@ -212,7 +213,7 @@ def _partition_pdf_or_image_local(
|
|||||||
template,
|
template,
|
||||||
is_image=is_image,
|
is_image=is_image,
|
||||||
ocr_languages=ocr_languages,
|
ocr_languages=ocr_languages,
|
||||||
extract_tables=extract_tables,
|
extract_tables=infer_table_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user