Chore: Pass table support param to partition image (#973)

* add param and test in image table extraction

* version and changelog

* need to publish this one for api repo

* add new param skip_infer_table_types

* use warning

* clean up with mapping

* add test for tsv

* fix test fail

* weird change from merge

* doc nit

* don't use mapping

* correct conflict
This commit is contained in:
Yuming Long 2023-07-27 13:33:36 -04:00 committed by GitHub
parent e7f2f1e3eb
commit d46c1c2d83
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 68 additions and 5 deletions

View File

@ -1,7 +1,8 @@
## 0.8.5-dev2
## 0.8.5
### Enhancements
* Add parameter `skip_infer_table_types` to `partition` to specify the document types for which table extraction is skipped
* Adds optional Unstructured API unit tests in CI
* Tracks last modified date for all document types.

Binary file not shown.

After

Width:  |  Height:  |  Size: 162 KiB

View File

@ -69,7 +69,7 @@ def test_get_file_info(tmpdir):
assert isinstance(file_info, pd.DataFrame)
assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
means = file_info.groupby("filetype").mean()
means = file_info.groupby("filetype").mean(numeric_only=True)
assert means.columns.to_list() == ["filesize"]

View File

@ -771,6 +771,15 @@ def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.cs
assert elements[0].metadata.filetype == "text/csv"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partition a TSV file by filename and check the first element.

    The first element's whitespace-cleaned text, its HTML table rendering and
    its filetype metadata must match the expected values.
    """
    first_element = partition(filename=filename)[0]
    first_metadata = first_element.metadata

    assert clean_extra_whitespace(first_element.text) == EXPECTED_TEXT
    assert first_metadata.text_as_html == EXPECTED_TABLE
    assert first_metadata.filetype == "text/tsv"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
with open(filename, "rb") as f:

View File

@ -118,6 +118,19 @@ def test_partition_image_with_auto_strategy(
assert titles[0].text == title
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Partition an image with hi_res and infer_table_structure=True.

    Exactly one element must carry a non-empty ``text_as_html`` value, and
    that HTML must contain the expected table text.
    """
    partitioned = image.partition_image(
        filename=filename,
        strategy="hi_res",
        infer_table_structure=True,
    )

    # Collect the HTML rendering of every element that has one.
    html_tables = []
    for element in partitioned:
        html = element.metadata.text_as_html
        if html:
            html_tables.append(html)

    assert len(html_tables) == 1
    assert "Layouts of history Japanese documents" in html_tables[0]
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
with mock.patch.object(
layout,

View File

@ -1 +1 @@
__version__ = "0.8.5-dev2" # pragma: no cover
__version__ = "0.8.5" # pragma: no cover

View File

@ -1,5 +1,5 @@
import io
from typing import IO, Callable, Dict, Optional, Tuple
from typing import IO, Callable, Dict, List, Optional, Tuple
import requests
@ -47,6 +47,7 @@ def partition(
encoding: Optional[str] = None,
paragraph_grouper: Optional[Callable[[str], str]] = None,
headers: Dict[str, str] = {},
skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
ssl_verify: bool = True,
ocr_languages: str = "eng",
pdf_infer_table_structure: bool = False,
@ -82,6 +83,8 @@ def partition(
The encoding method used to decode the text input. If None, utf-8 will be used.
headers
The headers to be used in conjunction with the HTTP request if URL is set.
skip_infer_table_types
The document types that you want to skip table extraction with.
ssl_verify
If the URL parameter is set, determines whether or not partition uses SSL verification
in the HTTP request.
@ -123,6 +126,12 @@ def partition(
if file is not None:
file.seek(0)
infer_table_structure = decide_table_extraction(
filetype,
skip_infer_table_types,
pdf_infer_table_structure,
)
if filetype == FileType.DOC:
elements = partition_doc(filename=filename, file=file, **kwargs)
elif filetype == FileType.DOCX:
@ -183,7 +192,7 @@ def partition(
file=file, # type: ignore
url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=pdf_infer_table_structure,
infer_table_structure=infer_table_structure,
strategy=strategy,
ocr_languages=ocr_languages,
**kwargs,
@ -194,6 +203,7 @@ def partition(
file=file, # type: ignore
url=None,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
strategy=strategy,
ocr_languages=ocr_languages,
**kwargs,
@ -274,3 +284,22 @@ def file_and_type_from_url(
filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
return file, filetype
def decide_table_extraction(
    filetype: Optional[FileType],
    skip_infer_table_types: List[str],
    pdf_infer_table_structure: bool,
) -> bool:
    """Decide whether table structure should be inferred for a document.

    Parameters
    ----------
    filetype
        The detected file type, or None. Its lowercased ``name`` is compared
        against the entries of ``skip_infer_table_types``.
    skip_infer_table_types
        The document types for which table extraction is skipped.
    pdf_infer_table_structure
        Only consulted for PDFs. When True, table extraction is turned on for
        PDFs even if "pdf" is listed in ``skip_infer_table_types``; a warning
        is logged about the conflicting settings in that case.

    Returns
    -------
    bool
        True if table structure should be inferred, False otherwise.
    """
    doc_type = filetype.name.lower() if filetype else None
    is_skipped = doc_type in skip_infer_table_types

    if doc_type != "pdf":
        return not is_skipped

    if is_skipped and pdf_infer_table_structure:
        # Adjacent string literals are concatenated verbatim, so each piece
        # needs its own trailing space to keep the message readable.
        logger.warning(
            f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
            f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
            "please reset skip_infer_table_types to turn on table extraction for PDFs.",
        )
    # The PDF-specific flag wins over the skip list.
    return not is_skipped or pdf_infer_table_structure

View File

@ -10,6 +10,7 @@ def partition_image(
filename: str = "",
file: Optional[bytes] = None,
include_page_breaks: bool = False,
infer_table_structure: bool = False,
ocr_languages: str = "eng",
strategy: str = "auto",
metadata_date: Optional[str] = None,
@ -23,6 +24,15 @@ def partition_image(
A string defining the target filename path.
file
A file-like object as bytes --> open(filename, "rb").
include_page_breaks
If True, includes page breaks at the end of each page in the document.
infer_table_structure
Only applicable if `strategy=hi_res`.
If True, any Table elements that are extracted will also have a metadata field
named "text_as_html" where the table's text content is rendered into an html string.
I.e., rows and cells are preserved.
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need
to install the appropriate Tesseract language pack.
@ -44,6 +54,7 @@ def partition_image(
file=file,
is_image=True,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
strategy=strategy,
metadata_date=metadata_date,