mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 11:10:22 +00:00
Chore: Pass table support param to partition image (#973)
* add param and test in image table extraction * version and changelog * need to publish this one for api repo * add new param skip_infer_table_types * use warning * clean up with mapping * add test for tsv * fix test fail * weird change from merge * doc nit * don't use mapping * correct conflict
This commit is contained in:
parent
e7f2f1e3eb
commit
d46c1c2d83
@ -1,7 +1,8 @@
|
||||
## 0.8.5-dev2
|
||||
## 0.8.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add parameter `skip_infer_table_types` to enable (or skip) table extraction for other document types
|
||||
* Adds optional Unstructured API unit tests in CI
|
||||
* Tracks last modified date for all document types.
|
||||
|
||||
|
||||
BIN
example-docs/layout-parser-paper-with-table.jpg
Normal file
BIN
example-docs/layout-parser-paper-with-table.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 162 KiB |
@ -69,7 +69,7 @@ def test_get_file_info(tmpdir):
|
||||
assert isinstance(file_info, pd.DataFrame)
|
||||
assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"}
|
||||
|
||||
means = file_info.groupby("filetype").mean()
|
||||
means = file_info.groupby("filetype").mean(numeric_only=True)
|
||||
assert means.columns.to_list() == ["filesize"]
|
||||
|
||||
|
||||
|
||||
@ -771,6 +771,15 @@ def test_auto_partition_csv_from_filename(filename="example-docs/stanley-cups.cs
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_tsv_from_filename(filename="example-docs/stanley-cups.tsv"):
    """Partition the example TSV by filename and check the first element.

    The first element's whitespace-normalized text must equal EXPECTED_TEXT,
    its HTML rendering must equal EXPECTED_TABLE, and its detected filetype
    must be "text/tsv".
    """
    first_element = partition(filename=filename)[0]
    normalized_text = clean_extra_whitespace(first_element.text)

    assert normalized_text == EXPECTED_TEXT
    assert first_element.metadata.text_as_html == EXPECTED_TABLE
    assert first_element.metadata.filetype == "text/tsv"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
||||
with open(filename, "rb") as f:
|
||||
|
||||
@ -118,6 +118,19 @@ def test_partition_image_with_auto_strategy(
|
||||
assert titles[0].text == title
|
||||
|
||||
|
||||
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Check that hi_res image partitioning with table inference yields one HTML table.

    Exactly one element is expected to carry a non-empty ``text_as_html``
    value, and that HTML must contain a known phrase from the example table.
    """
    partitioned = image.partition_image(
        filename=filename,
        strategy="hi_res",
        infer_table_structure=True,
    )

    # Keep only the HTML of elements that actually carry a table rendering.
    html_tables = []
    for element in partitioned:
        if element.metadata.text_as_html:
            html_tables.append(element.metadata.text_as_html)

    assert len(html_tables) == 1
    assert "Layouts of history Japanese documents" in html_tables[0]
|
||||
|
||||
|
||||
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.8.5-dev2" # pragma: no cover
|
||||
__version__ = "0.8.5" # pragma: no cover
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
import io
|
||||
from typing import IO, Callable, Dict, Optional, Tuple
|
||||
from typing import IO, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
@ -47,6 +47,7 @@ def partition(
|
||||
encoding: Optional[str] = None,
|
||||
paragraph_grouper: Optional[Callable[[str], str]] = None,
|
||||
headers: Dict[str, str] = {},
|
||||
skip_infer_table_types: List[str] = ["pdf", "jpg", "png"],
|
||||
ssl_verify: bool = True,
|
||||
ocr_languages: str = "eng",
|
||||
pdf_infer_table_structure: bool = False,
|
||||
@ -82,6 +83,8 @@ def partition(
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
headers
|
||||
The headers to be used in conjunction with the HTTP request if URL is set.
|
||||
skip_infer_table_types
|
||||
The document types for which table extraction should be skipped.
|
||||
ssl_verify
|
||||
If the URL parameter is set, determines whether or not partition uses SSL verification
|
||||
in the HTTP request.
|
||||
@ -123,6 +126,12 @@ def partition(
|
||||
if file is not None:
|
||||
file.seek(0)
|
||||
|
||||
infer_table_structure = decide_table_extraction(
|
||||
filetype,
|
||||
skip_infer_table_types,
|
||||
pdf_infer_table_structure,
|
||||
)
|
||||
|
||||
if filetype == FileType.DOC:
|
||||
elements = partition_doc(filename=filename, file=file, **kwargs)
|
||||
elif filetype == FileType.DOCX:
|
||||
@ -183,7 +192,7 @@ def partition(
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
infer_table_structure=pdf_infer_table_structure,
|
||||
infer_table_structure=infer_table_structure,
|
||||
strategy=strategy,
|
||||
ocr_languages=ocr_languages,
|
||||
**kwargs,
|
||||
@ -194,6 +203,7 @@ def partition(
|
||||
file=file, # type: ignore
|
||||
url=None,
|
||||
include_page_breaks=include_page_breaks,
|
||||
infer_table_structure=infer_table_structure,
|
||||
strategy=strategy,
|
||||
ocr_languages=ocr_languages,
|
||||
**kwargs,
|
||||
@ -274,3 +284,22 @@ def file_and_type_from_url(
|
||||
|
||||
filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
|
||||
return file, filetype
|
||||
|
||||
|
||||
def decide_table_extraction(
    filetype: Optional[FileType],
    skip_infer_table_types: List[str],
    pdf_infer_table_structure: bool,
) -> bool:
    """Decide whether table structure should be inferred for a document.

    Parameters
    ----------
    filetype
        The detected file type, or None. Its lower-cased ``name`` is the key
        looked up in ``skip_infer_table_types``.
    skip_infer_table_types
        Lower-case document type names for which table extraction is skipped.
    pdf_infer_table_structure
        PDF-only flag; when True it forces table extraction for PDFs even if
        "pdf" is listed in ``skip_infer_table_types`` (a warning is logged
        about the conflicting settings).

    Returns
    -------
    bool
        True if table extraction should run for this document type.
    """
    doc_type = filetype.name.lower() if filetype else None
    is_skipped = doc_type in skip_infer_table_types

    # Only PDFs honor the pdf_infer_table_structure override.
    if doc_type != "pdf":
        return not is_skipped

    if is_skipped and pdf_infer_table_structure:
        # The message fragments carry explicit separating spaces; the previous
        # adjacent f-strings ran the values and words together.
        logger.warning(
            "Conflict between variables skip_infer_table_types: %s "
            "and pdf_infer_table_structure: %s, "
            "please reset skip_infer_table_types to turn on table extraction for PDFs.",
            skip_infer_table_types,
            pdf_infer_table_structure,
        )

    return not is_skipped or pdf_infer_table_structure
|
||||
|
||||
@ -10,6 +10,7 @@ def partition_image(
|
||||
filename: str = "",
|
||||
file: Optional[bytes] = None,
|
||||
include_page_breaks: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
strategy: str = "auto",
|
||||
metadata_date: Optional[str] = None,
|
||||
@ -23,6 +24,15 @@ def partition_image(
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object as bytes --> open(filename, "rb").
|
||||
include_page_breaks
|
||||
If True, includes page breaks at the end of each page in the document.
|
||||
infer_table_structure
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any Table elements that are extracted will also have a metadata field
|
||||
named "text_as_html" where the table's text content is rendered into an html string.
|
||||
I.e., rows and cells are preserved.
|
||||
Whether True or False, the "text" field is always present in any Table element
|
||||
and is the text content of the table (no structure).
|
||||
ocr_languages
|
||||
The languages to use for the Tesseract agent. To use a language, you'll first need
|
||||
to install the appropriate Tesseract language pack.
|
||||
@ -44,6 +54,7 @@ def partition_image(
|
||||
file=file,
|
||||
is_image=True,
|
||||
include_page_breaks=include_page_breaks,
|
||||
infer_table_structure=infer_table_structure,
|
||||
ocr_languages=ocr_languages,
|
||||
strategy=strategy,
|
||||
metadata_date=metadata_date,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user