mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 15:45:21 +00:00
Set default strategy for images to be "hi_res" (#968)
Set default strategy for images (not PDFs) to be hi_res.
This commit is contained in:
parent
499f37f64b
commit
a23d75a292
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Set the default strategy for partitioning images to `hi_res`
|
||||
* Add page break parameter section in API documentation to sync with change in Prod API
|
||||
|
||||
### Features
|
||||
@ -22,7 +23,6 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
* Rename "date" field to "last_modified"
|
||||
* Adds Box connector
|
||||
|
||||
|
||||
@ -376,6 +376,26 @@ def test_partition_pdf_doesnt_raise_warning():
|
||||
partition(filename=filename, strategy="hi_res")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
)
|
||||
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
|
||||
file_filename = filename if pass_file_filename else None
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
file_filename=file_filename,
|
||||
content_type=content_type,
|
||||
strategy="auto",
|
||||
)
|
||||
|
||||
# should be same result as test_partition_image_default_strategy_hi_res() in test_image.py
|
||||
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
assert elements[0].text == first_line
|
||||
assert elements[0].metadata.coordinates is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_file_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
|
||||
@ -213,6 +213,16 @@ def test_partition_image_raises_with_bad_strategy():
|
||||
image.partition_image(filename=filename, strategy="fakeroo")
|
||||
|
||||
|
||||
def test_partition_image_default_strategy_hi_res():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
|
||||
with open(filename, "rb") as f:
|
||||
elements = image.partition_image(file=f)
|
||||
|
||||
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
assert elements[0].text == first_line
|
||||
assert elements[0].metadata.coordinates is not None
|
||||
|
||||
|
||||
def test_partition_image_metadata_date(
|
||||
mocker,
|
||||
filename="example-docs/english-and-korean.png",
|
||||
|
||||
@ -43,18 +43,9 @@ def test_is_pdf_text_extractable(filename, from_file, expected):
|
||||
assert bool(extractable) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("infer_table_structure", "expected"),
|
||||
[
|
||||
(True, "hi_res"),
|
||||
(False, "ocr_only"),
|
||||
],
|
||||
)
|
||||
def test_determine_image_auto_strategy(infer_table_structure, expected):
|
||||
strategy = strategies._determine_image_auto_strategy(
|
||||
infer_table_structure=infer_table_structure,
|
||||
)
|
||||
assert strategy is expected
|
||||
def test_determine_image_auto_strategy():
|
||||
strategy = strategies._determine_image_auto_strategy()
|
||||
assert strategy == "hi_res"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -74,9 +65,9 @@ def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structur
|
||||
assert strategy is expected
|
||||
|
||||
|
||||
def test_determine_pdf_or_image_strategy_fallback_ocr():
|
||||
def test_determine_pdf_or_image_strategy_fallback_hi_res():
|
||||
strategy = strategies.determine_pdf_or_image_strategy(
|
||||
strategy="fast",
|
||||
is_image=True,
|
||||
)
|
||||
assert strategy == "ocr_only"
|
||||
assert strategy == "hi_res"
|
||||
|
||||
@ -12,7 +12,7 @@ def partition_image(
|
||||
include_page_breaks: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
strategy: str = "auto",
|
||||
strategy: str = "hi_res",
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
@ -41,11 +41,11 @@ def partition_image(
|
||||
"ocr_only". When using the "hi_res" strategy, the function uses a layout detection
|
||||
model to identify document elements. When using the "ocr_only" strategy,
|
||||
partition_image simply extracts the text from the document using OCR and processes it.
|
||||
The default strategy `auto` will determine when an image can be extracted using
|
||||
`ocr_only` mode, otherwise it will fall back to `hi_res`.
|
||||
The default strategy is `hi_res`.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
|
||||
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
|
||||
@ -47,10 +47,9 @@ def determine_pdf_or_image_strategy(
|
||||
|
||||
if is_image:
|
||||
# Note(yuming): There is no fast strategy for images,
|
||||
# use ocr_only as a fallback plan for consistency with PDFs.
|
||||
# This can be removed once unstructured-api use auto as the default strategy.
|
||||
# use hi_res as a fallback plan since it is the auto default.
|
||||
if strategy == "fast":
|
||||
strategy = "ocr_only"
|
||||
strategy = "hi_res"
|
||||
validate_strategy(strategy, "image")
|
||||
pdf_text_extractable = False
|
||||
else:
|
||||
@ -58,7 +57,7 @@ def determine_pdf_or_image_strategy(
|
||||
|
||||
if strategy == "auto":
|
||||
if is_image:
|
||||
strategy = _determine_image_auto_strategy(infer_table_structure=infer_table_structure)
|
||||
strategy = _determine_image_auto_strategy()
|
||||
else:
|
||||
strategy = _determine_pdf_auto_strategy(
|
||||
pdf_text_extractable=pdf_text_extractable,
|
||||
@ -115,13 +114,11 @@ def determine_pdf_or_image_strategy(
|
||||
return strategy
|
||||
|
||||
|
||||
def _determine_image_auto_strategy(infer_table_structure: bool = False):
|
||||
def _determine_image_auto_strategy():
|
||||
"""If "auto" is passed in as the strategy, determines what strategy to use
|
||||
for images."""
|
||||
if infer_table_structure is True:
|
||||
return "hi_res"
|
||||
else:
|
||||
return "ocr_only"
|
||||
# Use hi_res as the only default since an image is only a single page
|
||||
return "hi_res"
|
||||
|
||||
|
||||
def _determine_pdf_auto_strategy(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user