Set default strategy for images to be "hi_res" (#968)

Set default strategy for images (not PDFs) to be hi_res.
This commit is contained in:
shreyanid 2023-08-02 09:22:20 -07:00 committed by GitHub
parent 499f37f64b
commit a23d75a292
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 27 deletions

View File

@ -2,6 +2,7 @@
### Enhancements
* Set the default strategy for partitioning images to `hi_res`
* Add page break parameter section in API documentation to sync with change in Prod API
### Features
@ -22,7 +23,6 @@
### Fixes
* Rename "date" field to "last_modified"
* Adds Box connector

View File

@ -376,6 +376,26 @@ def test_partition_pdf_doesnt_raise_warning():
partition(filename=filename, strategy="hi_res")
@pytest.mark.parametrize(
    ("pass_file_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_image_default_strategy_hi_res(pass_file_filename, content_type):
    """Partition the example JPEG with strategy="auto" and check the first element.

    Runs once per (pass_file_filename, content_type) combination. The first
    element must be the paper title and must carry coordinates metadata.
    """
    image_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
    partition_kwargs = {
        "filename": image_path,
        # file_filename is only supplied when the parametrization asks for it.
        "file_filename": image_path if pass_file_filename else None,
        "content_type": content_type,
        "strategy": "auto",
    }
    elements = partition(**partition_kwargs)

    # Expected to match test_partition_image_default_strategy_hi_res() in test_image.py.
    expected = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    leading_element = elements[0]
    assert leading_element.text == expected
    assert leading_element.metadata.coordinates is not None
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],

View File

@ -213,6 +213,16 @@ def test_partition_image_raises_with_bad_strategy():
image.partition_image(filename=filename, strategy="fakeroo")
def test_partition_image_default_strategy_hi_res():
    """Call partition_image without an explicit strategy on the example JPEG.

    The first element must be the paper title and must carry coordinates
    metadata.
    """
    jpg_path = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
    expected = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"

    # No strategy argument is passed, so the function's own default applies.
    with open(jpg_path, "rb") as image_file:
        elements = image.partition_image(file=image_file)

    leading_element = elements[0]
    assert leading_element.text == expected
    assert leading_element.metadata.coordinates is not None
def test_partition_image_metadata_date(
mocker,
filename="example-docs/english-and-korean.png",

View File

@ -43,18 +43,9 @@ def test_is_pdf_text_extractable(filename, from_file, expected):
assert bool(extractable) is expected
@pytest.mark.parametrize(
("infer_table_structure", "expected"),
[
(True, "hi_res"),
(False, "ocr_only"),
],
)
def test_determine_image_auto_strategy(infer_table_structure, expected):
strategy = strategies._determine_image_auto_strategy(
infer_table_structure=infer_table_structure,
)
assert strategy is expected
def test_determine_image_auto_strategy():
    """The image "auto" helper, called with no arguments, must return "hi_res"."""
    assert strategies._determine_image_auto_strategy() == "hi_res"
@pytest.mark.parametrize(
@ -74,9 +65,9 @@ def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structur
assert strategy is expected
def test_determine_pdf_or_image_strategy_fallback_ocr():
def test_determine_pdf_or_image_strategy_fallback_hi_res():
strategy = strategies.determine_pdf_or_image_strategy(
strategy="fast",
is_image=True,
)
assert strategy == "ocr_only"
assert strategy == "hi_res"

View File

@ -12,7 +12,7 @@ def partition_image(
include_page_breaks: bool = False,
infer_table_structure: bool = False,
ocr_languages: str = "eng",
strategy: str = "auto",
strategy: str = "hi_res",
metadata_last_modified: Optional[str] = None,
**kwargs,
) -> List[Element]:
@ -41,11 +41,11 @@ def partition_image(
"ocr_only". When using the "hi_res" strategy, the function uses a layout detection
model to identify document elements. When using the "ocr_only" strategy,
partition_image simply extracts the text from the document using OCR and processes it.
The default strategy `auto` will determine when a image can be extracted using
`ocr_only` mode, otherwise it will fall back to `hi_res`.
The default strategy is `hi_res`.
metadata_last_modified
The last modified date for the document.
"""
exactly_one(filename=filename, file=file)

View File

@ -47,10 +47,9 @@ def determine_pdf_or_image_strategy(
if is_image:
# Note(yuming): There is no fast strategy for images,
# use ocr_only as a fallback plan for consistency with PDFs.
# This can be removed once unstructured-api use auto as the default strategy.
# use hi_res as a fallback plan since it is the auto default.
if strategy == "fast":
strategy = "ocr_only"
strategy = "hi_res"
validate_strategy(strategy, "image")
pdf_text_extractable = False
else:
@ -58,7 +57,7 @@ def determine_pdf_or_image_strategy(
if strategy == "auto":
if is_image:
strategy = _determine_image_auto_strategy(infer_table_structure=infer_table_structure)
strategy = _determine_image_auto_strategy()
else:
strategy = _determine_pdf_auto_strategy(
pdf_text_extractable=pdf_text_extractable,
@ -115,13 +114,11 @@ def determine_pdf_or_image_strategy(
return strategy
def _determine_image_auto_strategy(infer_table_structure: bool = False):
def _determine_image_auto_strategy():
"""If "auto" is passed in as the strategy, determines what strategy to use
for images."""
if infer_table_structure is True:
return "hi_res"
else:
return "ocr_only"
# Use hi_res as the only default since images are only about one page
return "hi_res"
def _determine_pdf_auto_strategy(