fix: Don't call extractable_elements if strategy is ocr_only (#1160)

- Fixes #1079, where partitioning happened twice when
`strategy="ocr_only"`.
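- Only calls `extractable_elements` when the strategy resolved up front by
`determine_pdf_or_image_strategy` is not `ocr_only`; this still covers the case
where `ocr_only` was requested but cannot be used and a fallback strategy is chosen.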
- Adds a test assertion that `_partition_pdf_or_image_with_ocr`
is not called when falling back to `fast` from `ocr_only`.
This commit is contained in:
Charles 2023-08-23 03:43:33 +01:00 committed by GitHub
parent e9c649224e
commit 1ddf542e14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 20 additions and 2 deletions

View File

@ -6,6 +6,8 @@
### Fixes
* Fix bug in `partition_pdf_or_image` where partitioning was performed twice if `strategy == "ocr_only"`.
## 0.10.5
### Enhancements

View File

@ -325,10 +325,14 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
pdf,
"extractable_elements",
return_value=mock_return,
) as mock_partition:
) as mock_partition, mock.patch.object(
pdf,
"_partition_pdf_or_image_with_ocr",
) as mock_partition_ocr:
pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
mock_partition.assert_called_once()
mock_partition_ocr.assert_not_called()
assert "pytesseract is not installed" in caplog.text

View File

@ -155,7 +155,18 @@ def partition_pdf_or_image(
file=file,
filename=filename,
)
if not is_image:
if (
not is_image
and determine_pdf_or_image_strategy(
strategy,
filename=filename,
file=file,
is_image=is_image,
infer_table_structure=infer_table_structure,
)
!= "ocr_only"
):
extracted_elements = extractable_elements(
filename=filename,
file=spooled_to_bytes_io_if_needed(file),
@ -209,6 +220,7 @@ def partition_pdf_or_image(
min_partition=min_partition,
metadata_last_modified=metadata_last_modified or last_modification_date,
)
return layout_elements