mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 10:33:09 +00:00
fix: Don't call extractable_elements if strategy is ocr_only (#1160)
- Fixes #1079, where partitioning happened twice when `strategy="ocr_only"`.
- Only calls `extractable_elements` when we can determine up front that `ocr_only` will not be the strategy actually used, even if it was the requested strategy.
- Adds a test assertion that `_partition_pdf_or_image_with_ocr` is not called when falling back to `fast` from `ocr_only`.
This commit is contained in:
parent
e9c649224e
commit
1ddf542e14
@ -6,6 +6,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fix bug in `partition_pdf_or_image` where the document was partitioned twice if `strategy == "ocr_only"`.
|
||||
|
||||
## 0.10.5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -325,10 +325,14 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
|
||||
pdf,
|
||||
"extractable_elements",
|
||||
return_value=mock_return,
|
||||
) as mock_partition:
|
||||
) as mock_partition, mock.patch.object(
|
||||
pdf,
|
||||
"_partition_pdf_or_image_with_ocr",
|
||||
) as mock_partition_ocr:
|
||||
pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
|
||||
|
||||
mock_partition.assert_called_once()
|
||||
mock_partition_ocr.assert_not_called()
|
||||
assert "pytesseract is not installed" in caplog.text
|
||||
|
||||
|
||||
|
||||
@ -155,7 +155,18 @@ def partition_pdf_or_image(
|
||||
file=file,
|
||||
filename=filename,
|
||||
)
|
||||
if not is_image:
|
||||
|
||||
if (
|
||||
not is_image
|
||||
and determine_pdf_or_image_strategy(
|
||||
strategy,
|
||||
filename=filename,
|
||||
file=file,
|
||||
is_image=is_image,
|
||||
infer_table_structure=infer_table_structure,
|
||||
)
|
||||
!= "ocr_only"
|
||||
):
|
||||
extracted_elements = extractable_elements(
|
||||
filename=filename,
|
||||
file=spooled_to_bytes_io_if_needed(file),
|
||||
@ -209,6 +220,7 @@ def partition_pdf_or_image(
|
||||
min_partition=min_partition,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
)
|
||||
|
||||
return layout_elements
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user