mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 15:37:58 +00:00
fix: Don't call extractable_elements if strategy is ocr_only (#1160)
- Fixes #1079, where partitioning happened twice when `strategy="ocr_only"`.
- Only calls `extractable_elements` when the strategy resolved up front is not `ocr_only`, even if `ocr_only` was the requested strategy.
- Adds a test assertion that `_partition_pdf_or_image_with_ocr` is not called when falling back to `fast` from `ocr_only`.
This commit is contained in:
parent
e9c649224e
commit
1ddf542e14
@ -6,6 +6,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Fix bug in `partition_pdf_or_image` where two partitions were called if `strategy == "ocr_only"`.
|
||||||
|
|
||||||
## 0.10.5
|
## 0.10.5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -325,10 +325,14 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
|
|||||||
pdf,
|
pdf,
|
||||||
"extractable_elements",
|
"extractable_elements",
|
||||||
return_value=mock_return,
|
return_value=mock_return,
|
||||||
) as mock_partition:
|
) as mock_partition, mock.patch.object(
|
||||||
|
pdf,
|
||||||
|
"_partition_pdf_or_image_with_ocr",
|
||||||
|
) as mock_partition_ocr:
|
||||||
pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
|
pdf.partition_pdf(filename=filename, url=None, strategy="ocr_only")
|
||||||
|
|
||||||
mock_partition.assert_called_once()
|
mock_partition.assert_called_once()
|
||||||
|
mock_partition_ocr.assert_not_called()
|
||||||
assert "pytesseract is not installed" in caplog.text
|
assert "pytesseract is not installed" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -155,7 +155,18 @@ def partition_pdf_or_image(
|
|||||||
file=file,
|
file=file,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
)
|
)
|
||||||
if not is_image:
|
|
||||||
|
if (
|
||||||
|
not is_image
|
||||||
|
and determine_pdf_or_image_strategy(
|
||||||
|
strategy,
|
||||||
|
filename=filename,
|
||||||
|
file=file,
|
||||||
|
is_image=is_image,
|
||||||
|
infer_table_structure=infer_table_structure,
|
||||||
|
)
|
||||||
|
!= "ocr_only"
|
||||||
|
):
|
||||||
extracted_elements = extractable_elements(
|
extracted_elements = extractable_elements(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=spooled_to_bytes_io_if_needed(file),
|
file=spooled_to_bytes_io_if_needed(file),
|
||||||
@ -209,6 +220,7 @@ def partition_pdf_or_image(
|
|||||||
min_partition=min_partition,
|
min_partition=min_partition,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
)
|
)
|
||||||
|
|
||||||
return layout_elements
|
return layout_elements
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user