Yao You 5f5ff6319f
fix: consider text in cid code as invalid in hi_res (#2259)
This PR addresses
[CORE-2969](https://unstructured-ai.atlassian.net/browse/CORE-2969)
- pdfminer sometimes fails to decode text in a PDF file and returns cid
codes as text
- such text is now considered invalid and is replaced with OCR
results in `hi_res` mode

## test

This PR adds unit tests for the utility functions. In addition, the file
below returns elements with text in cid code on main but proper
ASCII text with this PR:


[005-CISA-AA22-076-Strengthening-Cybersecurity-p1-p4.pdf](https://github.com/Unstructured-IO/unstructured/files/13662984/005-CISA-AA22-076-Strengthening-Cybersecurity-p1-p4.pdf)

This change improves both cct accuracy and %missing scores:

**before:**
```
metric       average sample_sd population_sd count
--------------------------------------------------
cct-accuracy 0.681   0.267     0.266         105
cct-%missing 0.086   0.159     0.159         105
```

**after:**
```
metric       average sample_sd population_sd count
--------------------------------------------------
cct-accuracy 0.697   0.251     0.250         105
cct-%missing 0.071   0.123     0.122         105
```

[CORE-2969]:
https://unstructured-ai.atlassian.net/browse/CORE-2969?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
2023-12-14 06:49:23 +00:00

127 lines
4.4 KiB
Python

import os
import tempfile
import numpy as np
import pytest
from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image
from unstructured.partition.pdf_image import pdf_image_utils
@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    """Check that ``write_image`` produces a readable file for each input type.

    ``image_type`` selects the input passed to ``write_image``: a 50x50 RGB
    PIL image or a 50x50x3 all-zero ``uint8`` numpy array.
    """
    mock_pil_image = PILImg.new("RGB", (50, 50))
    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)

    image_map = {
        "pil": mock_pil_image,
        "numpy_array": mock_numpy_image,
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)

        assert os.path.exists(output_image_path)

        # Additional check to see if the written image can be read.
        # ``PILImg.open`` is lazy and keeps the file handle open, so decode the
        # pixel data explicitly and release the handle before the temporary
        # directory is cleaned up.
        with PILImg.open(output_image_path) as read_image:
            read_image.load()
            assert read_image is not None
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
):
    """Check the element type returned by ``convert_pdf_to_image``.

    The PDF is supplied either by path (``file_mode == "filename"``) or as a
    binary file object. With ``path_only`` set the first result must be a
    ``str``; otherwise it must be a PIL image.
    """
    expected_type = str if path_only else PILImg.Image

    with tempfile.TemporaryDirectory() as output_folder:
        if file_mode != "filename":
            # File-object mode: pass an empty filename and the open stream.
            with open(filename, "rb") as pdf_stream:
                pages = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=pdf_stream,
                    output_folder=output_folder,
                    path_only=path_only,
                )
        else:
            pages = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=output_folder,
                path_only=path_only,
            )

        assert isinstance(pages[0], expected_type)
def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
    """Check that ``save_elements`` records an existing image file per element.

    Three ``Image`` elements on page 1 are passed in. Afterwards each
    element's ``metadata.image_path`` must point at an existing file named
    ``figure-<page_number>-<1-based index>.jpg`` in the output directory.
    """
    # (text, corner points) for each image element, in order.
    image_specs = [
        (
            "3",
            (
                (78.7401411111111, 86.61545694444455),
                (78.7401411111111, 519.9487805555556),
                (512.0734647222223, 519.9487805555556),
                (512.0734647222223, 86.61545694444455),
            ),
        ),
        (
            "4",
            (
                (570.8661397222222, 86.6154566666667),
                (570.8661397222222, 519.6862825000001),
                (1003.9369655555556, 519.6862825000001),
                (1003.9369655555556, 86.6154566666667),
            ),
        ),
        (
            "5",
            (
                (1062.9921808333331, 86.61545694444455),
                (1062.9921808333331, 519.9487805555556),
                (1496.3255044444445, 519.9487805555556),
                (1496.3255044444445, 86.61545694444455),
            ),
        ),
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = str(tmpdir)

        # Each element gets its own coordinate system and metadata instance.
        elements = [
            Image(
                text=label,
                coordinates=corners,
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            )
            for label, corners in image_specs
        ]

        pdf_image_utils.save_elements(
            elements=elements,
            element_category_to_save=ElementType.IMAGE,
            pdf_image_dpi=200,
            filename=filename,
            output_dir_path=output_dir,
        )

        for position, element in enumerate(elements, start=1):
            page_number = element.metadata.page_number
            expected_image_path = os.path.join(
                output_dir, f"figure-{page_number}-{position}.jpg"
            )
            saved_path = element.metadata.image_path
            assert os.path.isfile(saved_path)
            assert saved_path == expected_image_path
def test_write_image_raises_error():
    """Check that ``write_image`` raises ``ValueError`` when given a string as the image."""
    unsupported_image = "invalid_type"
    target_path = "test_image.jpg"

    with pytest.raises(ValueError):
        pdf_image_utils.write_image(unsupported_image, target_path)
@pytest.mark.parametrize(
    ("text", "outcome"),
    [
        ("", False),
        ("foo", True),
        (None, False),
        ("(cid:10)boo", False),
    ],
)
def test_valid_text(text, outcome):
    """Check ``valid_text`` against empty, plain, ``None`` and cid-code inputs."""
    result = pdf_image_utils.valid_text(text)
    assert result == outcome