Extract coordinates from PDFs and images when using OCR only strategy (#1163)

### Summary
Closes #983 
Creates new function `add_pytesseract_bbox_to_elements`
Fixes typos in docstrings

### Testing
```
from unstructured.partition.image import partition_image
from PIL import Image, ImageDraw

png_filename="example-docs/english-and-korean.png"
png_elements = partition_image(filename=png_filename, strategy="ocr_only")
png_image = Image.open(png_filename)
draw = ImageDraw.Draw(png_image)
draw.polygon(png_elements[0].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[1].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[2].metadata.coordinates.points, outline="red", width=2)
output = "example-docs/english-and-korean-box.png"
png_image.save(output)
png_image.close()
```
This commit is contained in:
John 2023-08-25 00:32:12 -05:00 committed by GitHub
parent c578b85699
commit 5872fa23c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 121 additions and 9 deletions

View File

@ -7,6 +7,8 @@
### Features
* Extract coordinates from PDFs and images when using OCR only strategy and add to metadata
### Fixes
* Update `partition_html` to respect the order of `<pre>` tags.

View File

@ -7,7 +7,6 @@ from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout
from unstructured.documents.elements import Title
from unstructured.partition import image, pdf
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -194,7 +193,7 @@ def test_partition_image_with_ocr_detects_korean():
strategy="ocr_only",
)
assert elements[0] == Title("RULES AND INSTRUCTIONS")
assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
@ -207,7 +206,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
strategy="ocr_only",
)
assert elements[0] == Title("RULES AND INSTRUCTIONS")
assert elements[0].text == "RULES AND INSTRUCTIONS"
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
@ -378,3 +377,17 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
)
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_image_with_ocr_has_coordinates_from_file(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """The first element of an OCR-only image partition carries bounding-box coordinates."""
    # Pin the last-modified lookup so the run does not depend on the fixture file's mtime.
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )

    elements = image.partition_image(filename=filename, strategy="ocr_only")

    # Truncate to ints: the exact sub-pixel values are not what this test is about.
    first_points = elements[0].metadata.coordinates.points
    int_coordinates = []
    for x, y in first_points:
        int_coordinates.append((int(x), int(y)))

    expected = [(14, 36), (14, 16), (381, 16), (381, 36)]
    assert int_coordinates == expected

View File

@ -768,3 +768,31 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
)
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_pdf_with_ocr_has_coordinates_from_filename(
    filename="example-docs/chevron-page.pdf",
):
    """OCR-only partitioning of a PDF given by path attaches coordinates to the first element."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    elements = pdf.partition_pdf(filename=filename, strategy="ocr_only")
    first_element = elements[0]

    assert first_element.metadata.coordinates.points == expected_points
def test_partition_pdf_with_ocr_has_coordinates_from_file(
    filename="example-docs/chevron-page.pdf",
):
    """OCR-only partitioning of a PDF given as a file object attaches coordinates."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    with open(filename, "rb") as pdf_file:
        elements = pdf.partition_pdf(file=pdf_file, strategy="ocr_only")

    first_element = elements[0]
    assert first_element.metadata.coordinates.points == expected_points

View File

@ -80,15 +80,15 @@ class RelativeCoordinateSystem(CoordinateSystem):
class PixelSpace(CoordinateSystem):
    """Coordinate system representing a pixel space, such as an image. The origin is at the top
    left."""

    # Screen orientation, as opposed to the Cartesian orientation used for point spaces.
    orientation = Orientation.SCREEN
class PointSpace(CoordinateSystem):
    """Coordinate system representing a point space, such as a pdf. The origin is at the bottom
    left."""

    # Cartesian orientation, as opposed to the screen orientation used for pixel spaces.
    orientation = Orientation.CARTESIAN

View File

@ -11,7 +11,7 @@ from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
from pdfminer.utils import open_filename
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.coordinates import PixelSpace, PointSpace
from unstructured.documents.elements import (
CoordinatesMetadata,
Element,
@ -460,6 +460,66 @@ def convert_pdf_to_images(
yield image
def add_pytesseract_bbox_to_elements(elements, bboxes: str, width: int, height: int):
    """
    Get the bounding box of each element and add it to element.metadata.coordinates

    Character boxes are consumed in order: each element takes as many boxes as it has
    non-whitespace characters, and its bounding box is the extent of those boxes. The box
    values are read as PointSpace coordinates and converted to a PixelSpace of the same
    width and height before being stored.

    Args:
        elements: elements containing text detected by pytesseract.image_to_string.
            They are modified in place.
        bboxes (str): The return value of pytesseract.image_to_boxes, i.e. one
            "<char> <x1> <y1> <x2> <y2> <page>" line per recognized character.
        width: width of the OCR'd image, used to build both coordinate systems.
        height: height of the OCR'd image, used to build both coordinate systems.

    Returns:
        The same `elements` object that was passed in. Elements for which no character
        boxes are available are left without new coordinates.

    Raises:
        ValueError: if a non-blank line of `bboxes` does not have exactly six fields or
            its coordinate fields are not integers.
    """
    # (NOTE) jennings: This function was written with pytesseract in mind, but
    # paddle returns similar values via `ocr.ocr(img)`.
    # See more at issue #1176: https://github.com/Unstructured-IO/unstructured/issues/1176
    point_space = PointSpace(
        width=width,
        height=height,
    )
    pixel_space = PixelSpace(
        width=width,
        height=height,
    )

    # Drop blank lines so an empty OCR result yields no boxes instead of one unparsable "".
    boxes = [line for line in bboxes.split("\n") if line.strip()]

    start = 0
    for element in elements:
        # Whitespace of any kind has no character box, so strip all of it (not only " ")
        # to keep the element text and the box list aligned.
        char_count = len("".join(element.text.split()))
        char_boxes = boxes[start : start + char_count]  # noqa
        start += char_count

        if not char_boxes:
            # Nothing to measure (empty text, or the boxes ran out): writing the
            # inf/0 sentinels as coordinates would be meaningless, so skip.
            continue

        min_x = float("inf")
        min_y = float("inf")
        max_x = 0
        max_y = 0
        for box in char_boxes:
            _, x1, y1, x2, y2, _ = box.split()
            x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
            min_x = min(min_x, x1)
            min_y = min(min_y, y1)
            max_x = max(max_x, x2)
            max_y = max(max_y, y2)

        corners = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
        # Kept as a list: callers compare `coordinates.points` against lists of tuples.
        converted_points = [
            tuple(point_space.convert_coordinates_to_new_system(pixel_space, x, y))
            for x, y in corners
        ]

        element.metadata.coordinates = CoordinatesMetadata(
            points=converted_points,
            system=pixel_space,
        )

    return elements
@requires_dependencies("pytesseract")
def _partition_pdf_or_image_with_ocr(
filename: str = "",
@ -471,7 +531,7 @@ def _partition_pdf_or_image_with_ocr(
min_partition: Optional[int] = 0,
metadata_last_modified: Optional[str] = None,
):
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
"""Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
to an image prior to processing."""
import pytesseract
@ -479,14 +539,19 @@ def _partition_pdf_or_image_with_ocr(
if file is not None:
image = PIL.Image.open(file)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
else:
image = PIL.Image.open(filename)
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(filename, config=f"-l '{ocr_languages}'")
elements = partition_text(
text=text,
max_partition=max_partition,
min_partition=min_partition,
metadata_last_modified=metadata_last_modified,
)
width, height = image.size
add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
else:
elements = []
@ -499,6 +564,8 @@ def _partition_pdf_or_image_with_ocr(
last_modified=metadata_last_modified,
)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
width, height = image.size
_elements = partition_text(
text=text,
@ -509,6 +576,8 @@ def _partition_pdf_or_image_with_ocr(
element.metadata = metadata
elements.append(element)
add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
if include_page_breaks:
elements.append(PageBreak(text=""))
return elements