mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-11 16:13:24 +00:00
Extract coordinates from PDFs and images when using OCR only strategy (#1163)
### Summary

Closes #983

* Creates new function `add_pytesseract_bbox_to_elements`
* Fixes typos in docstrings

### Testing

```
from unstructured.partition.image import partition_image
from PIL import Image, ImageDraw

png_filename="example-docs/english-and-korean.png"
png_elements = partition_image(filename=png_filename, strategy="ocr_only")
png_image = Image.open(png_filename)
draw = ImageDraw.Draw(png_image)
draw.polygon(png_elements[0].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[1].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[2].metadata.coordinates.points, outline="red", width=2)
output = "example-docs/english-and-korean-box.png"
png_image.save(output)
png_image.close()
```
This commit is contained in:
parent
c578b85699
commit
5872fa23c3
@ -7,6 +7,8 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* Extract coordinates from PDFs and images when using OCR only strategy and add to metadata
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
* Update `partition_html` to respect the order of `<pre>` tags.
|
* Update `partition_html` to respect the order of `<pre>` tags.
|
||||||
|
@ -7,7 +7,6 @@ from PIL import Image
|
|||||||
from pytesseract import TesseractError
|
from pytesseract import TesseractError
|
||||||
from unstructured_inference.inference import layout
|
from unstructured_inference.inference import layout
|
||||||
|
|
||||||
from unstructured.documents.elements import Title
|
|
||||||
from unstructured.partition import image, pdf
|
from unstructured.partition import image, pdf
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
@ -194,7 +193,7 @@ def test_partition_image_with_ocr_detects_korean():
|
|||||||
strategy="ocr_only",
|
strategy="ocr_only",
|
||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0] == Title("RULES AND INSTRUCTIONS")
|
assert elements[0].text == "RULES AND INSTRUCTIONS"
|
||||||
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
||||||
|
|
||||||
|
|
||||||
@ -207,7 +206,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
|
|||||||
strategy="ocr_only",
|
strategy="ocr_only",
|
||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0] == Title("RULES AND INSTRUCTIONS")
|
assert elements[0].text == "RULES AND INSTRUCTIONS"
|
||||||
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
|
||||||
|
|
||||||
|
|
||||||
@ -378,3 +377,17 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_image_with_ocr_has_coordinates_from_file(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """OCR-only partitioning of an image attaches coordinates to the first element.

    The coordinates are truncated to integers before comparison so the check does not
    depend on fractional parts of the converted points.
    """
    # Pin the last-modified lookup to a fixed value for the duration of the test.
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value="2029-07-05T09:24:28",
    )

    partitioned = image.partition_image(filename=filename, strategy="ocr_only")
    first_points = partitioned[0].metadata.coordinates.points
    truncated_points = [(int(px), int(py)) for px, py in first_points]

    expected_points = [(14, 36), (14, 16), (381, 16), (381, 36)]
    assert truncated_points == expected_points
|
||||||
|
@ -768,3 +768,31 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
|
|||||||
)
|
)
|
||||||
|
|
||||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_pdf_with_ocr_has_coordinates_from_filename(
    filename="example-docs/chevron-page.pdf",
):
    """OCR-only partitioning of a PDF given by path attaches coordinates to the first element."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    partitioned = pdf.partition_pdf(filename=filename, strategy="ocr_only")
    first_element = partitioned[0]

    assert first_element.metadata.coordinates.points == expected_points
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_pdf_with_ocr_has_coordinates_from_file(
    filename="example-docs/chevron-page.pdf",
):
    """OCR-only partitioning of a PDF given as an open binary file attaches coordinates
    to the first element."""
    expected_points = [
        (657.0, 2144.0),
        (657.0, 2106.0),
        (1043.0, 2106.0),
        (1043.0, 2144.0),
    ]

    with open(filename, "rb") as pdf_file:
        partitioned = pdf.partition_pdf(file=pdf_file, strategy="ocr_only")

    first_element = partitioned[0]
    assert first_element.metadata.coordinates.points == expected_points
|
||||||
|
@ -80,15 +80,15 @@ class RelativeCoordinateSystem(CoordinateSystem):
|
|||||||
|
|
||||||
|
|
||||||
class PixelSpace(CoordinateSystem):
|
class PixelSpace(CoordinateSystem):
|
||||||
"""Coordinate system representing a pixel space, such as an image. The origin is at the bottom
|
"""Coordinate system representing a pixel space, such as an image. The origin is at the top
|
||||||
right."""
|
left."""
|
||||||
|
|
||||||
orientation = Orientation.SCREEN
|
orientation = Orientation.SCREEN
|
||||||
|
|
||||||
|
|
||||||
class PointSpace(CoordinateSystem):
|
class PointSpace(CoordinateSystem):
|
||||||
"""Coordinate system representing a point space, such as a pdf. The origin is at the top
|
"""Coordinate system representing a point space, such as a pdf. The origin is at the bottom
|
||||||
right."""
|
left."""
|
||||||
|
|
||||||
orientation = Orientation.CARTESIAN
|
orientation = Orientation.CARTESIAN
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
|
|||||||
from pdfminer.utils import open_filename
|
from pdfminer.utils import open_filename
|
||||||
|
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
from unstructured.documents.coordinates import PixelSpace
|
from unstructured.documents.coordinates import PixelSpace, PointSpace
|
||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CoordinatesMetadata,
|
CoordinatesMetadata,
|
||||||
Element,
|
Element,
|
||||||
@ -460,6 +460,66 @@ def convert_pdf_to_images(
|
|||||||
yield image
|
yield image
|
||||||
|
|
||||||
|
|
||||||
|
def add_pytesseract_bbox_to_elements(elements, bboxes, width, height):
    """
    Get the bounding box of each element and add it to element.metadata.coordinates

    Boxes are consumed in order, starting from the first line of `bboxes`: each element
    takes as many boxes as its text has non-whitespace characters, and its bounding box is
    the smallest rectangle enclosing them. `elements` must therefore be exactly the
    elements produced from the image that `bboxes` describes, in the same order.
    Elements for which no boxes remain are left without coordinates.

    Args:
        elements: elements containing text detected by pytesseract.image_to_string.
        bboxes (str): The return value of pytesseract.image_to_boxes.
        width: Width used for both the point-space and pixel-space coordinate systems.
        height: Height used for both the point-space and pixel-space coordinate systems.

    Returns:
        The `elements` that were passed in; their metadata is updated in place.
    """
    # (NOTE) jennings: This function was written with pytesseract in mind, but
    # paddle returns similar values via `ocr.ocr(img)`.
    # See more at issue #1176: https://github.com/Unstructured-IO/unstructured/issues/1176
    point_space = PointSpace(
        width=width,
        height=height,
    )
    pixel_space = PixelSpace(
        width=width,
        height=height,
    )

    # One box record per line. Blank lines (including wholly empty OCR output) carry no
    # box and would otherwise fail the six-field unpacking below.
    boxes = [line for line in bboxes.splitlines() if line.strip()]
    i = 0
    for element in elements:
        # Assumes pytesseract reports one box per non-whitespace character, so newlines
        # and tabs must be discounted just like spaces to keep the window aligned.
        char_count = len("".join(element.text.split()))
        element_boxes = boxes[i : i + char_count]  # noqa
        i += char_count

        if not element_boxes:
            # Nothing to measure: skip instead of emitting a box built from +inf/0.
            continue

        min_x = float("inf")
        min_y = float("inf")
        max_x = 0
        max_y = 0
        for box in element_boxes:
            # Six whitespace-separated fields; the middle four are the box corners.
            _, x1, y1, x2, y2, _ = box.split()
            x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])

            min_x = min(min_x, x1)
            min_y = min(min_y, y1)
            max_x = max(max_x, x2)
            max_y = max(max_y, y2)

        points = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
        converted_points = []
        for x, y in points:
            new_x, new_y = point_space.convert_coordinates_to_new_system(pixel_space, x, y)
            converted_points.append((new_x, new_y))

        element.metadata.coordinates = CoordinatesMetadata(
            points=converted_points,
            system=pixel_space,
        )

    return elements
|
||||||
|
|
||||||
|
|
||||||
@requires_dependencies("pytesseract")
|
@requires_dependencies("pytesseract")
|
||||||
def _partition_pdf_or_image_with_ocr(
|
def _partition_pdf_or_image_with_ocr(
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
@ -471,7 +531,7 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
min_partition: Optional[int] = 0,
|
min_partition: Optional[int] = 0,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
|
"""Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
|
||||||
to an image prior to processing."""
|
to an image prior to processing."""
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
||||||
@ -479,14 +539,19 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
if file is not None:
|
if file is not None:
|
||||||
image = PIL.Image.open(file)
|
image = PIL.Image.open(file)
|
||||||
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
||||||
|
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
|
||||||
else:
|
else:
|
||||||
|
image = PIL.Image.open(filename)
|
||||||
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
|
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
|
||||||
|
bboxes = pytesseract.image_to_boxes(filename, config=f"-l '{ocr_languages}'")
|
||||||
elements = partition_text(
|
elements = partition_text(
|
||||||
text=text,
|
text=text,
|
||||||
max_partition=max_partition,
|
max_partition=max_partition,
|
||||||
min_partition=min_partition,
|
min_partition=min_partition,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
)
|
)
|
||||||
|
width, height = image.size
|
||||||
|
add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
elements = []
|
elements = []
|
||||||
@ -499,6 +564,8 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
last_modified=metadata_last_modified,
|
last_modified=metadata_last_modified,
|
||||||
)
|
)
|
||||||
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
||||||
|
bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
|
||||||
|
width, height = image.size
|
||||||
|
|
||||||
_elements = partition_text(
|
_elements = partition_text(
|
||||||
text=text,
|
text=text,
|
||||||
@ -509,6 +576,8 @@ def _partition_pdf_or_image_with_ocr(
|
|||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
elements.append(element)
|
elements.append(element)
|
||||||
|
|
||||||
|
add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
|
||||||
|
|
||||||
if include_page_breaks:
|
if include_page_breaks:
|
||||||
elements.append(PageBreak(text=""))
|
elements.append(PageBreak(text=""))
|
||||||
return elements
|
return elements
|
||||||
|
Loading…
x
Reference in New Issue
Block a user