118 lines
3.7 KiB
Python
Raw Normal View History

2022-06-29 14:35:19 -04:00
import pytest
from unittest.mock import patch
import layoutparser as lp
from layoutparser.elements import Layout, Rectangle, TextBlock
import numpy as np
from PIL import Image
from unstructured.documents.pdf import PDFDocument, PDFPage
import unstructured.models.layout.detectron2 as detectron2
import unstructured.models.ocr.tesseract as tesseract
@pytest.fixture
def mock_image():
    """Provide a 1x1 PIL image in mode ``"1"`` for tests that need a page image."""
    size = (1, 1)
    return Image.new("1", size)
@pytest.fixture
def mock_page_layout():
    """Provide a two-block layout: a "Text" block followed by a "Title" block.

    The text block carries a sentence repeated ten times; the title block
    carries a short title. The blocks are listed text-first, title-second.
    """
    body_block = TextBlock(
        Rectangle(2, 4, 6, 8),
        text="A very repetitive narrative. " * 10,
        type="Text",
    )
    heading_block = TextBlock(
        Rectangle(1, 2, 3, 4),
        text="A Catchy Title",
        type="Title",
    )
    return Layout([body_block, heading_block])
def test_pdf_page_converts_images_to_array(mock_image):
    """Check that ``_get_image_array`` returns an ndarray and stores it on the page.

    Args:
        mock_image: fixture supplying the PIL image the page is built from.
    """
    page = PDFPage(number=0, image=mock_image, layout=Layout())
    # No conversion has been requested yet, so nothing is cached on the page.
    assert page.image_array is None

    image_array = page._get_image_array()
    assert isinstance(image_array, np.ndarray)
    # Compare the arrays element-wise. Reducing each side with ``.all()``
    # first would only compare two booleans and pass for unrelated arrays.
    assert np.array_equal(page.image_array, image_array)
def test_ocr(monkeypatch):
    """Check that ``PDFPage.ocr`` returns the text produced by the stubbed OCR agent.

    ``tesseract.ocr_agent`` is replaced with a stub whose ``detect`` returns a
    fixed string, and ``tesseract.is_pytesseract_available`` is forced to
    report ``True``.
    """
    expected_text = "The parrot flies high in the air!"

    class MockOCRAgent:
        """Stub agent whose ``detect`` ignores its arguments."""

        def detect(self, *args):
            return expected_text

    monkeypatch.setattr(tesseract, "ocr_agent", MockOCRAgent)
    monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True)

    pixels = np.random.randint(12, 24, (40, 40))
    page = PDFPage(number=0, image=pixels, layout=Layout())

    # The block starts without text; the page is asked to OCR it.
    empty_block = TextBlock(Rectangle(1, 2, 3, 4), text=None)
    recognised = page.ocr(empty_block)
    assert recognised == expected_text
class MockLayoutModel:
    """Test double for a layout model that always reports a preset layout."""

    def __init__(self, layout):
        """Remember ``layout`` so that ``detect`` can hand it back."""
        self.layout = layout

    def detect(self, *args):
        """Return the stored layout, ignoring any positional arguments."""
        preset = self.layout
        return preset
def test_get_page_elements(monkeypatch, mock_page_layout):
    """Check ``PDFPage.get_elements`` with and without ``inplace``.

    The detectron2 model is replaced by ``MockLayoutModel`` so that no real
    layout detection runs. The title is expected first in the result even
    though the fixture layout lists the text block first.
    """
    stub_model = MockLayoutModel(mock_page_layout)
    monkeypatch.setattr(detectron2, "model", stub_model)
    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)

    pixels = np.random.randint(12, 24, (40, 40))
    page = PDFPage(number=0, image=pixels, layout=mock_page_layout)

    # First call: elements are returned to the caller.
    elements = page.get_elements(inplace=False)
    assert str(elements[0]) == "A Catchy Title"
    assert str(elements[1]).startswith("A very repetitive narrative.")

    # Second call: elements are stored on the page and must match the first result.
    page.get_elements(inplace=True)
    assert elements == page.elements
def test_get_page_elements_with_ocr(monkeypatch):
    """Check that a text-less block ends up with the text returned by ``PDFPage.ocr``.

    ``PDFPage.ocr`` is patched to return a fixed string, the layout holds a
    single "Title" block whose text is ``None``, and the page's string form is
    expected to equal the patched OCR result.
    """
    monkeypatch.setattr(PDFPage, "ocr", lambda *args: "An Even Catchier TItle")

    untitled_block = TextBlock(Rectangle(2, 4, 6, 8), text=None, type="Title")
    layout = Layout([untitled_block])

    monkeypatch.setattr(detectron2, "model", MockLayoutModel(layout))
    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)

    pixels = np.random.randint(12, 24, (40, 40))
    page = PDFPage(number=0, image=pixels, layout=layout)
    page.get_elements()

    rendered = str(page)
    assert rendered == "An Even Catchier TItle"
def test_read_pdf(monkeypatch, mock_page_layout):
    """Check the string form of a two-page ``PDFDocument`` built from mocked inputs.

    ``lp.load_pdf`` is patched to return two copies of the fixture layout and
    two copies of a random image, and the detectron2 model is replaced by
    ``MockLayoutModel``.
    """
    pixels = np.random.randint(12, 24, (40, 40))
    images = [pixels, pixels]
    layouts = Layout([mock_page_layout, mock_page_layout])

    # NOTE(review): this standalone page is not attached to ``doc`` below;
    # confirm whether it is still needed.
    page = PDFPage(number=0, image=pixels, layout=mock_page_layout)

    monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)

    doc = PDFDocument("fake-file.pdf")

    # Presumably the document is loaded through ``lp.load_pdf`` on access, so
    # every check on ``doc`` stays inside the patch -- TODO confirm.
    with patch.object(lp, "load_pdf", return_value=(layouts, images)):
        page.get_elements()

        assert str(doc).startswith("A Catchy Title")
        assert str(doc).count("A Catchy Title") == 2  # Once for each page
        assert str(doc).endswith("A very repetitive narrative. ")

        # The document text is its pages' text joined by blank lines.
        pages = doc.pages
        page_texts = [str(page) for page in pages]
        assert str(doc) == "\n\n".join(page_texts)