2024-10-14 21:37:14 +00:00
|
|
|
import glob
|
2025-01-29 15:25:10 -08:00
|
|
|
import io
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import unittest
|
2024-10-01 22:10:58 +00:00
|
|
|
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
2025-01-27 18:30:41 +00:00
|
|
|
from olmocr.data.renderpdf import get_pdf_media_box_width_height
|
2025-01-29 15:25:10 -08:00
|
|
|
from olmocr.prompts.anchor import _linearize_pdf_report, _pdf_report, get_anchor_text
|
|
|
|
|
2024-10-07 17:01:59 +00:00
|
|
|
|
2024-10-01 22:10:58 +00:00
|
|
|
class AnchorTest(unittest.TestCase):
    """Exercise anchor-text generation against the PDFs under ``gnarly_pdfs``.

    Most tests print the generated anchor text (for manual inspection in the
    test log) and assert only that its length stays within a bound.
    """

    @staticmethod
    def _gnarly_pdf(filename):
        """Return the path of ``filename`` inside the ``gnarly_pdfs`` fixture dir."""
        return os.path.join(os.path.dirname(__file__), "gnarly_pdfs", filename)

    def testExtractText(self):
        """Run pypdf's ``extract_text`` with visitors on the first page of some_ocr1.pdf.

        There are no assertions; the test passes as long as extraction does
        not raise. Each text fragment is printed by the text visitor.
        """
        local_pdf_path = self._gnarly_pdf("some_ocr1.pdf")
        reader = PdfReader(local_pdf_path)
        page = reader.pages[0]

        def visitor_body(text, cm, tm, font_dict, font_size):
            print(repr(text), cm, tm, font_size)

        def visitor_op(op, args, cm, tm):
            # Intentionally a no-op; print (op, args, cm, tm) here when
            # debugging the operator stream.
            pass

        page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    def testAnchorBase(self):
        """Print the raw report and the anchor text for pdftotext_two_column_issue.pdf."""
        local_pdf_path = self._gnarly_pdf("pdftotext_two_column_issue.pdf")

        report = _pdf_report(local_pdf_path, 2)
        print(report)

        print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))

    def testAnchorImage(self):
        """Print the raw report and the anchor text for some_ocr1.pdf."""
        local_pdf_path = self._gnarly_pdf("some_ocr1.pdf")

        report = _pdf_report(local_pdf_path, 1)
        print(report)

        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))

    def testSmallPage(self):
        """Print the raw report and the anchor text for small_page_size.pdf."""
        local_pdf_path = self._gnarly_pdf("small_page_size.pdf")

        report = _pdf_report(local_pdf_path, 1)
        print(report)

        print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))

    def testBadUTFSurrogatePairsGeneration(self):
        """Anchor text from badlines.pdf must survive a JSON round trip through pyarrow.

        The anchor text is serialized with ``json.dumps`` and then parsed by
        ``pyarrow.json.read_json``; the test passes if neither step raises.
        """
        local_pdf_path = self._gnarly_pdf("badlines.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

        jsondata = json.dumps({"text": anchor_text})

        # Imported lazily so pyarrow is only required when this test runs.
        import pyarrow.json as paj

        # Size the reader's block from the encoded bytes, since that is what
        # the buffer actually contains.
        payload = jsondata.encode("utf-8")
        buffer = io.BytesIO(payload)
        paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(payload)))

    def testLargePromptHint1(self):
        """Anchor text for large_prompt_hint1.pdf must be at most 1000 characters."""
        local_pdf_path = self._gnarly_pdf("large_prompt_hint1.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 1000)

    def testLargePromptHint2(self):
        """Anchor text for large_prompt_hint2.pdf must be at most 4000 characters."""
        local_pdf_path = self._gnarly_pdf("large_prompt_hint2.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 4000)

    def testLargePromptHint3(self):
        """Anchor text for large_prompt_hint3.pdf must be at most 4000 characters."""
        local_pdf_path = self._gnarly_pdf("large_prompt_hint3.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 4000)

    def testNewsPaperPromptHint(self):
        """Anchor text for newspaper.pdf must be at most 4000 characters."""
        local_pdf_path = self._gnarly_pdf("newspaper.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 4000)

    def testTobaccoPaperMissingParagraphs(self):
        """Anchor text for tobacco_missed_tokens_pg1.pdf must be at most 4000 characters."""
        local_pdf_path = self._gnarly_pdf("tobacco_missed_tokens_pg1.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 4000)

    def testAnchorOtherLengths(self):
        """An explicit ``target_length`` (2000, then 6000) bounds the anchor text length."""
        local_pdf_path = self._gnarly_pdf("tobacco_missed_tokens_pg1.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=2000)

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 2000)

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), 6000)

    def testFailingAnchor(self):
        """Anchor text for failing_anchor_pg4.pdf must be shorter than 4000 characters."""
        local_pdf_path = self._gnarly_pdf("failing_anchor_pg4.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")

        print(anchor_text)
        print(len(anchor_text))
        self.assertLess(len(anchor_text), 4000)

    def testEmptyAnchor(self):
        """With ``target_length=0`` only the page-dimensions line is returned."""
        local_pdf_path = self._gnarly_pdf("tobacco_missed_tokens_pg1.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=0)

        self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0")

    def testCannotLoad(self):
        """Anchor text for page 5 of load_v_error.pdf must be shorter than 6000 characters."""
        local_pdf_path = self._gnarly_pdf("load_v_error.pdf")

        # The reader object itself is not needed; the call is kept so that a
        # pypdf failure to open this file still fails the test here.
        PdfReader(local_pdf_path)
        page = 5
        anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=6000)

        print(anchor_text)
        print(len(anchor_text))
        self.assertLess(len(anchor_text), 6000)

    @unittest.skip("TODO, this unit test still fails, the map text is too large.")
    def testExcessiveMapAnchor(self):
        """Anchor text for map1.pdf should stay short (currently skipped)."""
        local_pdf_path = self._gnarly_pdf("map1.pdf")

        anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)

        print(anchor_text)
        print(len(anchor_text))
        # NOTE(review): target_length is 6000 but the bound asserted is 4000;
        # confirm which value is intended before un-skipping this test.
        self.assertLess(len(anchor_text), 4000)
|
|
|
|
|
2024-10-02 22:17:15 +00:00
|
|
|
class BuildSilverTest(unittest.TestCase):
    """Checks on the page query produced by ``build_page_query``."""

    def testSmallPage(self):
        """The image embedded in the query for small_page_size.pdf has a longest side of 2048.

        The image is read from the first message's second content entry,
        which is expected to carry a base64 PNG, optionally wrapped in a
        ``data:image/png;base64,`` URL.
        """
        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")

        # Function-scope imports, as in the original test.
        from olmocr.data.buildsilver import build_page_query
        from olmocr.data.renderpdf import get_png_dimensions_from_base64

        result = build_page_query(local_pdf_path, "s3://test.pdf", 1)

        base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"]

        # Strip the data-URL header, if present, so only the base64 payload
        # is handed to the PNG dimension parser.
        data_url_prefix = "data:image/png;base64,"
        if base64data.startswith(data_url_prefix):
            base64data = base64data[len(data_url_prefix):]

        width, height = get_png_dimensions_from_base64(base64data)

        print(width, height)

        # Use the unittest assertion rather than a bare ``assert`` so the
        # check is not stripped under ``python -O`` and reports both values
        # on failure.
        self.assertEqual(max(width, height), 2048)
|
|
|
|
|
|
|
|
class TestRenderPdf(unittest.TestCase):
    """Cross-check ``get_pdf_media_box_width_height`` against pypdf."""

    def testFastMediaBoxMatchesPyPdf(self):
        """Every page of every fixture PDF reports the same media box as pypdf.

        Width and height are compared to three decimal places.
        """
        fixture_pattern = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")

        for file in glob.glob(fixture_pattern):
            reader = PdfReader(file)
            print("checking", file)

            page_count = len(reader.pages)
            # get_pdf_media_box_width_height is called with 1-based page
            # numbers, while pypdf's page list is indexed from 0.
            for page_num in range(1, page_count + 1):
                w1, h1 = get_pdf_media_box_width_height(file, page_num)
                expected_box = reader.pages[page_num - 1].mediabox

                self.assertAlmostEqual(w1, expected_box.width, places=3)
                self.assertAlmostEqual(h1, expected_box.height, places=3)
|
|
|
|
|
|
|
|
class TestOutputSamplePage(unittest.TestCase):
    """Print a sample anchor text so it can be inspected in the test log."""

    def testTobaccoPaper(self):
        """Print the anchor text of tobacco_missed_tokens_pg1.pdf with blank lines around it.

        There are no assertions; the test passes as long as generation does
        not raise.
        """
        fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
        local_pdf_path = os.path.join(fixtures_dir, "tobacco_missed_tokens_pg1.pdf")
        anchor_text = get_anchor_text(local_pdf_path, 1, 'pdfreport', target_length=6000)

        # One call emitting an empty line, the text, and another empty line,
        # each newline-terminated.
        print("", anchor_text, "", sep="\n")
|