# olmocr/tests/test_anchor.py

import base64
import glob
import io
import json
import os
import re
import tempfile
import unittest

from pypdf import PdfReader

from olmocr.data.renderpdf import (
    get_pdf_media_box_width_height,
    render_pdf_to_base64png,
)
from olmocr.image_utils import convert_image_to_pdf_bytes
from olmocr.prompts.anchor import _linearize_pdf_report, _pdf_report, get_anchor_text


class AnchorTest(unittest.TestCase):
    """Smoke and regression tests for anchor-text generation on the ``gnarly_pdfs`` fixtures.

    Most tests only check that ``get_anchor_text`` runs on a particular fixture PDF
    and that the returned text stays within a length limit. The generated text is
    printed so it can be inspected in the test output.
    """

    # Patterns used to pull the page size and the image box out of an anchor text.
    _PAGE_DIMS_RE = re.compile(r"Page dimensions: ([\d.]+)x([\d.]+)")
    _IMAGE_BOX_RE = re.compile(r"\[Image \d+x\d+ to (\d+)x(\d+)\]")

    @staticmethod
    def _gnarly_pdf(name):
        """Return the path of fixture ``name`` in the ``gnarly_pdfs`` directory next to this file."""
        return os.path.join(os.path.dirname(__file__), "gnarly_pdfs", name)

    def _print_report_and_anchor(self, pdf_name, page):
        """Print the raw ``_pdf_report`` and the ``pdfreport`` anchor text for one page.

        There are no assertions here: the calling tests pass as long as neither
        call raises.
        """
        local_pdf_path = self._gnarly_pdf(pdf_name)

        report = _pdf_report(local_pdf_path, page)
        print(report)

        print(get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport"))

    def _check_anchor_length(self, pdf_name, page, max_length, **anchor_kwargs):
        """Generate ``pdfreport`` anchor text, print it, and assert it is at most ``max_length`` long.

        Args:
            pdf_name: File name of the fixture inside ``gnarly_pdfs``.
            page: Page number passed to ``get_anchor_text``.
            max_length: Upper bound (in characters) asserted on the result.
            **anchor_kwargs: Extra keyword arguments forwarded to ``get_anchor_text``
                (for example ``target_length``).

        Returns:
            The generated anchor text.
        """
        anchor_text = get_anchor_text(self._gnarly_pdf(pdf_name), page, pdf_engine="pdfreport", **anchor_kwargs)

        print(anchor_text)
        print(len(anchor_text))
        self.assertLessEqual(len(anchor_text), max_length)
        return anchor_text

    def _parse_page_and_image(self, anchor, label):
        """Extract ``((page_w, page_h), (image_w, image_h))`` from the first two lines of ``anchor``.

        The page dimensions are read from the first line and the image box from
        the second. ``label`` only appears in failure messages. The line count is
        asserted first so that a too-short anchor is reported as a test failure
        rather than an ``IndexError``.
        """
        lines = anchor.strip().split("\n")
        self.assertGreaterEqual(len(lines), 2, f"Expected at least two lines in {label} anchor: {anchor}")

        page_match = self._PAGE_DIMS_RE.search(lines[0])
        image_match = self._IMAGE_BOX_RE.search(lines[1])

        self.assertIsNotNone(page_match, f"Could not parse {label} anchor page dims: {anchor}")
        self.assertIsNotNone(image_match, f"Could not parse {label} anchor image dims: {anchor}")

        page_dims = (float(page_match.group(1)), float(page_match.group(2)))
        image_dims = (int(image_match.group(1)), int(image_match.group(2)))
        return page_dims, image_dims

    def testExtractText(self):
        """Run pypdf text extraction with visitor callbacks and print every text fragment."""
        reader = PdfReader(self._gnarly_pdf("some_ocr1.pdf"))
        page = reader.pages[0]

        def visitor_body(text, cm, tm, font_dict, font_size):
            print(repr(text), cm, tm, font_size)

        def visitor_op(op, args, cm, tm):
            # Deliberately a no-op; it is only passed so the operand hook is exercised.
            pass

        page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    def testAnchorBase(self):
        """Report and anchor generation should not raise on page 2 of the two-column fixture."""
        self._print_report_and_anchor("pdftotext_two_column_issue.pdf", 2)

    def testAnchorImage(self):
        """Report and anchor generation should not raise on page 1 of ``some_ocr1.pdf``."""
        self._print_report_and_anchor("some_ocr1.pdf", 1)

    def testSmallPage(self):
        """Report and anchor generation should not raise on the small-page-size fixture."""
        self._print_report_and_anchor("small_page_size.pdf", 1)

    def testBadUTFSurrogatePairsGeneration(self):
        """Anchor text from ``badlines.pdf`` must survive a JSON round trip through pyarrow."""
        anchor_text = get_anchor_text(self._gnarly_pdf("badlines.pdf"), 4, pdf_engine="pdfreport")

        jsondata = json.dumps({"text": anchor_text})

        # Imported locally, as in the original test, so pyarrow is only needed when this test runs.
        import pyarrow.json as paj

        buffer = io.BytesIO(jsondata.encode("utf-8"))
        # The test passes if the reader accepts the document without raising.
        paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata)))

    def testLargePromptHint1(self):
        """Anchor text for page 4 of ``large_prompt_hint1.pdf`` stays within 1000 characters."""
        self._check_anchor_length("large_prompt_hint1.pdf", 4, 1000)

    def testLargePromptHint2(self):
        """Anchor text for page 2 of ``large_prompt_hint2.pdf`` stays within 4000 characters."""
        self._check_anchor_length("large_prompt_hint2.pdf", 2, 4000)

    def testLargePromptHint3(self):
        """Anchor text for page 2 of ``large_prompt_hint3.pdf`` stays within 4000 characters."""
        self._check_anchor_length("large_prompt_hint3.pdf", 2, 4000)

    def testNewsPaperPromptHint(self):
        """Anchor text for the newspaper fixture stays within 4000 characters."""
        self._check_anchor_length("newspaper.pdf", 1, 4000)

    def testTobaccoPaperMissingParagraphs(self):
        """Anchor text for the tobacco fixture stays within 4000 characters."""
        self._check_anchor_length("tobacco_missed_tokens_pg1.pdf", 1, 4000)

    def testAnchorOtherLengths(self):
        """An explicit ``target_length`` is respected as an upper bound on the anchor length."""
        self._check_anchor_length("tobacco_missed_tokens_pg1.pdf", 1, 2000, target_length=2000)
        self._check_anchor_length("tobacco_missed_tokens_pg1.pdf", 1, 6000, target_length=6000)

    def testFailingAnchor(self):
        """Anchor text for page 4 of ``failing_anchor_pg4.pdf`` stays within 4000 characters."""
        self._check_anchor_length("failing_anchor_pg4.pdf", 4, 4000)

    def testEmptyAnchor(self):
        """With ``target_length=0`` only the page-dimensions line is returned."""
        anchor_text = get_anchor_text(self._gnarly_pdf("tobacco_missed_tokens_pg1.pdf"), 1, pdf_engine="pdfreport", target_length=0)

        self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0")

    def testEmptyAnchorMatchesImageAnchor(self):
        """The ``target_length=-1`` anchor should match the anchor of an image-only copy of the page.

        The page is rendered to PNG, wrapped into a one-page PDF, and the anchor of
        that PDF is compared (with small tolerances) against the ``target_length=-1``
        anchor of the original page.
        """
        local_pdf_path = self._gnarly_pdf("edgar.pdf")

        orig_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")
        print(orig_anchor)

        lenneg1_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=-1)
        print(lenneg1_anchor)

        base64_png = render_pdf_to_base64png(local_pdf_path, 1, target_longest_image_dim=1024)

        # Work inside a temporary directory so the intermediate PNG and PDF are
        # always removed, even if one of the calls below raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            img_path = os.path.join(tmp_dir, "page.png")
            with open(img_path, "wb") as img_file:
                img_file.write(base64.b64decode(base64_png))

            pdf_bytes = convert_image_to_pdf_bytes([img_path])

            img_pdf_path = os.path.join(tmp_dir, "page.pdf")
            with open(img_pdf_path, "wb") as pdf_file:
                pdf_file.write(pdf_bytes)

            image_only_anchor = get_anchor_text(img_pdf_path, 1, pdf_engine="pdfreport")

        print(image_only_anchor)

        (img_page_w, img_page_h), (img_img_w, img_img_h) = self._parse_page_and_image(image_only_anchor, "image")
        (len_page_w, len_page_h), (len_img_w, len_img_h) = self._parse_page_and_image(lenneg1_anchor, "lenneg1")

        # Check page dimensions are within 1.4 tolerance
        self.assertAlmostEqual(img_page_w, len_page_w, delta=1.4, msg=f"Page width mismatch: {img_page_w} vs {len_page_w}")
        self.assertAlmostEqual(img_page_h, len_page_h, delta=1.4, msg=f"Page height mismatch: {img_page_h} vs {len_page_h}")

        # Check image dimensions are within 1 point tolerance
        self.assertAlmostEqual(img_img_w, len_img_w, delta=1, msg=f"Image width mismatch: {img_img_w} vs {len_img_w}")
        self.assertAlmostEqual(img_img_h, len_img_h, delta=1, msg=f"Image height mismatch: {img_img_h} vs {len_img_h}")

        # Both anchors must also start and end with the same characters.
        self.assertEqual(image_only_anchor[:5], lenneg1_anchor[:5])
        self.assertEqual(image_only_anchor[-1:], lenneg1_anchor[-1:])

    def testCannotLoad(self):
        """Anchor text for page 5 of ``load_v_error.pdf`` is produced and stays within 6000 characters."""
        # The reader itself is not used; it is constructed so that a pypdf failure to
        # open the file surfaces here (presumably the original intent -- confirm).
        PdfReader(self._gnarly_pdf("load_v_error.pdf"))

        self._check_anchor_length("load_v_error.pdf", 5, 6000, target_length=6000)

    @unittest.skip("TODO, this unit test still fails, the map text is too large.")
    def testExcessiveMapAnchor(self):
        """Anchor text for the map fixture should stay within 4000 characters (currently skipped)."""
        # NOTE(review): target_length is 6000 but the bound asserted is 4000 -- confirm
        # which value is intended before re-enabling this test.
        self._check_anchor_length("map1.pdf", 1, 4000, target_length=6000)

    def testKyleOnePageAnchors1(self):
        """Anchor text for ``dolma-page-1.pdf`` stays within 6000 characters."""
        self._check_anchor_length("dolma-page-1.pdf", 1, 6000, target_length=6000)

    def testKyleOnePageAnchors2(self):
        """Anchor text for ``olmo-page-1.pdf`` stays within 6000 characters."""
        self._check_anchor_length("olmo-page-1.pdf", 1, 6000, target_length=6000)


class BuildSilverTest(unittest.TestCase):
    """Checks on the page query built by ``olmocr.data.buildsilver.build_page_query``."""

    def testSmallPage(self):
        """The image embedded in the query for a small-page PDF has a longest side of 2048 pixels."""
        # Imported inside the test, as in the original, so these modules are only
        # loaded when this test actually runs.
        from olmocr.data.buildsilver import build_page_query
        from olmocr.data.renderpdf import get_png_dimensions_from_base64

        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")

        result = build_page_query(local_pdf_path, "s3://test.pdf", 1)

        base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"]

        # Strip the data-URL header, if present, so only the base64 payload remains.
        data_url_prefix = "data:image/png;base64,"
        if base64data.startswith(data_url_prefix):
            base64data = base64data[len(data_url_prefix) :]

        width, height = get_png_dimensions_from_base64(base64data)

        print(width, height)

        # assertEqual (rather than a bare assert) still runs under ``python -O`` and
        # reports both values on failure.
        self.assertEqual(max(width, height), 2048)


class TestRenderPdf(unittest.TestCase):
    """Cross-checks ``get_pdf_media_box_width_height`` against pypdf's media box."""

    def testFastMediaBoxMatchesPyPdf(self):
        """For every page of every fixture PDF, width and height agree with pypdf to 3 decimal places."""
        # Sorted so the files are visited in the same order on every run.
        pdf_paths = sorted(glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")))

        # Without this guard the test would pass vacuously if the fixtures were missing.
        self.assertTrue(pdf_paths, "No fixture PDFs found in gnarly_pdfs")

        for file in pdf_paths:
            reader = PdfReader(file)
            print("checking", file)

            for page_num, pypdfpage in enumerate(reader.pages, start=1):
                # subTest records which file/page mismatched and lets the remaining pages be checked.
                with self.subTest(file=os.path.basename(file), page=page_num):
                    w1, h1 = get_pdf_media_box_width_height(file, page_num)

                    self.assertAlmostEqual(w1, pypdfpage.mediabox.width, places=3)
                    self.assertAlmostEqual(h1, pypdfpage.mediabox.height, places=3)


class TestOutputSamplePage(unittest.TestCase):
    """Prints a sample anchor text so it can be inspected in the test output."""

    def testTobaccoPaper(self):
        """Print the ``pdfreport`` anchor text (``target_length=6000``) of the tobacco fixture between blank lines."""
        fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
        sample_pdf = os.path.join(fixtures_dir, "tobacco_missed_tokens_pg1.pdf")

        sample_anchor = get_anchor_text(sample_pdf, 1, "pdfreport", target_length=6000)

        # One blank line, the anchor text, then another blank line.
        print("", sample_anchor, "", sep="\n")
Format fixes 2025-05-29 23:23:02 +00:00			`import base64`
More stats hopefully running faster 2024-10-14 21:37:14 +00:00			`import glob`
isort 2025-01-29 15:25:10 -08:00			`import io`
			`import json`
			`import os`
Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`import re`
			`import tempfile`
Format fixes 2025-05-29 23:23:02 +00:00			`import unittest`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
			`from pypdf import PdfReader`

Format fixes 2025-05-29 23:23:02 +00:00			`from olmocr.data.renderpdf import (`
			`get_pdf_media_box_width_height,`
			`render_pdf_to_base64png,`
			`)`
Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`from olmocr.image_utils import convert_image_to_pdf_bytes`
Format fixes 2025-05-29 23:23:02 +00:00			`from olmocr.prompts.anchor import _linearize_pdf_report, _pdf_report, get_anchor_text`

Fix for unicode errors in big datasets for the future 2024-10-07 17:01:59 +00:00
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00			`class AnchorTest(unittest.TestCase):`
			`def testExtractText(self):`
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00			`reader = PdfReader(local_pdf_path)`
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`page = reader.pages[0]`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
			`def visitor_body(text, cm, tm, font_dict, font_size):`
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`print(repr(text), cm, tm, font_size)`

			`def visitor_op(op, args, cm, tm):`
Black formatting 2025-01-29 15:30:39 -08:00			`# print(op, args, cm, tm)`
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`pass`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
			`def testAnchorBase(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")`

Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`report = _pdf_report(local_pdf_path, 2)`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`print(report)`

			`print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`def testAnchorImage(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
Appears as if the report method works really well, might need one last step to detect rotated pages 2024-10-02 16:44:39 +00:00			`report = _pdf_report(local_pdf_path, 1)`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text 2024-10-01 23:15:53 +00:00			`print(report)`
Adding anchor code based off of pypdf that visits each text block, hopefully so we can make it output good bboxes 2024-10-01 22:10:58 +00:00
A few notes, starting to test dataloader with new structured response format 2024-10-02 22:17:15 +00:00			`print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))`

			`def testSmallPage(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")`

			`report = _pdf_report(local_pdf_path, 1)`

			`print(report)`

			`print(get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport"))`

Fix for unicode errors in big datasets for the future 2024-10-07 17:01:59 +00:00			`def testBadUTFSurrogatePairsGeneration(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "badlines.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")`

Black formatting 2025-01-29 15:30:39 -08:00			`jsondata = json.dumps({"text": anchor_text})`
Fix for unicode errors in big datasets for the future 2024-10-07 17:01:59 +00:00
			`import pyarrow as pa`
			`import pyarrow.compute as pc`
isort 2025-01-29 15:25:10 -08:00			`import pyarrow.json as paj`
Fix for unicode errors in big datasets for the future 2024-10-07 17:01:59 +00:00
Black formatting 2025-01-29 15:30:39 -08:00			`buffer = io.BytesIO(jsondata.encode("utf-8"))`
Fix for unicode errors in big datasets for the future 2024-10-07 17:01:59 +00:00			`paj.read_json(buffer, read_options=paj.ReadOptions(use_threads=False, block_size=len(jsondata)))`

Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00			`def testLargePromptHint1(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 1000)`
Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00
			`def testLargePromptHint2(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00
Cleaning up anchor text to deal with abnormally long lines 2024-10-09 16:29:20 +00:00			`def testLargePromptHint3(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Cleaning up anchor text to deal with abnormally long lines 2024-10-09 16:29:20 +00:00
Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00			`def testNewsPaperPromptHint(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00
Dolma viewer improvements 2024-10-16 16:05:44 +00:00			`def testTobaccoPaperMissingParagraphs(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Dolma viewer improvements 2024-10-16 16:05:44 +00:00
Adding empty anchor support 2024-10-23 22:17:20 +00:00			`def testAnchorOtherLengths(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=2000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 2000)`
Adding empty anchor support 2024-10-23 22:17:20 +00:00
			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
Fixing issues with pdf parsing 2024-10-30 16:26:02 +00:00			`self.assertLessEqual(len(anchor_text), 6000)`
Adding empty anchor support 2024-10-23 22:17:20 +00:00
Hmm, cant repro failing anchor case 2024-10-17 17:00:02 +00:00			`def testFailingAnchor(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 4, pdf_engine="pdfreport")`

			`print(anchor_text)`
			`print(len(anchor_text))`
CI 2025-02-14 20:51:04 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Adding image merging to pdf report/hint/anchor 2024-10-08 21:23:21 +00:00
Adding empty anchor support 2024-10-23 22:17:20 +00:00			`def testEmptyAnchor(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=0)`

			`self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0")`

Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`def testEmptyAnchorMatchesImageAnchor(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "edgar.pdf")`

			`orig_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport")`
			`print(orig_anchor)`

			`lenneg1_anchor = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=-1)`
			`print(lenneg1_anchor)`

			`base64_png = render_pdf_to_base64png(local_pdf_path, 1, target_longest_image_dim=1024)`

			`# Decode base64 and save to temporary file`
			`temp_img = tempfile.NamedTemporaryFile("wb", suffix=".png", delete=False)`
			`temp_img.write(base64.b64decode(base64_png))`
			`temp_img.close()`

			`# Convert all images to a single PDF using our enhanced function`
			`pdf_bytes = convert_image_to_pdf_bytes([temp_img.name])`

			`# Write the PDF bytes to a temporary file`
			`temp_pdf = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)`
			`temp_pdf.write(pdf_bytes)`
			`temp_pdf.close()`

			`# Update pdf_path to the new file`
			`img_pdf_path = temp_pdf.name`

			`image_only_anchor = get_anchor_text(img_pdf_path, 1, pdf_engine="pdfreport")`
			`print(image_only_anchor)`

			`# Parse page dimensions from both anchors and check with tolerance`
			`# Extract page dimensions and image bounds`
Format fixes 2025-05-29 23:23:02 +00:00			`img_lines = image_only_anchor.strip().split("\n")`
			`len_lines = lenneg1_anchor.strip().split("\n")`

			`img_page_match = re.search(r"Page dimensions: ([\d.]+)x([\d.]+)", img_lines[0])`
			`img_image_match = re.search(r"\[Image \d+x\d+ to (\d+)x(\d+)\]", img_lines[1])`

			`len_page_match = re.search(r"Page dimensions: ([\d.]+)x([\d.]+)", len_lines[0])`
			`len_image_match = re.search(r"\[Image \d+x\d+ to (\d+)x(\d+)\]", len_lines[1])`

Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`self.assertIsNotNone(img_page_match, f"Could not parse image anchor page dims: {image_only_anchor}")`
			`self.assertIsNotNone(img_image_match, f"Could not parse image anchor image dims: {image_only_anchor}")`
			`self.assertIsNotNone(len_page_match, f"Could not parse lenneg1 anchor page dims: {lenneg1_anchor}")`
			`self.assertIsNotNone(len_image_match, f"Could not parse lenneg1 anchor image dims: {lenneg1_anchor}")`
Format fixes 2025-05-29 23:23:02 +00:00
Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`img_page_w, img_page_h = float(img_page_match.group(1)), float(img_page_match.group(2))`
			`img_img_w, img_img_h = int(img_image_match.group(1)), int(img_image_match.group(2))`
Format fixes 2025-05-29 23:23:02 +00:00
Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`len_page_w, len_page_h = float(len_page_match.group(1)), float(len_page_match.group(2))`
			`len_img_w, len_img_h = int(len_image_match.group(1)), int(len_image_match.group(2))`
Format fixes 2025-05-29 23:23:02 +00:00
Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`# Check page dimensions are within 1.4 tolerance`
Format fixes 2025-05-29 23:23:02 +00:00			`self.assertAlmostEqual(img_page_w, len_page_w, delta=1.4, msg=f"Page width mismatch: {img_page_w} vs {len_page_w}")`
			`self.assertAlmostEqual(img_page_h, len_page_h, delta=1.4, msg=f"Page height mismatch: {img_page_h} vs {len_page_h}")`

Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`# Check image dimensions are within 1 point tolerance`
Format fixes 2025-05-29 23:23:02 +00:00			`self.assertAlmostEqual(img_img_w, len_img_w, delta=1, msg=f"Image width mismatch: {img_img_w} vs {len_img_w}")`
			`self.assertAlmostEqual(img_img_h, len_img_h, delta=1, msg=f"Image height mismatch: {img_img_h} vs {len_img_h}")`

Idea to improve retry performance 2025-05-28 18:27:40 +00:00			`self.assertEqual(image_only_anchor[:5], lenneg1_anchor[:5])`
			`self.assertEqual(image_only_anchor[-1:], lenneg1_anchor[-1:])`

More tests 2024-11-20 19:37:00 +00:00			`def testCannotLoad(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "load_v_error.pdf")`

			`reader = PdfReader(local_pdf_path)`
			`page = 5`
			`anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=6000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
CI 2025-02-14 20:51:04 +00:00			`self.assertLessEqual(len(anchor_text), 6000)`
More tests 2024-11-20 19:37:00 +00:00
Some unit test cleanup 2025-01-29 15:15:10 -08:00			`@unittest.skip("TODO, this unit test still fails, the map text is too large.")`
Fix a reliability issue 2024-11-18 09:03:24 -08:00			`def testExcessiveMapAnchor(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "map1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
CI 2025-02-14 20:51:04 +00:00			`self.assertLessEqual(len(anchor_text), 4000)`
Fix a reliability issue 2024-11-18 09:03:24 -08:00
Adding some gnarly 1 pager pdfs from kyle 2025-02-11 18:45:42 +00:00			`def testKyleOnePageAnchors1(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "dolma-page-1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
CI 2025-02-14 20:51:04 +00:00			`self.assertLessEqual(len(anchor_text), 6000)`
Adding some gnarly 1 pager pdfs from kyle 2025-02-11 18:45:42 +00:00
			`def testKyleOnePageAnchors2(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "olmo-page-1.pdf")`

			`anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)`

			`print(anchor_text)`
			`print(len(anchor_text))`
CI 2025-02-14 20:51:04 +00:00			`self.assertLessEqual(len(anchor_text), 6000)`
Adding some gnarly 1 pager pdfs from kyle 2025-02-11 18:45:42 +00:00
Black formatting 2025-01-29 15:30:39 -08:00
A few notes, starting to test dataloader with new structured response format 2024-10-02 22:17:15 +00:00			`class BuildSilverTest(unittest.TestCase):`
			`def testSmallPage(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")`

Massive refactor from pdelfin to olmocr 2025-01-27 18:30:41 +00:00			`from olmocr.data.buildsilver import build_page_query`
A few notes, starting to test dataloader with new structured response format 2024-10-02 22:17:15 +00:00
			`result = build_page_query(local_pdf_path, "s3://test.pdf", 1)`

Massive refactor from pdelfin to olmocr 2025-01-27 18:30:41 +00:00			`from olmocr.data.renderpdf import get_png_dimensions_from_base64`
A few notes, starting to test dataloader with new structured response format 2024-10-02 22:17:15 +00:00
			`base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"]`

			`if base64data.startswith("data:image/png;base64,"):`
			`base64data = base64data[22:]`

			`width, height = get_png_dimensions_from_base64(base64data)`

			`print(width, height)`

More stats hopefully running faster 2024-10-14 21:37:14 +00:00			`assert max(width, height) == 2048`

Black formatting 2025-01-29 15:30:39 -08:00
More stats hopefully running faster 2024-10-14 21:37:14 +00:00			`class TestRenderPdf(unittest.TestCase):`
			`def testFastMediaBoxMatchesPyPdf(self):`
			`for file in glob.glob(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "*.pdf")):`
			`reader = PdfReader(file)`
			`print("checking", file)`
Black formatting 2025-01-29 15:30:39 -08:00
More stats hopefully running faster 2024-10-14 21:37:14 +00:00			`for page_num in range(1, len(reader.pages) + 1):`
			`w1, h1 = get_pdf_media_box_width_height(file, page_num)`
			`pypdfpage = reader.pages[page_num - 1]`

Unit tests fixes 2024-11-25 09:13:13 -08:00			`self.assertAlmostEqual(w1, pypdfpage.mediabox.width, places=3)`
Small fixes 2025-01-10 19:38:42 +00:00			`self.assertAlmostEqual(h1, pypdfpage.mediabox.height, places=3)`

Black formatting 2025-01-29 15:30:39 -08:00
Small fixes 2025-01-10 19:38:42 +00:00			`class TestOutputSamplePage(unittest.TestCase):`
			`def testTobaccoPaper(self):`
			`local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")`
Black formatting 2025-01-29 15:30:39 -08:00			`anchor_text = get_anchor_text(local_pdf_path, 1, "pdfreport", target_length=6000)`
Small fixes 2025-01-10 19:38:42 +00:00
			`print("")`
			`print(anchor_text)`
Black formatting 2025-01-29 15:30:39 -08:00			`print("")`