2024-10-01 22:10:58 +00:00
|
|
|
import unittest
|
|
|
|
import os
|
|
|
|
import json
|
|
|
|
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
2024-10-01 23:15:53 +00:00
|
|
|
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
|
|
|
|
2024-10-01 22:10:58 +00:00
|
|
|
class AnchorTest(unittest.TestCase):
    """Smoke tests that run anchor-text extraction over the sample PDFs.

    Every test only prints what it gets back; none of them asserts on the
    output, so a test fails only when one of the calls raises.
    """

    @staticmethod
    def _gnarly_pdf(filename):
        """Return the path of ``filename`` in the ``gnarly_pdfs`` directory next to this file."""
        here = os.path.dirname(__file__)
        return os.path.join(here, "gnarly_pdfs", filename)

    def _print_report_and_anchor(self, filename, page):
        """Print the raw PDF report, then the "pdfreport" anchor text.

        ``page`` is forwarded unchanged as the second positional argument of
        both ``_pdf_report`` and ``get_anchor_text`` (presumably a page
        number -- confirm against ``pdelfin.prompts.anchor``).
        """
        pdf_path = self._gnarly_pdf(filename)

        print(_pdf_report(pdf_path, page))
        print(get_anchor_text(pdf_path, page, pdf_engine="pdfreport"))

    def testExtractText(self):
        """Dump every text chunk pypdf reports for the first page of the OCR sample."""
        first_page = PdfReader(self._gnarly_pdf("some_ocr1.pdf")).pages[0]

        def visitor_body(text, cm, tm, font_dict, font_size):
            # font_dict is accepted to satisfy the callback signature but is not printed.
            print(repr(text), cm, tm, font_size)

        def visitor_op(op, args, cm, tm):
            """Ignore operands on purpose; swap in a print here when debugging."""

        first_page.extract_text(
            visitor_text=visitor_body,
            visitor_operand_before=visitor_op,
        )

    def testAnchorBase(self):
        """Print the report and anchor text for the two-column sample, page argument 2."""
        self._print_report_and_anchor("pdftotext_two_column_issue.pdf", 2)

    def testAnchorImage(self):
        """Print the report and anchor text for the OCR sample, page argument 1."""
        self._print_report_and_anchor("some_ocr1.pdf", 1)

    def testSmallPage(self):
        """Print the report and anchor text for the small-page-size sample, page argument 1."""
        self._print_report_and_anchor("small_page_size.pdf", 1)
|
|
|
|
|
|
|
|
class BuildSilverTest(unittest.TestCase):
    """Checks on the page query built for the silver-data pipeline."""

    def testSmallPage(self):
        """Check that the PNG embedded in the page query has a longest side of 2048.

        Builds a page query for the small-page-size sample PDF, pulls the
        image URL out of the first message, strips the PNG data-URI prefix
        when present, and reads the image dimensions from the remaining
        base64 payload.
        """
        # Kept function-local as in the original test (presumably so that merely
        # importing this test module does not pull in these packages).
        from pdelfin.silver_data.buildsilver import build_page_query
        from pdelfin.train.dataloader import get_png_dimensions_from_base64

        local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")

        result = build_page_query(local_pdf_path, "s3://test.pdf", 1)

        base64data = result["body"]["messages"][0]["content"][1]["image_url"]["url"]

        # Derive the slice offset from the prefix itself rather than a
        # hard-coded 22, so the two cannot drift apart.
        data_uri_prefix = "data:image/png;base64,"
        if base64data.startswith(data_uri_prefix):
            base64data = base64data[len(data_uri_prefix):]

        width, height = get_png_dimensions_from_base64(base64data)

        print(width, height)

        # A unittest assertion is not stripped under ``python -O`` and reports
        # the actual value on failure, unlike a bare ``assert``.
        self.assertEqual(max(width, height), 2048)
|