Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text

This commit is contained in:
Jake Poznanski 2024-10-01 23:15:53 +00:00
parent e42cecf96c
commit 6ef8226347
3 changed files with 117 additions and 18 deletions

View File

@ -9,16 +9,22 @@
# coherency score best of these three
import subprocess
from typing import Literal
import sys
import json
from dataclasses import dataclass
from typing import Literal, List
from pypdf import PdfReader
import pypdfium2 as pdfium
import pymupdf
from pdelfin.filter.coherency import get_document_coherency
from pypdf import PdfReader
from pypdf.generic import RectangleObject
from pdelfin.prompts._adv_anchor import mult
def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency"]) -> str:
def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"]) -> str:
assert page > 0, "Pages are 1-indexed in pdf-land"
if pdf_engine == "pdftotext":
@ -41,6 +47,10 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
# return option with the best (highest) score (higher is more likely, as these are logprobs)
return options[scores.index(max(scores))]
elif pdf_engine == "pdfreport":
return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
else:
raise NotImplementedError("Unknown engine")
def _get_pdftotext(local_pdf_path: str, page: int) -> str:
@ -66,5 +76,89 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
def _get_pdfium(local_pdf_path: str, page: int) -> str:
    """Extract the text of a single page of a PDF through pypdfium2.

    ``page`` is 1-indexed; the document is indexed with ``page - 1``.
    """
    pdf = pdfium.PdfDocument(local_pdf_path)
    textpage = pdf[page - 1].get_textpage()
    return textpage.get_text_range()
    # NOTE(review): the return below can never execute as written. It looks
    # like the replacement for the line above in this diff view -- confirm
    # which call is intended and drop the other.
    return textpage.get_text_bounded()
def _transform_point(x, y, m):
x_new = m[0]*x + m[2]*y + m[4]
y_new = m[1]*x + m[3]*y + m[5]
return x_new, y_new
@dataclass
class Element:
    """Field-less base type for the items stored in a page report."""
@dataclass
class BoundingBox:
    """Axis-aligned rectangle described by two corner points."""

    # First corner (built as the minimum x / y by the callers in this file).
    x0: float
    y0: float
    # Opposite corner (built as the maximum x / y by the callers in this file).
    x1: float
    y1: float

    @classmethod
    def from_rectangle(cls, rect: RectangleObject) -> "BoundingBox":
        """Build a box from the first four components of ``rect``.

        A classmethod (rather than a staticmethod naming the class) so that
        subclasses get instances of their own type. The components are
        coerced to plain ``float`` so the stored values match the declared
        field types whatever numeric wrapper ``rect`` yields.
        """
        return cls(float(rect[0]), float(rect[1]), float(rect[2]), float(rect[3]))
@dataclass
class TextElement(Element):
    """A piece of extracted text together with the position reported for it."""

    # The extracted text, stored exactly as received.
    text: str
    # Position recorded for the text.
    x: float
    y: float
@dataclass
class ImageElement(Element):
    """An image found on a page, identified by name and located by a box."""

    # Identifier of the image.
    name: str
    # Rectangle the image occupies.
    bbox: BoundingBox
@dataclass
class PageReport:
    """Summary of a single page: its mediabox plus the elements found on it."""

    # The page's mediabox rectangle.
    mediabox: BoundingBox
    # Elements collected from the page, in the order they were recorded.
    elements: List[Element]
def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
    """Collect the positioned text and images found on one page of a PDF.

    Args:
        local_pdf_path: Path of the PDF file, opened with pypdf.
        page: 1-indexed page number.

    Returns:
        A PageReport holding the page's mediabox and, in the order pypdf's
        extraction reports them, one TextElement per text callback and one
        ImageElement per image XObject painted with the ``Do`` operator.
    """
    reader = PdfReader(local_pdf_path)
    # Use a separate name so the integer ``page`` argument is not rebound
    # to the page object.
    pdf_page = reader.pages[page - 1]
    resources = pdf_page.get("/Resources", {})
    xobjects = resources.get("/XObject", {})
    elements = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # Combine the text matrix with the current transformation matrix;
        # entries 4 and 5 of the result are recorded as the text position.
        # (``mult`` comes from pdelfin.prompts._adv_anchor -- presumably a
        # matrix product; confirm there.)
        txt2user = mult(tm, cm)
        elements.append(TextElement(text, txt2user[4], txt2user[5]))

    def visitor_op(op, args, cm, tm):
        # Only the XObject paint operator is of interest here.
        if op != b"Do":
            return

        xobject_name = args[0]
        xobject = xobjects.get(xobject_name)
        if not (xobject and xobject["/Subtype"] == "/Image"):
            return

        # An image is painted into the unit square, which the CTM maps onto
        # the page. All four corners are transformed so the box is also
        # correct when the CTM rotates or skews the image; using only (0, 0)
        # and (1, 1) collapses the box for e.g. a 45 degree rotation.
        corners = [
            _transform_point(u, v, cm)
            for u, v in ((0, 0), (1, 0), (0, 1), (1, 1))
        ]
        xs = [cx for cx, _ in corners]
        ys = [cy for _, cy in corners]
        elements.append(
            ImageElement(xobject_name, BoundingBox(min(xs), min(ys), max(xs), max(ys)))
        )

    pdf_page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    return PageReport(
        mediabox=BoundingBox.from_rectangle(pdf_page.mediabox),
        elements=elements,
    )
def _linearize_pdf_report(report: PageReport) -> str:
    """Render a PageReport as a single anchor-text string.

    The string starts with a ``Page dimensions: WxH`` line, followed by one
    bracketed entry per element in report order: ``[Image x0xy0 to x1xy1]``
    for images and ``[XxY]text`` for text. Text that is empty or only
    whitespace is left out. Entries are concatenated with no separator.

    NOTE(review): the dimensions are the mediabox's ``x1`` and ``y1``; that
    equals width x height only when the mediabox starts at (0, 0).
    """
    # Collect the pieces and join once instead of growing a string with ``+=``.
    parts = [f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"]

    for element in report.elements:
        if isinstance(element, ImageElement):
            box = element.bbox
            parts.append(f"[Image {box.x0:.0f}x{box.y0:.0f} to {box.x1:.0f}x{box.y1:.0f}]")
        if isinstance(element, TextElement):
            if not element.text.strip():
                continue
            parts.append(f"[{element.x:.0f}x{element.y:.0f}]{element.text}")

    return "".join(parts)

BIN
tests/gnarly_pdfs/edgar.pdf Normal file

Binary file not shown.

View File

@ -4,32 +4,37 @@ import json
from pypdf import PdfReader
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
class AnchorTest(unittest.TestCase):
def testExtractText(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
reader = PdfReader(local_pdf_path)
page = reader.pages[1]
page = reader.pages[0]
def visitor_body(text, cm, tm, font_dict, font_size):
print(repr(text))
print(repr(text), cm, tm, font_size)
page.extract_text(visitor_text=visitor_body)
def visitor_op(op, args, cm, tm):
#print(op, args, cm, tm)
pass
page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
def testAnchorBase(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
from pdelfin.prompts._adv_anchor import extract_page
reader = PdfReader(local_pdf_path)
pypage = reader.pages[1]
report = _pdf_report(local_pdf_path, 2)
def visitor_body(text, cm, tm, font_dict, font_size):
print(repr(text))
print(report)
extract_page(pypage, reader, visitor_text=visitor_body)
print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))
# report = parse_pdf(local_pdf_path)
# print(json.dumps(report, indent=1))
def testAnchorImage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
# report = _pdf_report(local_pdf_path, 1)
report = _pdf_report(local_pdf_path, 2)
# print(json.dumps(report, indent=1))
print(report)
print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))