mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Can spit out anchor text for a gpt engine using pypdf, showing locations of images and text
This commit is contained in:
parent
e42cecf96c
commit
6ef8226347
@ -9,16 +9,22 @@
|
||||
|
||||
# coherency score best of these three
|
||||
import subprocess
|
||||
from typing import Literal
|
||||
import sys
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, List
|
||||
|
||||
from pypdf import PdfReader
|
||||
import pypdfium2 as pdfium
|
||||
import pymupdf
|
||||
|
||||
from pdelfin.filter.coherency import get_document_coherency
|
||||
|
||||
from pypdf import PdfReader
|
||||
from pypdf.generic import RectangleObject
|
||||
from pdelfin.prompts._adv_anchor import mult
|
||||
|
||||
def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency"]) -> str:
|
||||
|
||||
def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"]) -> str:
|
||||
assert page > 0, "Pages are 1-indexed in pdf-land"
|
||||
|
||||
if pdf_engine == "pdftotext":
|
||||
@ -41,6 +47,10 @@ def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftote
|
||||
|
||||
# return option with the best (highest) score (higher is more likely, as these are logprobs)
|
||||
return options[scores.index(max(scores))]
|
||||
elif pdf_engine == "pdfreport":
|
||||
return _linearize_pdf_report(_pdf_report(local_pdf_path, page))
|
||||
else:
|
||||
raise NotImplementedError("Unknown engine")
|
||||
|
||||
|
||||
def _get_pdftotext(local_pdf_path: str, page: int) -> str:
|
||||
@ -66,5 +76,89 @@ def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
|
||||
def _get_pdfium(local_pdf_path: str, page: int) -> str:
|
||||
pdf = pdfium.PdfDocument(local_pdf_path)
|
||||
textpage = pdf[page - 1].get_textpage()
|
||||
return textpage.get_text_range()
|
||||
return textpage.get_text_bounded()
|
||||
|
||||
def _transform_point(x, y, m):
|
||||
x_new = m[0]*x + m[2]*y + m[4]
|
||||
y_new = m[1]*x + m[3]*y + m[5]
|
||||
return x_new, y_new
|
||||
|
||||
@dataclass
class Element:
    """Field-less base type shared by the items stored in a page report."""
|
||||
|
||||
@dataclass
class BoundingBox:
    """Axis-aligned rectangle stored as two corner points.

    The fields are the coordinates of the corners ``(x0, y0)`` and
    ``(x1, y1)``.
    """

    x0: float
    y0: float
    x1: float
    y1: float

    @classmethod
    def from_rectangle(cls, rect: "RectangleObject") -> "BoundingBox":
        """Build a box from the first four entries of ``rect``.

        Args:
            rect: Indexable rectangle whose entries 0 to 3 become
                ``x0``, ``y0``, ``x1`` and ``y1`` respectively.

        Returns:
            A new instance of ``cls``, so subclasses get their own type back.
        """
        # Coerce to plain floats so the fields really hold what they declare,
        # whatever numeric wrapper type the rectangle yields.
        return cls(float(rect[0]), float(rect[1]), float(rect[2]), float(rect[3]))
|
||||
|
||||
|
||||
@dataclass
class TextElement(Element):
    """A piece of extracted text together with the point it is attached to."""

    # The text content as delivered by the extractor.
    text: str
    # Horizontal and vertical position associated with the text.
    x: float
    y: float
|
||||
|
||||
@dataclass
class ImageElement(Element):
    """An image found on a page, identified by name and located by a box."""

    # Name under which the image is referenced.
    name: str
    # Rectangle covered by the image.
    bbox: BoundingBox
|
||||
|
||||
@dataclass
class PageReport:
    """Summary of a single page: its media box plus the elements found on it."""

    # Media box of the page.
    mediabox: BoundingBox
    # Text and image elements gathered from the page.
    elements: List[Element]
|
||||
|
||||
def _pdf_report(local_pdf_path: str, page: int) -> PageReport:
    """Collect the text runs and placed images of one PDF page.

    Args:
        local_pdf_path: Path of the PDF file, opened with ``PdfReader``.
        page: 1-indexed page number.

    Returns:
        A ``PageReport`` holding the page's media box and the text/image
        elements in the order the extraction visitors reported them.
    """
    reader = PdfReader(local_pdf_path)
    # Separate name so the integer ``page`` argument is not rebound.
    pdf_page = reader.pages[page - 1]
    resources = pdf_page.get("/Resources", {})
    xobjects = resources.get("/XObject", {})
    elements: List[Element] = []

    def visitor_body(text, cm, tm, font_dict, font_size):
        # Combine the text matrix with the current transformation matrix;
        # entries 4 and 5 of the product are recorded as the text position.
        txt2user = mult(tm, cm)
        elements.append(TextElement(text, txt2user[4], txt2user[5]))

    def visitor_op(op, args, cm, tm):
        # Only the "Do" operator (paint an XObject) is of interest here.
        if op != b"Do":
            return

        xobject_name = args[0]
        xobject = xobjects.get(xobject_name)
        if not xobject or xobject["/Subtype"] != "/Image":
            return

        # The image is placed according to the CTM: it occupies the unit
        # square mapped through ``cm``. All four corners are transformed so
        # the box is also correct for rotated or sheared placements, where
        # two opposite corners alone do not bound the image.
        corners = [
            _transform_point(cx, cy, cm)
            for cx, cy in ((0, 0), (1, 0), (0, 1), (1, 1))
        ]
        xs = [cx for cx, _ in corners]
        ys = [cy for _, cy in corners]
        elements.append(
            ImageElement(
                xobject_name,
                BoundingBox(min(xs), min(ys), max(xs), max(ys)),
            )
        )

    pdf_page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)

    return PageReport(
        mediabox=BoundingBox.from_rectangle(pdf_page.mediabox),
        elements=elements,
    )
|
||||
|
||||
|
||||
def _linearize_pdf_report(report: PageReport) -> str:
    """Render a page report as a single anchor-text string.

    The output starts with a ``Page dimensions: <x1>x<y1>`` line taken from
    the media box, followed by one bracketed tag per element in report order:
    ``[Image x0xy0 to x1xy1]`` for images and ``[XxY]text`` for text.
    Text elements that are empty or whitespace-only are omitted.

    Args:
        report: The page report to serialise.

    Returns:
        The concatenated description string.
    """
    # Gather the pieces and join once instead of growing a string with ``+=``.
    parts = [f"Page dimensions: {report.mediabox.x1:.1f}x{report.mediabox.y1:.1f}\n"]

    for element in report.elements:
        if isinstance(element, ImageElement):
            parts.append(
                f"[Image {element.bbox.x0:.0f}x{element.bbox.y0:.0f} to {element.bbox.x1:.0f}x{element.bbox.y1:.0f}]"
            )
        elif isinstance(element, TextElement):
            # Skip text runs that carry nothing but whitespace.
            if not element.text.strip():
                continue
            parts.append(f"[{element.x:.0f}x{element.y:.0f}]{element.text}")

    return "".join(parts)
|
||||
|
BIN
tests/gnarly_pdfs/edgar.pdf
Normal file
BIN
tests/gnarly_pdfs/edgar.pdf
Normal file
Binary file not shown.
@ -4,32 +4,37 @@ import json
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text
|
||||
|
||||
class AnchorTest(unittest.TestCase):
|
||||
def testExtractText(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
|
||||
reader = PdfReader(local_pdf_path)
|
||||
page = reader.pages[1]
|
||||
page = reader.pages[0]
|
||||
|
||||
def visitor_body(text, cm, tm, font_dict, font_size):
|
||||
print(repr(text))
|
||||
print(repr(text), cm, tm, font_size)
|
||||
|
||||
page.extract_text(visitor_text=visitor_body)
|
||||
def visitor_op(op, args, cm, tm):
|
||||
#print(op, args, cm, tm)
|
||||
pass
|
||||
|
||||
page.extract_text(visitor_text=visitor_body, visitor_operand_before=visitor_op)
|
||||
|
||||
def testAnchorBase(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf")
|
||||
|
||||
from pdelfin.prompts._adv_anchor import extract_page
|
||||
reader = PdfReader(local_pdf_path)
|
||||
pypage = reader.pages[1]
|
||||
report = _pdf_report(local_pdf_path, 2)
|
||||
|
||||
def visitor_body(text, cm, tm, font_dict, font_size):
|
||||
print(repr(text))
|
||||
print(report)
|
||||
|
||||
extract_page(pypage, reader, visitor_text=visitor_body)
|
||||
print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))
|
||||
|
||||
# report = parse_pdf(local_pdf_path)
|
||||
# print(json.dumps(report, indent=1))
|
||||
def testAnchorImage(self):
|
||||
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "some_ocr1.pdf")
|
||||
|
||||
# report = _pdf_report(local_pdf_path, 1)
|
||||
report = _pdf_report(local_pdf_path, 2)
|
||||
|
||||
# print(json.dumps(report, indent=1))
|
||||
print(report)
|
||||
|
||||
print(get_anchor_text(local_pdf_path, 2, pdf_engine="pdfreport"))
|
Loading…
x
Reference in New Issue
Block a user