Removing pymupdf

This commit is contained in:
Jake Poznanski 2025-01-30 15:51:54 -08:00
parent ddeea92591
commit 2ab7cb280c
4 changed files with 3 additions and 25 deletions

View File

@@ -2,7 +2,6 @@ torchvision
cached-path
smart_open
pypdf
pymupdf
pypdfium2
lingua-language-detector
Pillow

View File

@@ -4,7 +4,6 @@
# pdftotext
# pdfium
# pymupdf
# pypdf
import random
@@ -16,7 +15,6 @@ from dataclasses import dataclass
from typing import List, Literal
import ftfy
import pymupdf
import pypdfium2 as pdfium
from pypdf import PdfReader
from pypdf.generic import RectangleObject
@@ -25,7 +23,7 @@ from olmocr.filter.coherency import get_document_coherency
def get_anchor_text(
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
) -> str:
assert page > 0, "Pages are 1-indexed in pdf-land"
@@ -35,12 +33,9 @@ def get_anchor_text(
return _get_pdfium(local_pdf_path, page)
elif pdf_engine == "pypdf":
return _get_pypdf_raw(local_pdf_path, page)
elif pdf_engine == "pymupdf":
return _get_pymupdf(local_pdf_path, page)
elif pdf_engine == "topcoherency":
options = {
"pdftotext": _get_pdftotext(local_pdf_path, page),
"pymupdf": _get_pymupdf(local_pdf_path, page),
"pdfium": _get_pdfium(local_pdf_path, page),
"pypdf_raw": _get_pypdf_raw(local_pdf_path, page),
}
@@ -70,11 +65,6 @@ def _get_pdftotext(local_pdf_path: str, page: int) -> str:
return pdftotext_result.stdout.decode("utf-8")
def _get_pymupdf(local_pdf_path: str, page: int) -> str:
pm_doc = pymupdf.open(local_pdf_path)
return pm_doc[page - 1].get_text()
def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
reader = PdfReader(local_pdf_path)
pypage = reader.pages[page - 1]

View File

@@ -22,7 +22,6 @@ dependencies = [
"cached-path",
"smart_open",
"pypdf>=5.2.0",
"pymupdf",
"pypdfium2",
"cryptography",
"lingua-language-detector",

View File

@@ -43,27 +43,17 @@ class TestCoherencyScores(unittest.TestCase):
page=2,
pdf_engine="pdftotext",
)
pymupdf_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pymupdf",
)
pdfium_text = get_anchor_text(
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
page=2,
pdf_engine="pdfium",
)
# pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
# pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
self.assertLess(pdftotext_score, pymupdf_score)
self.assertLess(pdfium_score, pymupdf_score)
self.assertLess(pdfium_score, pdftotext_score)
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
self.assertEqual(anchor_text, pymupdf_text)
self.assertEqual(anchor_text, pdfium_text)