mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-26 14:47:13 +00:00
Removing pymupdf
This commit is contained in:
parent
ddeea92591
commit
2ab7cb280c
@ -2,7 +2,6 @@ torchvision
|
||||
cached-path
|
||||
smart_open
|
||||
pypdf
|
||||
pymupdf
|
||||
pypdfium2
|
||||
lingua-language-detector
|
||||
Pillow
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
|
||||
# pdftotext
|
||||
# pdfium
|
||||
# pymupdf
|
||||
# pypdf
|
||||
|
||||
import random
|
||||
@ -16,7 +15,6 @@ from dataclasses import dataclass
|
||||
from typing import List, Literal
|
||||
|
||||
import ftfy
|
||||
import pymupdf
|
||||
import pypdfium2 as pdfium
|
||||
from pypdf import PdfReader
|
||||
from pypdf.generic import RectangleObject
|
||||
@ -25,7 +23,7 @@ from olmocr.filter.coherency import get_document_coherency
|
||||
|
||||
|
||||
def get_anchor_text(
|
||||
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
|
||||
local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pypdf", "topcoherency", "pdfreport"], target_length: int = 4000
|
||||
) -> str:
|
||||
assert page > 0, "Pages are 1-indexed in pdf-land"
|
||||
|
||||
@ -35,12 +33,9 @@ def get_anchor_text(
|
||||
return _get_pdfium(local_pdf_path, page)
|
||||
elif pdf_engine == "pypdf":
|
||||
return _get_pypdf_raw(local_pdf_path, page)
|
||||
elif pdf_engine == "pymupdf":
|
||||
return _get_pymupdf(local_pdf_path, page)
|
||||
elif pdf_engine == "topcoherency":
|
||||
options = {
|
||||
"pdftotext": _get_pdftotext(local_pdf_path, page),
|
||||
"pymupdf": _get_pymupdf(local_pdf_path, page),
|
||||
"pdfium": _get_pdfium(local_pdf_path, page),
|
||||
"pypdf_raw": _get_pypdf_raw(local_pdf_path, page),
|
||||
}
|
||||
@ -70,11 +65,6 @@ def _get_pdftotext(local_pdf_path: str, page: int) -> str:
|
||||
return pdftotext_result.stdout.decode("utf-8")
|
||||
|
||||
|
||||
def _get_pymupdf(local_pdf_path: str, page: int) -> str:
|
||||
pm_doc = pymupdf.open(local_pdf_path)
|
||||
return pm_doc[page - 1].get_text()
|
||||
|
||||
|
||||
def _get_pypdf_raw(local_pdf_path: str, page: int) -> str:
|
||||
reader = PdfReader(local_pdf_path)
|
||||
pypage = reader.pages[page - 1]
|
||||
|
||||
@ -22,7 +22,6 @@ dependencies = [
|
||||
"cached-path",
|
||||
"smart_open",
|
||||
"pypdf>=5.2.0",
|
||||
"pymupdf",
|
||||
"pypdfium2",
|
||||
"cryptography",
|
||||
"lingua-language-detector",
|
||||
|
||||
@ -43,27 +43,17 @@ class TestCoherencyScores(unittest.TestCase):
|
||||
page=2,
|
||||
pdf_engine="pdftotext",
|
||||
)
|
||||
pymupdf_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page=2,
|
||||
pdf_engine="pymupdf",
|
||||
)
|
||||
pdfium_text = get_anchor_text(
|
||||
os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"),
|
||||
page=2,
|
||||
pdf_engine="pdfium",
|
||||
)
|
||||
|
||||
# pdftotext_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pdftotext")
|
||||
# pymupdf_text = get_document_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), pdf_engine="pymupdf")
|
||||
|
||||
print("pdftotext_text", pdftotext_score := get_document_coherency(pdftotext_text))
|
||||
print("pymupdf_text", pymupdf_score := get_document_coherency(pymupdf_text))
|
||||
print("pdfium_text", pdfium_score := get_document_coherency(pdfium_text))
|
||||
|
||||
self.assertLess(pdftotext_score, pymupdf_score)
|
||||
self.assertLess(pdfium_score, pymupdf_score)
|
||||
self.assertLess(pdfium_score, pdftotext_score)
|
||||
|
||||
anchor_text = get_anchor_text(os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "pdftotext_two_column_issue.pdf"), 2, pdf_engine="topcoherency")
|
||||
|
||||
self.assertEqual(anchor_text, pymupdf_text)
|
||||
self.assertEqual(anchor_text, pdfium_text)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user