olmocr/pdelfin/extract_text.py

82 lines
2.2 KiB
Python
Raw Normal View History

import subprocess
2024-09-18 22:52:42 +00:00
from typing import Literal
import pymupdf
import pypdfium2 as pdfium
2024-09-18 22:52:42 +00:00
def get_page_text(
local_pdf_path: str, page_num: int, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"] = "pdftotext"
) -> str:
if pdf_engine == "pdftotext":
pdftotext_result = subprocess.run(
[
"pdftotext",
"-f",
str(page_num),
"-l",
str(page_num),
local_pdf_path,
"-",
],
timeout=60,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
assert pdftotext_result.returncode == 0
return pdftotext_result.stdout.decode("utf-8")
elif pdf_engine == "pymupdf":
pm_doc = pymupdf.open(local_pdf_path)
return pm_doc[page_num - 1].get_text()
elif pdf_engine == "pdfium":
2024-09-17 20:31:32 +00:00
pdf = pdfium.PdfDocument(local_pdf_path, autoclose=True)
page = pdf[page_num - 1]
textpage = page.get_textpage()
# Extract text from the whole page
2024-09-17 20:31:32 +00:00
result = textpage.get_text_range()
pdf.close()
return result
else:
raise NotImplementedError()
2024-09-18 22:52:42 +00:00
def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"] = "pdftotext") -> str:
if pdf_engine == "pdftotext":
pdftotext_result = subprocess.run(
[
"pdftotext",
local_pdf_path,
"-",
],
timeout=60,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
assert pdftotext_result.returncode == 0
return pdftotext_result.stdout.decode("utf-8")
elif pdf_engine == "pymupdf":
pm_doc = pymupdf.open(local_pdf_path)
result = ""
for page in pm_doc:
result += page.get_text()
result += "\n"
return result
elif pdf_engine == "pdfium":
2024-09-17 20:31:32 +00:00
pdf = pdfium.PdfDocument(local_pdf_path, autoclose=True)
result = ""
for page in pdf:
textpage = page.get_textpage()
result += textpage.get_text_range()
result += "\n"
2024-09-17 20:31:32 +00:00
pdf.close()
return result
else:
2024-09-18 22:52:42 +00:00
raise NotImplementedError()