2024-09-17 16:26:55 +00:00
|
|
|
import subprocess
|
2024-09-18 22:52:42 +00:00
|
|
|
from typing import Literal
|
|
|
|
|
2024-09-17 16:26:55 +00:00
|
|
|
import pymupdf
|
2024-09-17 18:47:27 +00:00
|
|
|
import pypdfium2 as pdfium
|
2024-09-17 16:26:55 +00:00
|
|
|
|
|
|
|
|
2024-09-18 22:52:42 +00:00
|
|
|
def get_page_text(
    local_pdf_path: str, page_num: int, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"] = "pdftotext"
) -> str:
    """Extract the text of a single page from a PDF on local disk.

    Args:
        local_pdf_path: Filesystem path of the PDF to read.
        page_num: 1-based page number. It is passed unchanged to
            ``pdftotext -f/-l`` and converted to a 0-based index
            (``page_num - 1``) for the ``pymupdf`` and ``pdfium`` engines.
        pdf_engine: Which backend performs the extraction.

    Returns:
        The text of the requested page.

    Raises:
        AssertionError: If the ``pdftotext`` process exits with a non-zero
            status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not one of the supported
            values.
    """
    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                "-f",
                str(page_num),
                "-l",
                str(page_num),
                local_pdf_path,
                "-",
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly rather than via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is kept as
        # AssertionError so existing callers keep working.
        if pdftotext_result.returncode != 0:
            stderr_text = pdftotext_result.stderr.decode("utf-8", errors="replace").strip()
            raise AssertionError(
                f"pdftotext exited with status {pdftotext_result.returncode}: {stderr_text}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document even if the page lookup
        # or text extraction raises.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return pm_doc[page_num - 1].get_text()
    elif pdf_engine == "pdfium":
        pdf = pdfium.PdfDocument(local_pdf_path, autoclose=True)
        try:
            page = pdf[page_num - 1]
            textpage = page.get_textpage()

            # Extract text from the whole page
            return textpage.get_text_range()
        finally:
            # Always release the document, including on errors.
            pdf.close()
    else:
        raise NotImplementedError(f"Unsupported pdf_engine: {pdf_engine!r}")
|
|
|
|
|
|
|
|
|
2024-09-18 22:52:42 +00:00
|
|
|
def get_document_text(local_pdf_path: str, pdf_engine: Literal["pdftotext", "pymupdf", "pdfium"] = "pdftotext") -> str:
    """Extract the text of every page of a PDF on local disk.

    Args:
        local_pdf_path: Filesystem path of the PDF to read.
        pdf_engine: Which backend performs the extraction.

    Returns:
        The text of the whole document. For the ``pymupdf`` and ``pdfium``
        engines each page's text is followed by a newline; for ``pdftotext``
        the tool's stdout is returned decoded as UTF-8.

    Raises:
        AssertionError: If the ``pdftotext`` process exits with a non-zero
            status.
        subprocess.TimeoutExpired: If ``pdftotext`` runs longer than 60 seconds.
        NotImplementedError: If ``pdf_engine`` is not one of the supported
            values.
    """
    if pdf_engine == "pdftotext":
        pdftotext_result = subprocess.run(
            [
                "pdftotext",
                local_pdf_path,
                "-",
            ],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Raised explicitly rather than via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is kept as
        # AssertionError so existing callers keep working.
        if pdftotext_result.returncode != 0:
            stderr_text = pdftotext_result.stderr.decode("utf-8", errors="replace").strip()
            raise AssertionError(
                f"pdftotext exited with status {pdftotext_result.returncode}: {stderr_text}"
            )

        return pdftotext_result.stdout.decode("utf-8")
    elif pdf_engine == "pymupdf":
        # The context manager closes the document even if extraction raises.
        with pymupdf.open(local_pdf_path) as pm_doc:
            return "".join(page.get_text() + "\n" for page in pm_doc)
    elif pdf_engine == "pdfium":
        pdf = pdfium.PdfDocument(local_pdf_path, autoclose=True)
        try:
            page_texts = []
            for page in pdf:
                textpage = page.get_textpage()
                page_texts.append(textpage.get_text_range() + "\n")
            return "".join(page_texts)
        finally:
            # Always release the document, including on errors.
            pdf.close()
    else:
        raise NotImplementedError(f"Unsupported pdf_engine: {pdf_engine!r}")
|