mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-06-26 23:49:59 +00:00
Add debugging helper scripts
This commit is contained in:
parent
522f9d5f56
commit
36c82e0659
123
misc/ocrmypdf_compare.py
Normal file
123
misc/ocrmypdf_compare.py
Normal file
@ -0,0 +1,123 @@
|
||||
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Run OCRmyPDF on the same PDF with different options."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import shlex
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from subprocess import check_output, run
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pikepdf
|
||||
import pymupdf
|
||||
import streamlit as st
|
||||
from lxml import etree
|
||||
from streamlit_pdf_viewer import pdf_viewer
|
||||
|
||||
|
||||
def _command_inputs(label, suffix, in_path, out_path):
    """Render the command/environment inputs for one side of the comparison.

    Must be called inside the Streamlit column the widgets belong to.

    Args:
        label: Side label shown in the widget captions ("A" or "B").
        suffix: Suffix for the widget keys (``args<suffix>``, ``env<suffix>``).
        in_path: Path substituted for ``{in_}`` in the command template.
        out_path: Path substituted for ``{out}`` in the command template.

    Returns:
        A tuple ``(args, env_text)``: the argument list to execute and the raw
        text the user typed in the environment box.
    """
    cli = st.text_area(
        f"Command line arguments for {label}",
        key=f"args{suffix}",
        value="ocrmypdf {in_} {out}",
    )
    env_text = st.text_area(f"Environment variables for {label}", key=f"env{suffix}")
    # Split the template first and substitute per token, so that a temporary
    # directory containing spaces or backslashes is never re-parsed by shlex.
    args = [token.format(in_=in_path, out=out_path) for token in shlex.split(cli)]
    st.code(shlex.join(args))
    return args, env_text


def main():
    """Streamlit page: run two commands on one uploaded PDF and compare outputs.

    The page shows the uploaded PDF's metadata, lets the user edit two command
    lines (A and B) plus per-side environment overrides, runs both commands in
    a temporary directory, and then shows the Ghostscript version seen by each
    environment, the extracted text of each page side by side, and a PDF
    viewer for each output.
    """
    st.set_page_config(layout="wide")

    st.title("OCRmyPDF Compare")
    st.write("Run OCRmyPDF on the same PDF with different options.")

    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_pdf is None:
        return

    # getvalue() returns the whole buffer regardless of the stream position,
    # matching how misc/pdf_compare.py reads its uploads.
    pdf_bytes = uploaded_pdf.getvalue()

    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
        input_pdf = Path(d, "input.pdf")
        output_a = Path(d, "output1.pdf")
        output_b = Path(d, "output2.pdf")

        with st.expander("PDF Metadata"):
            with p.open_metadata() as meta:
                xml_txt = str(meta)
                parser = etree.XMLParser(remove_blank_text=True)
                tree = etree.fromstring(xml_txt, parser=parser)
                st.code(
                    etree.tostring(tree, pretty_print=True).decode("utf-8"),
                    language="xml",
                )
            st.write(p.docinfo)
            st.write("Number of pages:", len(p.pages))

        col1, col2 = st.columns(2)
        with col1:
            args1, env1 = _command_inputs("A", "1", str(input_pdf), str(output_a))
        with col2:
            args2, env2 = _command_inputs("B", "2", str(input_pdf), str(output_b))

        if not st.button("Execute and Compare"):
            return

        # NOTE(security): eval() executes whatever is typed into the
        # environment boxes (expected: a dict literal such as {"PATH": "..."}).
        # Acceptable only because this is a local debugging helper; never
        # expose this page to untrusted users. Evaluated once per side and
        # reused for both the command and the Ghostscript version probe.
        env_a = dict(os.environ, **eval(env1 or "{}"))
        env_b = dict(os.environ, **eval(env2 or "{}"))

        with st.spinner("Executing..."):
            input_pdf.write_bytes(pdf_bytes)
            result_a = run(args1, env=env_a)
            result_b = run(args2, env=env_b)

        # A nonzero status does not necessarily mean that no output was
        # written, so report it and decide below based on the files themselves.
        for label, result in (("A", result_a), ("B", result_b)):
            if result.returncode != 0:
                st.warning(f"Command {label} exited with status {result.returncode}")

        col1, col2 = st.columns(2)
        for column, label, env in ((col1, "A", env_a), (col2, "B", env_b)):
            with column:
                st.text(
                    f"Ghostscript version {label}: "
                    + check_output(["gs", "--version"], env=env, text=True)
                )

        missing = [path.name for path in (output_a, output_b) if not path.exists()]
        if missing:
            st.error("No output file was produced: " + ", ".join(missing))
            return

        # Close both documents before the temporary directory is removed.
        with pymupdf.open(str(output_a)) as doc1, pymupdf.open(str(output_b)) as doc2:
            if len(doc1) != len(doc2):
                st.warning(
                    f"Page counts differ (A: {len(doc1)}, B: {len(doc2)}); "
                    "only pages present in both outputs are compared."
                )
            for page_number, (page1, page2) in enumerate(zip(doc1, doc2), start=1):
                st.write(f"Page {page_number}")
                col1, col2 = st.columns(2)
                with col1, st.container(border=True):
                    st.write(page1.get_text())
                with col2, st.container(border=True):
                    st.write(page2.get_text())

        col1, col2 = st.columns(2)
        with col1, st.expander("PDF Viewer"):
            pdf_viewer(output_a)
        with col2, st.expander("PDF Viewer"):
            pdf_viewer(output_b)


if __name__ == "__main__":
    main()
|
83
misc/pdf_compare.py
Normal file
83
misc/pdf_compare.py
Normal file
@ -0,0 +1,83 @@
|
||||
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Compare two PDFs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pikepdf
|
||||
import pymupdf
|
||||
import streamlit as st
|
||||
from lxml import etree
|
||||
from streamlit_pdf_viewer import pdf_viewer
|
||||
|
||||
|
||||
def do_metadata(pdf):
    """Render a PDF's XMP metadata, document info and page count in Streamlit.

    Args:
        pdf: Source handed straight to ``pikepdf.open``; ``main`` in this
            script passes a ``BytesIO`` holding the uploaded file.
    """
    with pikepdf.open(pdf) as document:
        with document.open_metadata() as xmp:
            # Re-parse the XMP packet with blank text dropped so lxml can
            # re-indent it when pretty-printing.
            root = etree.fromstring(
                str(xmp), parser=etree.XMLParser(remove_blank_text=True)
            )
            pretty_xml = etree.tostring(root, pretty_print=True).decode("utf-8")
            st.code(pretty_xml, language="xml")
        st.write(document.docinfo)
        st.write("Number of pages:", len(document.pages))
|
||||
|
||||
|
||||
def main():
    """Streamlit page: upload two PDFs and compare them side by side.

    Shows, for both uploads, the metadata (via ``do_metadata``), the text
    extracted from each page by PyMuPDF, and an embedded PDF viewer.
    """
    st.set_page_config(layout="wide")

    st.title("PDF Compare")
    st.write("Compare two PDFs.")

    col1, col2 = st.columns(2)
    with col1:
        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
    with col2:
        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
    if uploaded_pdf1 is None or uploaded_pdf2 is None:
        return

    pdf_bytes1 = uploaded_pdf1.getvalue()
    pdf_bytes2 = uploaded_pdf2.getvalue()

    with st.expander("PDF Metadata"):
        col1, col2 = st.columns(2)
        with col1:
            do_metadata(BytesIO(pdf_bytes1))
        with col2:
            do_metadata(BytesIO(pdf_bytes2))

    with TemporaryDirectory() as d:
        path1 = Path(d, "1.pdf")
        path2 = Path(d, "2.pdf")
        path1.write_bytes(pdf_bytes1)
        path2.write_bytes(pdf_bytes2)

        with st.expander("Text"):
            # Close both documents before the temporary directory is removed.
            with pymupdf.open(str(path1)) as doc1, pymupdf.open(str(path2)) as doc2:
                if len(doc1) != len(doc2):
                    st.warning(
                        f"Page counts differ (left: {len(doc1)}, "
                        f"right: {len(doc2)}); only pages present in both "
                        "files are compared."
                    )
                for page_number, (page1, page2) in enumerate(
                    zip(doc1, doc2), start=1
                ):
                    st.write(f"Page {page_number}")
                    col1, col2 = st.columns(2)
                    with col1, st.container(border=True):
                        st.write(page1.get_text())
                    with col2, st.container(border=True):
                        st.write(page2.get_text())

        with st.expander("PDF Viewer"):
            col1, col2 = st.columns(2)
            with col1:
                pdf_viewer(path1, key='pdf_viewer1', render_text=True)
            with col2:
                pdf_viewer(path2, key='pdf_viewer2', render_text=True)


if __name__ == "__main__":
    main()
|
46
misc/pdf_text_diff.py
Normal file
46
misc/pdf_text_diff.py
Normal file
@ -0,0 +1,46 @@
|
||||
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
"""Compare text in PDFs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from subprocess import run
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
|
||||
# Text extraction back ends that main() knows how to drive.
_SUPPORTED_ENGINES = ('pdftotext',)


def _extract_text(pdf):
    """Return the layout-preserving text of an open binary PDF file as bytes.

    The PDF is streamed to ``pdftotext`` on stdin and the text is read back
    from stdout; a failing ``pdftotext`` raises ``CalledProcessError``.
    """
    result = run(
        ['pdftotext', '-layout', '-', '-'], stdin=pdf, capture_output=True, check=True
    )
    return result.stdout


# The docstring of main() is left untouched because Typer shows it as the
# command's --help text; implementation notes therefore live in comments.
#
# pdf1, pdf2: binary file handles opened by Typer for the two PDFs.
# engine:     extraction back end; only 'pdftotext' is implemented, anything
#             else is rejected rather than silently ignored.
# Exit status: 0 when the stripped texts are equal, 1 when they differ.
def main(
    pdf1: Annotated[typer.FileBinaryRead, typer.Argument()],
    pdf2: Annotated[typer.FileBinaryRead, typer.Argument()],
    engine: Annotated[str, typer.Option()] = 'pdftotext',
):
    """Compare text in PDFs."""
    if engine not in _SUPPORTED_ENGINES:
        raise typer.BadParameter(
            f"unsupported engine {engine!r}; "
            f"supported: {', '.join(_SUPPORTED_ENGINES)}"
        )

    text1 = _extract_text(pdf1)
    text2 = _extract_text(pdf2)

    with NamedTemporaryFile() as f1, NamedTemporaryFile() as f2:
        f1.write(text1)
        f1.flush()
        f2.write(text2)
        f2.flush()
        diff = run(
            ['diff', '--color=always', '--side-by-side', f1.name, f2.name],
            capture_output=True,
        )
    # diff exits 0 (same) or 1 (different) on success; anything higher means
    # diff itself failed, so surface that instead of paging empty output.
    if diff.returncode > 1:
        diff.check_returncode()
    run(['less', '-R'], input=diff.stdout, check=True)

    if text1.strip() != text2.strip():
        # A value returned from a Typer command is discarded, so the exit
        # status has to be signalled with typer.Exit.
        raise typer.Exit(code=1)

    return 0


if __name__ == '__main__':
    typer.run(main)
|
Loading…
x
Reference in New Issue
Block a user