Add debugging helper scripts

This commit is contained in:
James R. Barlow 2025-01-01 18:03:15 -08:00
parent 522f9d5f56
commit 36c82e0659
No known key found for this signature in database
GPG Key ID: E54A300D567E1260
3 changed files with 252 additions and 0 deletions

123
misc/ocrmypdf_compare.py Normal file
View File

@ -0,0 +1,123 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Run OCRmyPDF on the same PDF with different options."""
from __future__ import annotations
import os
import shlex
from io import BytesIO
from pathlib import Path
from subprocess import check_output, run
from tempfile import TemporaryDirectory
import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer
def _command_editor(label, index, workdir):
    """Render the command-line and environment inputs for one run.

    Args:
        label: Letter shown in the widget captions ("A" or "B").
        index: Number used for the widget keys and the output file name.
        workdir: Directory that will hold ``input.pdf`` and the output PDF.

    Returns:
        A ``(args, env_text)`` tuple: the command split into an argument list
        with ``{in_}``/``{out}`` substituted, and the raw text of the
        environment box.
    """
    cli = st.text_area(
        f"Command line arguments for {label}",
        key=f"args{index}",
        value="ocrmypdf {in_} {out}",
    )
    env_text = st.text_area(f"Environment variables for {label}", key=f"env{index}")
    args = shlex.split(
        cli.format(
            in_=os.path.join(workdir, "input.pdf"),
            out=os.path.join(workdir, f"output{index}.pdf"),
        )
    )
    # Echo the fully expanded command so the user can see what will be run.
    st.code(shlex.join(args))
    return args, env_text


def _child_env(env_text):
    """Return ``os.environ`` overlaid with the overrides typed by the user.

    An empty box means "no overrides"; otherwise the text is evaluated as a
    Python expression that must produce a mapping with string keys.
    """
    # SECURITY: eval() executes whatever Python is typed into the form. This is
    # left in place because the script is a local debugging helper, but it must
    # never be served to untrusted users. If plain dict literals are all that
    # is needed, ast.literal_eval() would be a safer replacement.
    overrides = eval(env_text or "{}")
    return dict(os.environ, **overrides)


def main():
    """Streamlit page that runs two commands on one PDF and compares results.

    The uploaded PDF's metadata is displayed, then the two user-supplied
    command lines (by default ``ocrmypdf {in_} {out}``) are executed in a
    temporary directory with their respective environments. The Ghostscript
    version seen by each environment, the per-page extracted text and the
    resulting PDFs are shown side by side.
    """
    st.set_page_config(layout="wide")
    st.title("OCRmyPDF Compare")
    st.write("Run OCRmyPDF on the same PDF with different options.")

    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_pdf is None:
        return
    # getvalue() returns the whole upload regardless of the stream position,
    # matching how pdf_compare.py reads its uploads.
    pdf_bytes = uploaded_pdf.getvalue()

    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
        with st.expander("PDF Metadata"):
            with p.open_metadata() as meta:
                xml_txt = str(meta)
            parser = etree.XMLParser(remove_blank_text=True)
            tree = etree.fromstring(xml_txt, parser=parser)
            st.code(
                etree.tostring(tree, pretty_print=True).decode("utf-8"),
                language="xml",
            )
            st.write(p.docinfo)
            st.write("Number of pages:", len(p.pages))

        col1, col2 = st.columns(2)
        with col1:
            args1, env_text1 = _command_editor("A", 1, d)
        with col2:
            args2, env_text2 = _command_editor("B", 2, d)

        if not st.button("Execute and Compare"):
            return

        # Parse each environment once and reuse it for both the command and
        # the Ghostscript version probe.
        env1 = _child_env(env_text1)
        env2 = _child_env(env_text2)

        with st.spinner("Executing..."):
            Path(d, "input.pdf").write_bytes(pdf_bytes)
            result1 = run(args1, env=env1)
            result2 = run(args2, env=env2)
            gs_version1 = check_output(["gs", "--version"], env=env1, text=True)
            gs_version2 = check_output(["gs", "--version"], env=env2, text=True)

        # A non-zero status does not necessarily mean that no output was
        # written, so report it and only stop when an output file is missing.
        for label, result in (("A", result1), ("B", result2)):
            if result.returncode != 0:
                st.warning(
                    f"Command {label} exited with status {result.returncode}."
                )
        missing = [
            name
            for name in ("output1.pdf", "output2.pdf")
            if not Path(d, name).exists()
        ]
        if missing:
            st.error("No output PDF was produced: " + ", ".join(missing))
            return

        col1, col2 = st.columns(2)
        with col1:
            st.text("Ghostscript version A: " + gs_version1)
        with col2:
            st.text("Ghostscript version B: " + gs_version2)

        # Close both documents before the temporary directory is removed.
        with (
            pymupdf.open(os.path.join(d, "output1.pdf")) as doc1,
            pymupdf.open(os.path.join(d, "output2.pdf")) as doc2,
        ):
            # zip() stops at the shorter document.
            for i, (page1, page2) in enumerate(zip(doc1, doc2)):
                st.write(f"Page {i+1}")
                col1, col2 = st.columns(2)
                with col1, st.container(border=True):
                    st.write(page1.get_text())
                with col2, st.container(border=True):
                    st.write(page2.get_text())

        col1, col2 = st.columns(2)
        # Explicit keys keep the two viewer components distinct, as in
        # pdf_compare.py.
        with col1, st.expander("PDF Viewer"):
            pdf_viewer(Path(d, "output1.pdf"), key="pdf_viewer1")
        with col2, st.expander("PDF Viewer"):
            pdf_viewer(Path(d, "output2.pdf"), key="pdf_viewer2")


if __name__ == "__main__":
    main()

83
misc/pdf_compare.py Normal file
View File

@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT
"""Compare two PDFs."""
from __future__ import annotations
import os
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory
import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer
def do_metadata(pdf):
    """Show the XMP metadata, document info and page count of a PDF.

    Args:
        pdf: Anything ``pikepdf.open`` accepts; the caller in this file passes
            a ``BytesIO`` holding the uploaded PDF.
    """
    with pikepdf.open(pdf) as document:
        with document.open_metadata() as xmp:
            xmp_text = str(xmp)

        # Re-parse without insignificant whitespace so lxml can re-indent it.
        root = etree.fromstring(
            xmp_text, parser=etree.XMLParser(remove_blank_text=True)
        )
        pretty_xml = etree.tostring(root, pretty_print=True).decode("utf-8")
        st.code(pretty_xml, language="xml")

        st.write(document.docinfo)
        st.write("Number of pages:", len(document.pages))
def main():
    """Streamlit page that shows two uploaded PDFs side by side.

    Once both files are uploaded, the page displays their metadata, the text
    extracted from each page by PyMuPDF, and an embedded viewer for each PDF.
    """
    st.set_page_config(layout="wide")
    st.title("PDF Compare")
    st.write("Compare two PDFs.")

    col1, col2 = st.columns(2)
    with col1:
        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
    with col2:
        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
    if uploaded_pdf1 is None or uploaded_pdf2 is None:
        return

    pdf_bytes1 = uploaded_pdf1.getvalue()
    pdf_bytes2 = uploaded_pdf2.getvalue()

    with st.expander("PDF Metadata"):
        col1, col2 = st.columns(2)
        with col1:
            do_metadata(BytesIO(pdf_bytes1))
        with col2:
            do_metadata(BytesIO(pdf_bytes2))

    with TemporaryDirectory() as d:
        Path(d, "1.pdf").write_bytes(pdf_bytes1)
        Path(d, "2.pdf").write_bytes(pdf_bytes2)

        with st.expander("Text"):
            # Close both documents before the temporary directory is removed.
            with (
                pymupdf.open(os.path.join(d, "1.pdf")) as doc1,
                pymupdf.open(os.path.join(d, "2.pdf")) as doc2,
            ):
                # zip() below stops at the shorter document, so say so rather
                # than silently hiding the extra pages.
                if doc1.page_count != doc2.page_count:
                    st.warning(
                        f"Page counts differ ({doc1.page_count} vs "
                        f"{doc2.page_count}); only pages present in both "
                        "PDFs are compared."
                    )
                for i, (page1, page2) in enumerate(zip(doc1, doc2)):
                    st.write(f"Page {i+1}")
                    col1, col2 = st.columns(2)
                    with col1, st.container(border=True):
                        st.write(page1.get_text())
                    with col2, st.container(border=True):
                        st.write(page2.get_text())

        with st.expander("PDF Viewer"):
            col1, col2 = st.columns(2)
            with col1:
                pdf_viewer(Path(d, "1.pdf"), key='pdf_viewer1', render_text=True)
            with col2:
                pdf_viewer(Path(d, "2.pdf"), key='pdf_viewer2', render_text=True)


if __name__ == "__main__":
    main()

46
misc/pdf_text_diff.py Normal file
View File

@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Compare text in PDFs."""
from __future__ import annotations
from subprocess import run
from tempfile import NamedTemporaryFile
from typing import Annotated
import typer
def _pdftotext(pdf):
    """Return the text that ``pdftotext -layout`` extracts from *pdf*.

    *pdf* is an open binary file handed to the child process as its stdin;
    the extracted text is captured from stdout and returned as bytes.
    ``check=True`` makes a failing ``pdftotext`` raise ``CalledProcessError``.
    """
    completed = run(
        ['pdftotext', '-layout', '-', '-'],
        stdin=pdf,
        capture_output=True,
        check=True,
    )
    return completed.stdout


def main(
    pdf1: Annotated[typer.FileBinaryRead, typer.Argument()],
    pdf2: Annotated[typer.FileBinaryRead, typer.Argument()],
    engine: Annotated[str, typer.Option()] = 'pdftotext',
):
    """Compare text in PDFs.

    Extracts the text of both PDFs with pdftotext and pages through a
    side-by-side diff. Exits with status 1 if the extracted text differs.
    """
    # Only one extraction engine is implemented; reject anything else instead
    # of silently ignoring the option.
    if engine != 'pdftotext':
        raise typer.BadParameter(
            f"unsupported engine {engine!r}; only 'pdftotext' is implemented",
            param_hint='--engine',
        )

    text1 = _pdftotext(pdf1)
    text2 = _pdftotext(pdf2)

    with NamedTemporaryFile() as f1, NamedTemporaryFile() as f2:
        f1.write(text1)
        f1.flush()
        f2.write(text2)
        f2.flush()
        # No check=True here: diff exits non-zero whenever the inputs differ.
        diff = run(
            ['diff', '--color=always', '--side-by-side', f1.name, f2.name],
            capture_output=True,
        )
    # -R lets less pass the colour escape sequences produced by diff through.
    run(['less', '-R'], input=diff.stdout, check=True)

    if text1.strip() != text2.strip():
        # typer.run() discards the command's return value, so a plain
        # ``return 1`` would still exit with status 0.
        raise typer.Exit(code=1)
    return 0


if __name__ == '__main__':
    typer.run(main)