2022-07-28 01:06:46 -07:00
|
|
|
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
|
|
|
# SPDX-License-Identifier: MPL-2.0
|
2019-12-30 17:51:09 -08:00
|
|
|
|
2022-07-23 00:39:24 -07:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2024-11-10 02:05:57 -08:00
|
|
|
import pickle
|
2023-10-12 16:30:16 -07:00
|
|
|
from io import BytesIO
|
|
|
|
from pathlib import Path
|
2019-12-30 17:51:09 -08:00
|
|
|
|
|
|
|
import pytest
|
2023-10-15 00:49:58 -07:00
|
|
|
from pdfminer.high_level import extract_text
|
2019-12-30 17:51:09 -08:00
|
|
|
|
|
|
|
import ocrmypdf
|
2024-11-10 02:05:57 -08:00
|
|
|
import ocrmypdf._pipelines
|
2023-10-30 00:05:34 -07:00
|
|
|
import ocrmypdf.api
|
2019-12-30 17:51:09 -08:00
|
|
|
|
|
|
|
|
2020-04-14 23:18:52 -07:00
|
|
|
def test_language_list():
    """Call ``ocrmypdf.ocr`` with ``language`` given as a list of codes.

    The input path names a file that is not expected to exist, so the call
    must raise either ``InputFileError`` or ``MissingDependencyError``.
    """
    acceptable_errors = (
        ocrmypdf.exceptions.InputFileError,
        ocrmypdf.exceptions.MissingDependencyError,
    )
    languages = ['eng', 'deu']
    with pytest.raises(acceptable_errors):
        ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=languages)
|
2020-04-30 03:38:27 -07:00
|
|
|
|
|
|
|
|
2023-10-12 16:30:16 -07:00
|
|
|
def test_stream_api(resources: Path):
    """Run ``ocrmypdf.ocr`` with file-like objects as both input and output.

    Args:
        resources: Directory fixture containing the test PDF ``graph.pdf``.

    The output is written to an in-memory buffer, and the test checks that
    the first 1024 bytes of that buffer contain the ``%PDF`` marker.
    """
    out = BytesIO()
    # Open the input inside a ``with`` block so the file handle is closed
    # even when the OCR call raises; a bare ``open()`` would leak it.
    with (resources / 'graph.pdf').open('rb') as in_:
        # NOTE(review): tesseract_timeout=0.0 presumably keeps the test fast
        # by not waiting on Tesseract -- confirm against the ocrmypdf docs.
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)
    out.seek(0)
    assert b'%PDF' in out.read(1024)
|
2023-10-12 16:30:16 -07:00
|
|
|
|
|
|
|
|
2024-04-07 01:38:55 -07:00
|
|
|
def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):
    """Pass an in-memory binary buffer as the ``sidecar`` argument.

    After OCR of ``ccitt.pdf`` the buffer must contain the bytes ``the``.
    """
    sidecar_buffer = BytesIO()
    source_pdf = resources / 'ccitt.pdf'
    plugin_list = ['tests/plugins/tesseract_cache.py']

    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        plugins=plugin_list,
        sidecar=sidecar_buffer,
    )

    sidecar_buffer.seek(0)
    sidecar_bytes = sidecar_buffer.getvalue()
    assert b'the' in sidecar_bytes
|
|
|
|
|
|
|
|
|
2023-10-15 00:49:58 -07:00
|
|
|
def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
    """Exercise the two-step hOCR API on a multi-page PDF.

    First ``_pdf_to_hocr`` writes per-page hOCR files into ``outdir``; the
    test expects files for pages 1 and 6 but none for page 4. Then
    ``_hocr_to_ocr_pdf`` must produce ``outpdf`` from that directory.
    """
    source_pdf = resources / 'multipage.pdf'
    ocrmypdf.api._pdf_to_hocr(
        source_pdf,
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )

    first_page_hocr = outdir / '000001_ocr_hocr.hocr'
    sixth_page_hocr = outdir / '000006_ocr_hocr.hocr'
    fourth_page_hocr = outdir / '000004_ocr_hocr.hocr'
    assert first_page_hocr.exists()
    assert sixth_page_hocr.exists()
    # NOTE(review): page 4 presumably already has text and is skipped
    # because of skip_text=True -- confirm against multipage.pdf.
    assert not fourth_page_hocr.exists()

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)
    assert outpdf.exists()
|
|
|
|
|
2023-10-13 03:25:12 -07:00
|
|
|
|
|
|
|
def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
    """Verify that edits to an hOCR file carry through to the final PDF.

    The hOCR generated for page 1 of ``ccitt.pdf`` is rewritten with every
    ``the`` replaced by ``hocr``. The PDF built from the edited file must
    then contain ``hocr`` and no ``the`` in its extracted text.
    """
    ocrmypdf.api._pdf_to_hocr(
        resources / 'ccitt.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )

    hocr_path = outdir / '000001_ocr_hocr.hocr'
    assert hocr_path.exists()

    # Tamper with the recognized text so the PDF step can be told apart
    # from a fresh OCR run.
    original_hocr = hocr_path.read_text(encoding='utf-8')
    hocr_path.write_text(original_hocr.replace('the', 'hocr'), encoding='utf-8')

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)

    text = extract_text(outpdf)
    replacement_present = 'hocr' in text
    original_absent = 'the' not in text
    assert replacement_present and original_absent
|
2024-04-07 01:38:55 -07:00
|
|
|
|
2024-11-10 02:05:57 -08:00
|
|
|
|
|
|
|
def test_hocr_result_json():
    """Check ``HOCRResult`` JSON serialization and its round trip.

    ``to_json`` must produce the exact expected string, and ``from_json``
    applied to that output must compare equal to the original object.
    """
    result_cls = ocrmypdf._pipelines._common.HOCRResult
    result = result_cls(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )

    expected_json = (
        '{"pageno": 1, "pdf_page_from_image": {"Path": "a"}, "hocr": {"Path": "b"}, '
        '"textpdf": {"Path": "c"}, "orientation_correction": 180}'
    )
    assert result.to_json() == expected_json

    round_tripped = result_cls.from_json(result.to_json())
    assert round_tripped == result
|
|
|
|
|
|
|
|
|
|
|
|
def test_hocr_result_pickle():
    """Check that an ``HOCRResult`` survives a pickle round trip unchanged."""
    field_values = {
        'pageno': 1,
        'pdf_page_from_image': Path('a'),
        'hocr': Path('b'),
        'textpdf': Path('c'),
        'orientation_correction': 180,
    }
    result = ocrmypdf._pipelines._common.HOCRResult(**field_values)

    # The payload is produced locally in this test, so unpickling it is safe.
    payload = pickle.dumps(result)
    restored = pickle.loads(payload)
    assert result == restored
|