# OCRmyPDF/tests/test_api.py

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pickle
from io import BytesIO
from pathlib import Path

import pytest
from pdfminer.high_level import extract_text

import ocrmypdf
import ocrmypdf._pipelines
import ocrmypdf.api


def test_language_list():
    """Call ``ocrmypdf.ocr`` with ``language`` given as a list.

    The input file does not exist, so the call is expected to fail. The test
    passes only if the failure is an ``InputFileError`` or a
    ``MissingDependencyError`` (presumably raised when a requested language
    pack is not installed -- confirm); any other exception type fails it.
    """
    acceptable_errors = (
        ocrmypdf.exceptions.InputFileError,
        ocrmypdf.exceptions.MissingDependencyError,
    )
    languages = ['eng', 'deu']

    # pytest.raises needs a tuple (not a list) of exception types.
    with pytest.raises(acceptable_errors):
        ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=languages)


def test_stream_api(resources: Path):
    """Run ``ocrmypdf.ocr`` with binary streams as both input and output.

    Args:
        resources: Directory fixture containing the test file ``graph.pdf``.

    The test passes when the first 1024 bytes written to the output stream
    contain the ``%PDF`` marker.
    """
    out = BytesIO()

    # Open the input through a context manager so the file handle is closed
    # even when ocr() raises; the previous version never closed it.
    with (resources / 'graph.pdf').open('rb') as in_:
        # NOTE(review): tesseract_timeout=0.0 presumably cuts OCR short so
        # only the stream plumbing is exercised -- confirm.
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)

    # Rewind before reading: the stream position is wherever the writer left it.
    out.seek(0)
    assert b'%PDF' in out.read(1024)


def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):
    """Pass an in-memory ``BytesIO`` as the ``sidecar`` target of ``ocr``.

    Args:
        resources: Directory fixture containing ``ccitt.pdf``.
        outdir: Output directory fixture (requested but not used directly).
        outpdf: Path fixture for the generated PDF.

    The test passes when the bytes collected in the sidecar buffer contain
    ``b'the'``.
    """
    sidecar_buffer = BytesIO()
    source_pdf = resources / 'ccitt.pdf'
    cache_plugin = 'tests/plugins/tesseract_cache.py'

    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        plugins=[cache_plugin],
        sidecar=sidecar_buffer,
    )

    sidecar_buffer.seek(0)
    sidecar_bytes = sidecar_buffer.getvalue()
    assert b'the' in sidecar_bytes


def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
    """Convert a multi-page PDF to hOCR files, then build a PDF from them.

    Args:
        resources: Directory fixture containing ``multipage.pdf``.
        outdir: Directory that receives the per-page hOCR files.
        outpdf: Path fixture for the final PDF.

    Expects hOCR files for pages 1 and 6 but none for page 4, and expects the
    output PDF to exist after the hOCR-to-PDF step.
    """
    source_pdf = resources / 'multipage.pdf'
    ocrmypdf.api._pdf_to_hocr(
        source_pdf,
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )

    for hocr_name in ('000001_ocr_hocr.hocr', '000006_ocr_hocr.hocr'):
        assert (outdir / hocr_name).exists()

    # NOTE(review): page 4 presumably already has text and is skipped because
    # of skip_text=True -- confirm against the fixture.
    assert not (outdir / '000004_ocr_hocr.hocr').exists()

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)
    assert outpdf.exists()


def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
    """Edit an hOCR file between the two API steps and check the PDF text.

    Args:
        resources: Directory fixture containing ``ccitt.pdf``.
        outdir: Directory that receives the hOCR file.
        outpdf: Path fixture for the final PDF.

    After the hOCR file is generated, every ``'the'`` in it is replaced with
    ``'hocr'``. The text extracted from the resulting PDF must then contain
    ``'hocr'`` and must not contain ``'the'``.
    """
    hocr_path = outdir / '000001_ocr_hocr.hocr'

    ocrmypdf.api._pdf_to_hocr(
        resources / 'ccitt.pdf',
        outdir,
        language='eng',
        skip_text=True,
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    assert hocr_path.exists()

    # Rewrite the hOCR in place so the change can be traced into the PDF.
    original_hocr = hocr_path.read_text(encoding='utf-8')
    hocr_path.write_text(original_hocr.replace('the', 'hocr'), encoding='utf-8')

    ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)

    extracted = extract_text(outpdf)
    assert 'hocr' in extracted
    assert 'the' not in extracted


def test_hocr_result_json():
    """Check the JSON form of ``HOCRResult`` and its round trip.

    ``to_json()`` must produce the exact expected string, and ``from_json()``
    applied to that output must compare equal to the original object.
    """
    HOCRResult = ocrmypdf._pipelines._common.HOCRResult
    expected_json = (
        '{"pageno": 1, "pdf_page_from_image": {"Path": "a"}, "hocr": {"Path": "b"}, '
        '"textpdf": {"Path": "c"}, "orientation_correction": 180}'
    )

    result = HOCRResult(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )

    assert result.to_json() == expected_json

    roundtripped = HOCRResult.from_json(result.to_json())
    assert roundtripped == result


def test_hocr_result_pickle():
    """Check that an ``HOCRResult`` survives a pickle round trip unchanged."""
    HOCRResult = ocrmypdf._pipelines._common.HOCRResult
    result = HOCRResult(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )

    payload = pickle.dumps(result)
    restored = pickle.loads(payload)
    assert result == restored
Change to SPDX license tracking 2022-07-28 01:06:46 -07:00			`# SPDX-FileCopyrightText: 2022 James R. Barlow`
			`# SPDX-License-Identifier: MPL-2.0`
tests: test TqdmConsole 2019-12-30 17:51:09 -08:00
Modernize type annotations 2022-07-23 00:39:24 -07:00			`from __future__ import annotations`

Fix issue with unpickling HOCRResult Fixes [Bug]: HOCRResult.from_json() not unpickling correctly #1427 2024-11-10 02:05:57 -08:00			`import pickle`
Introduce pdf_to_hocr API 2023-10-12 16:30:16 -07:00			`from io import BytesIO`
			`from pathlib import Path`
tests: test TqdmConsole 2019-12-30 17:51:09 -08:00
			`import pytest`
hocr_to_ocr_pdf: handle missing hocr json file 2023-10-15 00:49:58 -07:00			`from pdfminer.high_level import extract_text`
tests: test TqdmConsole 2019-12-30 17:51:09 -08:00
			`import ocrmypdf`
Fix issue with unpickling HOCRResult Fixes [Bug]: HOCRResult.from_json() not unpickling correctly #1427 2024-11-10 02:05:57 -08:00			`import ocrmypdf._pipelines`
Make hocr API experimental for now This commit can be reverted when we are ready to release a new version. 2023-10-30 00:05:34 -07:00			`import ocrmypdf.api`
tests: test TqdmConsole 2019-12-30 17:51:09 -08:00

Fix language argument not working as list Fixes #523 2020-04-14 23:18:52 -07:00			`def test_language_list():`
pytest picky about list vs tuple 2020-04-15 02:26:20 -07:00			`with pytest.raises(`
			`(ocrmypdf.exceptions.InputFileError, ocrmypdf.exceptions.MissingDependencyError)`
			`):`
			`ocrmypdf.ocr('doesnotexist.pdf', '_.pdf', language=['eng', 'deu'])`
Support input/output streams at API level 2020-04-30 03:38:27 -07:00

Introduce pdf_to_hocr API 2023-10-12 16:30:16 -07:00			`def test_stream_api(resources: Path):`
Support input/output streams at API level 2020-04-30 03:38:27 -07:00			`in_ = (resources / 'graph.pdf').open('rb')`
			`out = BytesIO()`

			`ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)`
			`out.seek(0)`
			`assert b'%PDF' in out.read(1024)`
Introduce pdf_to_hocr API 2023-10-12 16:30:16 -07:00

Add support for sidecar output to io.BytesIO Closes #1252 2024-04-07 01:38:55 -07:00			`def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):`
			`s = BytesIO()`
			`ocrmypdf.ocr(`
			`resources / 'ccitt.pdf',`
			`outpdf,`
			`plugins=['tests/plugins/tesseract_cache.py'],`
Fix issue with unpickling HOCRResult Fixes [Bug]: HOCRResult.from_json() not unpickling correctly #1427 2024-11-10 02:05:57 -08:00			`sidecar=s,`
Add support for sidecar output to io.BytesIO Closes #1252 2024-04-07 01:38:55 -07:00			`)`
			`s.seek(0)`
			`assert b'the' in s.getvalue()`


hocr_to_ocr_pdf: handle missing hocr json file 2023-10-15 00:49:58 -07:00			`def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):`
Make hocr API experimental for now This commit can be reverted when we are ready to release a new version. 2023-10-30 00:05:34 -07:00			`ocrmypdf.api._pdf_to_hocr(`
pdf_to_hocr: improve plugin handling 2023-10-12 16:30:16 -07:00			`resources / 'multipage.pdf',`
			`outdir,`
			`language='eng',`
			`skip_text=True,`
			`plugins=['tests/plugins/tesseract_cache.py'],`
Introduce pdf_to_hocr API 2023-10-12 16:30:16 -07:00			`)`
			`assert (outdir / '000001_ocr_hocr.hocr').exists()`
			`assert (outdir / '000006_ocr_hocr.hocr').exists()`
			`assert not (outdir / '000004_ocr_hocr.hocr').exists()`
Working HOCR folder to PDF converter 2023-10-13 03:25:12 -07:00
Make hocr API experimental for now This commit can be reverted when we are ready to release a new version. 2023-10-30 00:05:34 -07:00			`ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)`
hocr_to_ocr_pdf: handle missing hocr json file 2023-10-15 00:49:58 -07:00			`assert outpdf.exists()`

Working HOCR folder to PDF converter 2023-10-13 03:25:12 -07:00
			`def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):`
Make hocr API experimental for now This commit can be reverted when we are ready to release a new version. 2023-10-30 00:05:34 -07:00			`ocrmypdf.api._pdf_to_hocr(`
Working HOCR folder to PDF converter 2023-10-13 03:25:12 -07:00			`resources / 'ccitt.pdf',`
			`outdir,`
			`language='eng',`
			`skip_text=True,`
			`plugins=['tests/plugins/tesseract_cache.py'],`
			`)`
			`assert (outdir / '000001_ocr_hocr.hocr').exists()`
			`hocr = (outdir / '000001_ocr_hocr.hocr').read_text(encoding='utf-8')`
			`mangled = hocr.replace('the', 'hocr')`
			`(outdir / '000001_ocr_hocr.hocr').write_text(mangled, encoding='utf-8')`

Make hocr API experimental for now This commit can be reverted when we are ready to release a new version. 2023-10-30 00:05:34 -07:00			`ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)`
hocr_to_ocr_pdf: handle missing hocr json file 2023-10-15 00:49:58 -07:00
			`text = extract_text(outpdf)`
			`assert 'hocr' in text and 'the' not in text`
Add support for sidecar output to io.BytesIO Closes #1252 2024-04-07 01:38:55 -07:00
Fix issue with unpickling HOCRResult Fixes [Bug]: HOCRResult.from_json() not unpickling correctly #1427 2024-11-10 02:05:57 -08:00
			`def test_hocr_result_json():`
			`result = ocrmypdf._pipelines._common.HOCRResult(`
			`pageno=1,`
			`pdf_page_from_image=Path('a'),`
			`hocr=Path('b'),`
			`textpdf=Path('c'),`
			`orientation_correction=180,`
			`)`
			`assert (`
			`result.to_json()`
			`== '{"pageno": 1, "pdf_page_from_image": {"Path": "a"}, "hocr": {"Path": "b"}, '`
			`'"textpdf": {"Path": "c"}, "orientation_correction": 180}'`
			`)`
			`assert ocrmypdf._pipelines._common.HOCRResult.from_json(result.to_json()) == result`


			`def test_hocr_result_pickle():`
			`result = ocrmypdf._pipelines._common.HOCRResult(`
			`pageno=1,`
			`pdf_page_from_image=Path('a'),`
			`hocr=Path('b'),`
			`textpdf=Path('c'),`
			`orientation_correction=180,`
			`)`
			`assert result == pickle.loads(pickle.dumps(result))`