# © 2018 James R. Barlow: github.com/jbarlow83 # # This file is part of OCRmyPDF. # # OCRmyPDF is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # OCRmyPDF is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with OCRmyPDF. If not, see . import datetime import logging import mmap import os from datetime import timezone from os import fspath from pathlib import Path from shutil import copyfile, move from unittest.mock import MagicMock, patch import pikepdf import pytest from pikepdf.models.metadata import decode_pdf_date from ocrmypdf._jobcontext import PDFContext from ocrmypdf._pipeline import convert_to_pdfa from ocrmypdf.cli import parser from ocrmypdf.exceptions import ExitCode from ocrmypdf.pdfa import SRGB_ICC_PROFILE, file_claims_pdfa, generate_pdfa_ps from ocrmypdf.pdfinfo import PdfInfo try: import fitz except ImportError: fitz = None # pytest.helpers is dynamic # pylint: disable=no-member # pylint: disable=w0612 pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning') check_ocrmypdf = pytest.helpers.check_ocrmypdf run_ocrmypdf = pytest.helpers.run_ocrmypdf spoof = pytest.helpers.spoof @pytest.mark.parametrize("output_type", ['pdfa', 'pdf']) def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf): pdf_before = pikepdf.open(resources / 'graph.pdf') output = check_ocrmypdf( resources / 'graph.pdf', outpdf, '--output-type', output_type, env=spoof_tesseract_noop, ) pdf_after = pikepdf.open(output) for key in ('/Title', '/Author'): assert pdf_before.docinfo[key] == pdf_after.docinfo[key] pdfa_info = file_claims_pdfa(str(output)) assert pdfa_info['output'] == output_type @pytest.mark.parametrize("output_type", ['pdfa', 'pdf']) def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf): input_file = resources / 'c02-22.pdf' german = 'Du siehst den Wald vor lauter Bäumen nicht.' chinese = '孔子' p, out, err = run_ocrmypdf( input_file, outpdf, '--title', german, '--author', chinese, '--output-type', output_type, env=spoof_tesseract_noop, ) assert p.returncode == ExitCode.ok, err before = pikepdf.open(input_file) after = pikepdf.open(outpdf) assert after.docinfo.Title == german, after.docinfo assert after.docinfo.Author == chinese, after.docinfo assert after.docinfo.get('/Keywords', '') == '' before_date = decode_pdf_date(str(before.docinfo.CreationDate)) after_date = decode_pdf_date(str(after.docinfo.CreationDate)) assert before_date == after_date pdfa_info = file_claims_pdfa(outpdf) assert pdfa_info['output'] == output_type def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf): # Ghostscript doesn't support high Unicode, so neither do we, to be # safe input_file = resources / 'c02-22.pdf' high_unicode = 'U+1030C is: 𐌌' p, out, err = run_ocrmypdf( input_file, no_outpdf, '--subject', high_unicode, '--output-type', 'pdfa', env=spoof_tesseract_noop, ) assert p.returncode == ExitCode.bad_args, err @pytest.mark.skipif(not fitz, reason="test uses fitz") @pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr']) @pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) def test_bookmarks_preserved( spoof_tesseract_noop, output_type, ocr_option, resources, outpdf ): input_file = resources / 'toc.pdf' before_toc = fitz.Document(str(input_file)).getToC() check_ocrmypdf( input_file, outpdf, ocr_option, '--output-type', output_type, env=spoof_tesseract_noop, ) after_toc = fitz.Document(str(outpdf)).getToC() print(before_toc) print(after_toc) assert before_toc == after_toc def seconds_between_dates(date1, date2): return (date2 - date1).total_seconds() @pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf']) @pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) def test_creation_date_preserved( spoof_tesseract_noop, output_type, resources, infile, outpdf ): input_file = resources / infile check_ocrmypdf( input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop ) pdf_before = pikepdf.open(input_file) pdf_after = pikepdf.open(outpdf) before = pdf_before.trailer.get('/Info', {}) after = pdf_after.trailer.get('/Info', {}) if not before: assert after.get('/CreationDate', '') != '' else: # We expect that the creation date stayed the same date_before = decode_pdf_date(str(before['/CreationDate'])) date_after = decode_pdf_date(str(after['/CreationDate'])) assert seconds_between_dates(date_before, date_after) < 1000 # We expect that the modified date is quite recent date_after = decode_pdf_date(str(after['/ModDate'])) assert seconds_between_dates(date_after, datetime.datetime.now(timezone.utc)) < 1000 @pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) def test_xml_metadata_preserved(spoof_tesseract_noop, output_type, resources, outpdf): input_file = resources / 'graph.pdf' try: from libxmp import consts from libxmp.utils import file_to_dict except Exception: pytest.skip("libxmp not available or libexempi3 not installed") before = file_to_dict(str(input_file)) check_ocrmypdf( input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop ) after = file_to_dict(str(outpdf)) equal_properties = [ 'dc:contributor', 'dc:coverage', 'dc:creator', 'dc:description', 'dc:format', 'dc:identifier', 'dc:language', 'dc:publisher', 'dc:relation', 'dc:rights', 'dc:source', 'dc:subject', 'dc:title', 'dc:type', 'pdf:keywords', ] might_change_properties = [ 'dc:date', 'pdf:pdfversion', 'pdf:Producer', 'xmp:CreateDate', 'xmp:ModifyDate', 'xmp:MetadataDate', 'xmp:CreatorTool', 'xmpMM:DocumentId', 'xmpMM:DnstanceId', ] # Cleanup messy data structure # Top level is key-value mapping of namespaces to keys under namespace, # so we put everything in the same namespace def unify_namespaces(xmpdict): for entries in xmpdict.values(): yield from entries # Now we have a list of (key, value, {infodict}). We don't care about # infodict. Just flatten to keys and values def keyval_from_tuple(list_of_tuples): for k, v, *_ in list_of_tuples: yield k, v before = dict(keyval_from_tuple(unify_namespaces(before))) after = dict(keyval_from_tuple(unify_namespaces(after))) for prop in equal_properties: if prop in before: assert prop in after, f'{prop} dropped from xmp' assert before[prop] == after[prop] # Certain entries like title appear as dc:title[1], with the possibility # of several propidx = f'{prop}[1]' if propidx in before: assert ( after.get(propidx) == before[propidx] or after.get(prop) == before[propidx] ) def test_srgb_in_unicode_path(tmp_path): """Test that we can produce pdfmark when install path is not ASCII""" dstdir = tmp_path / b'\xe4\x80\x80'.decode('utf-8') dstdir.mkdir() dst = dstdir / 'sRGB.icc' copyfile(SRGB_ICC_PROFILE, fspath(dst)) with patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(dst)): generate_pdfa_ps(dstdir / 'out.ps') def test_kodak_toc(resources, outpdf, spoof_tesseract_noop): output = check_ocrmypdf( resources / 'kcs.pdf', outpdf, '--output-type', 'pdf', env=spoof_tesseract_noop ) p = pikepdf.open(outpdf) if pikepdf.Name.First in p.root.Outlines: assert isinstance(p.root.Outlines.First, pikepdf.Dictionary) def test_metadata_fixup_warning(resources, outdir, caplog): from ocrmypdf.__main__ import parser from ocrmypdf._pipeline import metadata_fixup options = parser.parse_args( args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf'] ) copyfile(resources / 'graph.pdf', outdir / 'graph.pdf') context = PDFContext(options, outdir, outdir / 'graph.pdf', None) metadata_fixup(working_file=outdir / 'graph.pdf', context=context) for record in caplog.records: assert record.levelname != 'WARNING' # Now add some metadata that will not be copyable graph = pikepdf.open(outdir / 'graph.pdf') with graph.open_metadata() as meta: meta['prism2:publicationName'] = 'OCRmyPDF Test' graph.save(outdir / 'graph_mod.pdf') context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None) metadata_fixup(working_file=outdir / 'graph.pdf', context=context) assert any(record.levelname == 'WARNING' for record in caplog.records) def test_prevent_gs_invalid_xml(resources, outdir): generate_pdfa_ps(outdir / 'pdfa.ps') copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf') # Inject a string with a trailing nul character into the DocumentInfo # dictionary of this PDF, as often occurs in practice. with pikepdf.open(outdir / 'layers.rendered.pdf') as pike: pike.Root.DocumentInfo = pikepdf.Dictionary( Title=b'String with trailing nul\x00' ) options = parser.parse_args( args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'] ) pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf') context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo) convert_to_pdfa( str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context ) with open(outdir / 'pdfa.pdf', 'r+b') as f: with mmap.mmap(f.fileno(), 0) as mm: # Since the XML may be invalid, we scan instead of actually feeding it # to a parser. XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d' xmp_start = mm.find(XMP_MAGIC) xmp_end = mm.rfind(b'") pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False) options = parser.parse_args( args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'] ) pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf') context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo) convert_to_pdfa( str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context ) print(caplog.records) assert any( 'malformed DocumentInfo block' in record.message for record in caplog.records )