OCRmyPDF/tests/test_metadata.py

# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.


import pytest

from datetime import timezone
from pathlib import Path
from shutil import copyfile
from unittest.mock import patch
import datetime

import pikepdf

from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import fspath
from ocrmypdf.pdfa import (
    file_claims_pdfa, encode_pdf_date, decode_pdf_date, generate_pdfa_ps,
    SRGB_ICC_PROFILE
)
from ocrmypdf.exec import ghostscript

# fitz is an optional dependency: when it cannot be imported the name is
# bound to None, which test_bookmarks_preserved uses as its skip condition.
try:
    import fitz
except ImportError:
    fitz = None

# pytest.helpers is dynamic
# pylint: disable=no-member
# pylint: disable=w0612

# Module-level aliases for the helper callables exposed on pytest.helpers,
# so the tests below can call them without the long attribute path.
check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof


@pytest.mark.parametrize('output_type', ['pdfa', 'pdf'])
def test_preserve_metadata(spoof_tesseract_noop, output_type,
                           resources, outpdf):
    """Title and Author of graph.pdf are unchanged after running ocrmypdf.

    Runs once per output type and also checks that file_claims_pdfa reports
    the requested output type for the produced file.
    """
    source = resources / 'graph.pdf'
    original = pikepdf.open(source)

    result = check_ocrmypdf(
        source, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)
    processed = pikepdf.open(result)

    # Each listed field must be present in both files and compare equal
    for field in ('/Title', '/Author'):
        assert original.metadata[field] == processed.metadata[field]

    claims = file_claims_pdfa(str(result))
    assert claims['output'] == output_type


def _version_tuple(version):
    """Parse a dotted version string into a tuple of ints.

    Only the leading digits of each dot-separated component are used, and
    parsing stops at the first component that has none, so '9.24' becomes
    (9, 24) and '10.01.2' becomes (10, 1, 2).  Tuples compare numerically,
    unlike the raw strings, where '10.0' sorts before '9.24'.
    """
    numbers = []
    for piece in str(version).split('.'):
        digits = ''
        for char in piece:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    """--title/--author given on the command line end up in the output.

    Also checks that no /Keywords value appears, that the creation date of
    the input is kept, and that file_claims_pdfa reports the requested
    output type.  The metadata assertions are xfailed on Ghostscript 9.24
    and newer.
    """
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    # Compare versions numerically: as plain strings '10.0' < '9.24', which
    # would silently disable this guard on Ghostscript 10 and later.
    if _version_tuple(ghostscript.version()) >= (9, 24):
        pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')

    assert after.metadata.Title == german, after.metadata
    assert after.metadata.Author == chinese, after.metadata
    assert after.metadata.get('/Keywords', '') == ''

    before_date = decode_pdf_date(str(before.metadata.CreationDate))
    after_date = decode_pdf_date(str(after.metadata.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type


def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
    """A --subject containing a high Unicode character is rejected.

    Ghostscript doesn't support high Unicode, so neither do we, to be safe;
    the run is expected to exit with ExitCode.bad_args.
    """
    subject = 'U+1030C is: 𐌌'
    source = resources / 'c02-22.pdf'

    proc, _stdout, stderr = run_ocrmypdf(
        source, no_outpdf,
        '--subject', subject,
        '--output-type', 'pdfa',
        env=spoof_tesseract_noop)

    assert proc.returncode == ExitCode.bad_args, stderr


@pytest.mark.skipif(not fitz, reason="test uses fitz")
@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """The table of contents read by fitz is identical before and after."""

    def read_toc(pdf_path):
        # fitz is given a plain string path
        return fitz.Document(str(pdf_path)).getToC()

    source = resources / 'toc.pdf'
    toc_before = read_toc(source)

    check_ocrmypdf(
        source, outpdf,
        ocr_option,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    toc_after = read_toc(outpdf)

    # Printed so both structures show up in the captured output on failure
    print(toc_before)
    print(toc_after)
    assert toc_before == toc_after


def seconds_between_dates(date1, date2):
    """Return the signed number of seconds from date1 to date2.

    The result is negative when date2 is earlier than date1.
    """
    elapsed = date2 - date1
    return elapsed.total_seconds()


@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
                                 infile, outpdf):
    """/CreationDate is carried over from the input and /ModDate is recent.

    When the input has no /Info dictionary the output must not carry a
    /CreationDate either; otherwise the input and output creation dates
    must be within 1000 seconds of each other.  In both cases the output
    /ModDate must be less than 1000 seconds before the current UTC time.
    """
    input_file = resources / infile

    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type,
        env=spoof_tesseract_noop)

    pdf_before = pikepdf.open(input_file)
    pdf_after = pikepdf.open(outpdf)

    before = pdf_before.trailer.get('/Info', {})
    after = pdf_after.trailer.get('/Info', {})

    if not before:
        # If there was no input creation date, none should be output
        # because of Ghostscript quirks we set it to null
        # This test would be better if we had a test file with /DocumentInfo but
        # no /CreationDate, which we don't
        assert after.get('/CreationDate', '') == ''
    else:
        # We expect that the creation date stayed the same.  Compare the
        # absolute difference: the signed value alone would also accept an
        # output date that is arbitrarily earlier than the input date.
        date_before = decode_pdf_date(str(before['/CreationDate']))
        date_after = decode_pdf_date(str(after['/CreationDate']))
        assert abs(seconds_between_dates(date_before, date_after)) < 1000

    # We expect that the modified date is quite recent
    date_after = decode_pdf_date(str(after['/ModDate']))
    assert seconds_between_dates(
        date_after, datetime.datetime.now(timezone.utc)) < 1000


@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
def test_xml_metadata_preserved(spoof_tesseract_noop, output_type,
                                resources, outpdf):
    """XMP properties of graph.pdf that must not change are kept.

    The XMP packets of the input and output are read with libxmp and
    flattened to plain key/value dicts.  Every property listed in
    equal_properties that is present in the input must be present with the
    same value in the output.  The test is skipped when libxmp cannot be
    imported.
    """
    input_file = resources / 'graph.pdf'

    # All three imports are attempted so that any failure to load libxmp
    # results in a skip rather than an error.
    try:
        import libxmp
        from libxmp.utils import file_to_dict
        from libxmp import consts
    except Exception:
        pytest.skip("libxmp not available or libexempi3 not installed")

    before = file_to_dict(str(input_file))

    check_ocrmypdf(
        input_file, outpdf,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    after = file_to_dict(str(outpdf))

    equal_properties = [
        'dc:contributor',
        'dc:coverage',
        'dc:creator',
        'dc:description',
        'dc:format',
        'dc:identifier',
        'dc:language',
        'dc:publisher',
        'dc:relation',
        'dc:rights',
        'dc:source',
        'dc:subject',
        'dc:title',
        'dc:type',
        'pdf:keywords',
    ]
    # Not asserted on below; kept as a record of the properties that are
    # allowed to differ between input and output.
    might_change_properties = [
        'dc:date',
        'pdf:pdfversion',
        'pdf:Producer',
        'xmp:CreateDate',
        'xmp:ModifyDate',
        'xmp:MetadataDate',
        'xmp:CreatorTool',
        'xmpMM:DocumentID',
        'xmpMM:InstanceID'
    ]

    # Cleanup messy data structure
    # Top level is key-value mapping of namespaces to keys under namespace,
    # so we put everything in the same namespace
    def unify_namespaces(xmpdict):
        for entries in xmpdict.values():
            yield from entries

    # Now we have a list of (key, value, {infodict}). We don't care about
    # infodict. Just flatten to keys and values
    def keyval_from_tuple(list_of_tuples):
        for k, v, *_ in list_of_tuples:
            yield k, v

    before = dict(keyval_from_tuple(unify_namespaces(before)))
    after = dict(keyval_from_tuple(unify_namespaces(after)))

    for prop in equal_properties:
        if prop in before:
            assert prop in after, '{} dropped from xmp'.format(prop)
            assert before[prop] == after[prop]

        # Certain entries like title appear as dc:title[1], with the possibility
        # of several
        propidx = '{}[1]'.format(prop)
        if propidx in before:
            assert after.get(propidx) == before[propidx] \
                    or after.get(prop) == before[propidx]


def test_srgb_in_unicode_path(tmpdir):
    """Test that we can produce pdfmark when install path is not ASCII"""

    # Directory whose name is a single non-ASCII character
    unicode_dir = Path(fspath(tmpdir)).joinpath(
        b'\xe4\x80\x80'.decode('utf-8'))
    unicode_dir.mkdir()

    # Place a copy of the ICC profile inside that directory
    icc_copy = unicode_dir / 'sRGB.icc'
    copyfile(SRGB_ICC_PROFILE, fspath(icc_copy))

    ps_target = unicode_dir / 'out.ps'
    with patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(icc_copy)):
        generate_pdfa_ps(ps_target, {})


def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
    """If the output of kcs.pdf has an /Outlines /First, it is a dictionary."""
    check_ocrmypdf(
        resources / 'kcs.pdf', outpdf,
        '--output-type', 'pdf',
        env=spoof_tesseract_noop)

    pdf = pikepdf.open(outpdf)
    outlines = pdf.root.Outlines

    # Nothing to verify when the outline has no /First entry
    if pikepdf.Name.First not in outlines:
        return
    assert isinstance(outlines.First, pikepdf.Dictionary)
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00			`# © 2018 James R. Barlow: github.com/jbarlow83`
			`#`
			`# This file is part of OCRmyPDF.`
			`#`
			`# OCRmyPDF is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# OCRmyPDF is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.`


			`import pytest`
Fix issue #275: doesn't work when installed in non-Unicode path Closes #275 2018-07-07 01:35:05 -07:00
Fix regression: time stamp test suite failures 2018-04-17 16:59:21 -07:00			`from datetime import timezone`
Fix issue #275: doesn't work when installed in non-Unicode path Closes #275 2018-07-07 01:35:05 -07:00			`from pathlib import Path`
			`from shutil import copyfile`
			`from unittest.mock import patch`
			`import datetime`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`import pikepdf`

Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00			`from ocrmypdf.exceptions import ExitCode`
Fix issue #275: doesn't work when installed in non-Unicode path Closes #275 2018-07-07 01:35:05 -07:00			`from ocrmypdf.helpers import fspath`
			`from ocrmypdf.pdfa import (`
			`file_claims_pdfa, encode_pdf_date, decode_pdf_date, generate_pdfa_ps,`
			`SRGB_ICC_PROFILE`
			`)`
Work around loss of Unicode DOCINFO in Ghostscript 9.24+ Ghostscript no longer supports UTF-16-BE-hex strings as a way of supplying Unicode data in pdfmark so we have lost this functionality too: http://git.ghostscript.com/?p=ghostpdl.git;a=commit;h=e997c6836d243ab37fe3a5f0d57974af95eb5eac For users this means setting --title, --author, etc. will not work if gs 9.24 is installed, but if the file has existing metadata it might work. For now we enforce police-state-strict ASCII, until there's time to implement proper metadata editing. Relevant tests set to xfail. 2018-09-13 21:33:39 -07:00			`from ocrmypdf.exec import ghostscript`
Remove other references to PyMuPDF 2018-06-13 01:02:53 -07:00
			`try:`
			`import fitz`
			`except ImportError:`
			`fitz = None`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
			`# pytest.helpers is dynamic`
			`# pylint: disable=no-member`
			`# pylint: disable=w0612`

			`check_ocrmypdf = pytest.helpers.check_ocrmypdf`
			`run_ocrmypdf = pytest.helpers.run_ocrmypdf`
			`spoof = pytest.helpers.spoof`


			`@pytest.mark.parametrize("output_type", [`
			`'pdfa', 'pdf'`
			`])`
			`def test_preserve_metadata(spoof_tesseract_noop, output_type,`
			`resources, outpdf):`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`pdf_before = pikepdf.open(resources / 'graph.pdf')`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
			`output = check_ocrmypdf(`
			`resources / 'graph.pdf', outpdf,`
			`'--output-type', output_type,`
			`env=spoof_tesseract_noop)`

tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`pdf_after = pikepdf.open(output)`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
			`for key in ('/Title', '/Author'):`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`assert pdf_before.metadata[key] == pdf_after.metadata[key]`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
			`pdfa_info = file_claims_pdfa(str(output))`
			`assert pdfa_info['output'] == output_type`


			`@pytest.mark.parametrize("output_type", [`
			`'pdfa', 'pdf'`
			`])`
			`def test_override_metadata(spoof_tesseract_noop, output_type, resources,`
			`outpdf):`
			`input_file = resources / 'c02-22.pdf'`
			`german = 'Du siehst den Wald vor lauter Bäumen nicht.'`
			`chinese = '孔子'`

			`p, out, err = run_ocrmypdf(`
			`input_file, outpdf,`
			`'--title', german,`
			`'--author', chinese,`
			`'--output-type', output_type,`
			`env=spoof_tesseract_noop)`

			`assert p.returncode == ExitCode.ok, err`

tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`before = pikepdf.open(input_file)`
			`after = pikepdf.open(outpdf)`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
Work around loss of Unicode DOCINFO in Ghostscript 9.24+ Ghostscript no longer supports UTF-16-BE-hex strings as a way of supplying Unicode data in pdfmark so we have lost this functionality too: http://git.ghostscript.com/?p=ghostpdl.git;a=commit;h=e997c6836d243ab37fe3a5f0d57974af95eb5eac For users this means setting --title, --author, etc. will not work if gs 9.24 is installed, but if the file has existing metadata it might work. For now we enforce police-state-strict ASCII, until there's time to implement proper metadata editing. Relevant tests set to xfail. 2018-09-13 21:33:39 -07:00			`if ghostscript.version() >= '9.24':`
			`pytest.xfail('Ghostscript 9.24+ does not support Unicode DOCINFO')`

			`assert after.metadata.Title == german, after.metadata`
			`assert after.metadata.Author == chinese, after.metadata`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`assert after.metadata.get('/Keywords', '') == ''`
Fix XMP validation issue with /CreationDate Related to previous validation issue. If the /CreationDate had no timezone, Ghostscript also creates invalid metadata. Work around this. Also fix up PDF date decoding, and transcode dates to standardize them. 2018-05-03 16:30:20 -07:00
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`before_date = decode_pdf_date(str(before.metadata.CreationDate))`
			`after_date = decode_pdf_date(str(after.metadata.CreationDate))`
Fix XMP validation issue with /CreationDate Related to previous validation issue. If the /CreationDate had no timezone, Ghostscript also creates invalid metadata. Work around this. Also fix up PDF date decoding, and transcode dates to standardize them. 2018-05-03 16:30:20 -07:00			`assert before_date == after_date`
Move metadata tests to new test_metadata 2018-03-26 01:49:25 -07:00
			`pdfa_info = file_claims_pdfa(outpdf)`
			`assert pdfa_info['output'] == output_type`


			`def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):`

			`# Ghostscript doesn't support high Unicode, so neither do we, to be`
			`# safe`
			`input_file = resources / 'c02-22.pdf'`
			`high_unicode = 'U+1030C is: 𐌌'`

			`p, out, err = run_ocrmypdf(`
			`input_file, no_outpdf,`
			`'--subject', high_unicode,`
			`'--output-type', 'pdfa',`
			`env=spoof_tesseract_noop)`

Fix table of contents not preserved in PDF/A 2018-03-26 02:23:19 -07:00			`assert p.returncode == ExitCode.bad_args, err`


test_metadata: change from xfail to skipif without fitz 2018-05-17 00:14:57 -07:00			`@pytest.mark.skipif(not fitz, reason="test uses fitz")`
Fix table of contents not preserved in PDF/A 2018-03-26 02:23:19 -07:00			`@pytest.mark.parametrize('ocr_option', ['--skip-text', '--force-ocr'])`
			`@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])`
			`def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,`
			`resources, outpdf):`
			`input_file = resources / 'toc.pdf'`
			`before_toc = fitz.Document(str(input_file)).getToC()`

			`check_ocrmypdf(`
			`input_file, outpdf,`
			`ocr_option,`
			`'--output-type', output_type,`
			`env=spoof_tesseract_noop)`

			`after_toc = fitz.Document(str(outpdf)).getToC()`
			`print(before_toc)`
			`print(after_toc)`
			`assert before_toc == after_toc`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00

			`def seconds_between_dates(date1, date2):`
			`return (date2 - date1).total_seconds()`


			`@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])`
			`@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])`
			`def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,`
			`infile, outpdf):`
			`input_file = resources / infile`

			`check_ocrmypdf(`
test_metadata: change from xfail to skipif without fitz 2018-05-17 00:14:57 -07:00			`input_file, outpdf, '--output-type', output_type,`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00			`env=spoof_tesseract_noop)`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00
			`pdf_before = pikepdf.open(input_file)`
			`pdf_after = pikepdf.open(outpdf)`

			`before = pdf_before.trailer.get('/Info', {})`
			`after = pdf_after.trailer.get('/Info', {})`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00
			`if not before:`
			`# If there was input creation date, none should be output`
			`# because of Ghostscript quirks we set it to null`
			`# This test would be better if we had a test file with /DocumentInfo but`
			`# no /CreationDate, which we don't`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`assert after.get('/CreationDate', '') == ''`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00			`else:`
			`# We expect that the creation date stayed the same`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`date_before = decode_pdf_date(str(before['/CreationDate']))`
			`date_after = decode_pdf_date(str(after['/CreationDate']))`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00			`assert seconds_between_dates(date_before, date_after) < 1000`

			`# We expect that the modified date is quite recent`
tests: Migrate metadata tests to pikepdf For some reason PyPDF2 has begun to trigger internal errors in pytest on macOS alone. Not sure why, but nothing is wrong that I can see. Seemed like an opportune time to switch to pikepdf; found some new issues in the process anyway. 2018-09-10 16:06:01 -07:00			`date_after = decode_pdf_date(str(after['/ModDate']))`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00			`assert seconds_between_dates(`
Fix regression: time stamp test suite failures 2018-04-17 16:59:21 -07:00			`date_after, datetime.datetime.now(timezone.utc)) < 1000`
Fix creation date metadata lost from input Closes #247 2018-04-02 17:53:39 -07:00
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00			`@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])`
			`def test_xml_metadata_preserved(spoof_tesseract_noop, output_type,`
			`resources, outpdf):`
			`input_file = resources / 'graph.pdf'`
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00
			`try:`
			`import libxmp`
			`from libxmp.utils import file_to_dict`
			`from libxmp import consts`
			`except Exception:`
			`pytest.skip("libxmp not available or libexempi3 not installed")`

			`before = file_to_dict(str(input_file))`
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00
			`check_ocrmypdf(`
			`input_file, outpdf,`
			`'--output-type', output_type,`
			`env=spoof_tesseract_noop)`

Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`after = file_to_dict(str(outpdf))`

Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00			`equal_properties = [`
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`'dc:contributor',`
			`'dc:coverage',`
			`'dc:creator',`
			`'dc:description',`
			`'dc:format',`
			`'dc:identifier',`
			`'dc:language',`
			`'dc:publisher',`
			`'dc:relation',`
			`'dc:rights',`
			`'dc:source',`
			`'dc:subject',`
			`'dc:title',`
			`'dc:type',`
			`'pdf:keywords',`
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00			`]`
			`might_change_properties = [`
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`'dc:date',`
			`'pdf:pdfversion',`
			`'pdf:Producer',`
			`'xmp:CreateDate',`
			`'xmp:ModifyDate',`
			`'xmp:MetadataDate',`
			`'xmp:CreatorTool',`
			`'xmpMM:DocumentId',`
			`'xmpMM:DnstanceId'`
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00			`]`

Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`# Cleanup messy data structure`
			`# Top level is key-value mapping of namespaces to keys under namespace,`
			`# so we put everything in the same namespace`
			`def unify_namespaces(xmpdict):`
			`for entries in xmpdict.values():`
			`yield from entries`
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`# Now we have a list of (key, value, {infodict}). We don't care about`
			`# infodict. Just flatten to keys and values`
			`def keyval_from_tuple(list_of_tuples):`
			`for k, v, *_ in list_of_tuples:`
			`yield k, v`

			`before = dict(keyval_from_tuple(unify_namespaces(before)))`
			`after = dict(keyval_from_tuple(unify_namespaces(after)))`
Add metadata preservation test from stash 2018-05-10 16:43:28 -07:00
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`for prop in equal_properties:`
			`if prop in before:`
			`assert prop in after, '{} dropped from xmp'.format(prop)`
			`assert before[prop] == after[prop]`
test_metadata: change from xfail to skipif without fitz 2018-05-17 00:14:57 -07:00
Make XML metadata test actually work 2018-05-10 20:37:10 -07:00			`# Certain entries like title appear as dc:title[1], with the possibility`
			`# of several`
			`propidx = '{}[1]'.format(prop)`
			`if propidx in before:`
			`assert after.get(propidx) == before[propidx] \`
			`or after.get(prop) == before[propidx]`
Fix issue #275: doesn't work when installed in non-Unicode path Closes #275 2018-07-07 01:35:05 -07:00

			`def test_srgb_in_unicode_path(tmpdir):`
			`"""Test that we can produce pdfmark when install path is not ASCII"""`

Fix path error on Py3.5 2018-07-08 01:01:06 -07:00			`dstdir = Path(fspath(tmpdir)) / b'\xe4\x80\x80'.decode('utf-8')`
Fix issue #275: doesn't work when installed in non-Unicode path Closes #275 2018-07-07 01:35:05 -07:00			`dstdir.mkdir()`
			`dst = dstdir / 'sRGB.icc'`

			`copyfile(SRGB_ICC_PROFILE, fspath(dst))`

			`with patch('ocrmypdf.pdfa.SRGB_ICC_PROFILE', new=str(dst)):`
			`generate_pdfa_ps(dstdir / 'out.ps', {})`
Work around invalid TOC entries Kodak Capture Desktop and probably other software creates a /Outlines entry with /First being set to an invalid indirect reference to an object that hasn't been created. This is legal in the PDF spec but problematic for qpdf. The objgen will be (max valid object ID + 1, 0). Because we create new objects in _weave, some TOC entries will end up assigned to new objects we create. Typically /ProcSet. We solve the issue by refactoring page traversal and then doing it twice, once to resolve all references (eliminating the null reference problem) and a second pass to make our changes. 2018-09-11 14:44:16 -07:00

			`def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):`
			`output = check_ocrmypdf(`
			`resources / 'kcs.pdf', outpdf,`
			`'--output-type', 'pdf',`
			`env=spoof_tesseract_noop)`

			`p = pikepdf.open(outpdf)`

			`if pikepdf.Name.First in p.root.Outlines:`
			`assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)`