diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index be06ada2..dc4d084e 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -12,11 +12,12 @@ New features ------------ - Easier installation with Python's package manager +- Eliminated many external dependencies, so it's easier to set up - Now installs ``ocrmypdf`` to ``/usr/local/bin`` or equivalent for system-wide - access -- Tesseract 3.03 PDF page can be used instead for better positioning - of recognized text (``--pdf-renderer tesseract``) + access and easier typing - Improved command line syntax and usage help (``--help``) +- Tesseract 3.03 PDF page rendering can be used instead for better positioning + of recognized text (``--pdf-renderer tesseract``) - PDF metadata (title, author, keywords) are now transferred to the output PDF - PDF metadata can also be set from the command line (``--title``, etc.) @@ -31,7 +32,7 @@ Changes ------- - New, robust rewrite in Python 3.4+ with ruffus_ pipelines -- Now uses Ghostscript 9.14's improved color conversion model +- Now uses Ghostscript 9.14's improved color conversion model to preserve PDF colors - All "tasks" in the pipeline can be executed in parallel on any available CPUs, increasing performance - The ``-o DPI`` argument has been phased out, in favor of ``--oversample DPI``, in @@ -45,6 +46,7 @@ Changes - Poppler - MuPDF_ tools - shell scripts + - Java and JHOVE_ - Some new external dependencies are required or optional, compared to v2.x: @@ -59,11 +61,17 @@ Changes .. _MuPDF: http://mupdf.com/docs/ .. _qpdf: http://qpdf.sourceforge.net/ .. _Unpaper: https://github.com/Flameeyes/unpaper - +.. 
_JHOVE: http://jhove.sourceforge.net/ Release candidates ------------------ +- rc5: + + - dropped Java and JHOVE in favour of qpdf + - improved command line error output + - additional tests and bug fixes + - rc4: - dropped MuPDF in favour of qpdf diff --git a/ocrmypdf/jhove/COPYING b/ocrmypdf/jhove/COPYING deleted file mode 100644 index fa79db0a..00000000 --- a/ocrmypdf/jhove/COPYING +++ /dev/null @@ -1,502 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. 
Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. 
Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. 
- - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. 
(Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. 
- - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. 
- -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". 
Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. 
- - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. 
- - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. 
You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. 
For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. 
-Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! diff --git a/ocrmypdf/jhove/LICENSE b/ocrmypdf/jhove/LICENSE deleted file mode 100644 index 89b8f920..00000000 --- a/ocrmypdf/jhove/LICENSE +++ /dev/null @@ -1,17 +0,0 @@ -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003-2008 by JSTOR and the President and Fellows of Harvard College - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU Lessor General Public License as -published by the Free Software Foundation; either version 2.1 of the -License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Lesser General Public License for more details. 
- -You should have received a copy of the GNU Lesser General Public -License along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -USA diff --git a/ocrmypdf/jhove/README b/ocrmypdf/jhove/README deleted file mode 100644 index 82b08abf..00000000 --- a/ocrmypdf/jhove/README +++ /dev/null @@ -1,227 +0,0 @@ -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003-2012 by JSTOR and the President and Fellows of Harvard College -JHOVE is made available under the GNU Lesser General Public License (LGPL; -see the file LICENSE for details) - -Rev. 1.11, 2013-09-29 - -JHOVE (the JSTOR/Harvard Object Validation Environment, pronounced "jhove") -is an extensible software framework for performing format identification, -validation, and characterization of digital objects. - -o Format identification is the process of determining the format to which a - digital object conforms: "I have a digital object; what format is it?" -o Format validation is the process of determining the level of compliance of a - digital object to the specification for its purported format: "I have an - object purportedly of format F; is it?" -o Format characterization is the process of determing the format-specific - significant properties of an object of a given format: "I have an object of - format F; what are its salient properties?" - -These actions are frequently necessary during routine operation of digital -repositories and for digital preservation activities. - -The output from JHOVE is controlled by output handlers. JHOVE uses an -extensible plug-in architecture; it can be configured at the time of its -invocation to include whatever specific format modules and output handlers -that are desired. The initial release of JHOVE includes modules for -arbitrary byte streams, ASCII and UTF-8 encoded text, AIFF and WAVE audio, -GIF, JPEG, JPEG 2000, TIFF, and PDF; and text and XML output handlers. 
- -The JHOVE project is a collaboration of JSTOR and the Harvard University -Library. Development of JHOVE was funded in part by the Andrew W. Mellon -Foundation. JHOVE is made available under the GNU Lesser General Public -License (LGPL; see the file LICENSE for details). - -JHOVE is currently being maintained by indpendent developers. - -REQUIREMENTS - -1. Java J2SE 1.5 -(JHOVE was originally implemented using the Sun J2SE SDK 1.4.1 and has -been tested to work with 1.5) - -2. If you would like to compile the JHOVE source code, then -Apache Ant, a Java-based build tool is necessary. -Note that the JAVA_HOME environment variable must be set appropriately for -Ant to work properly. -(JHOVE was implemented and tested using Ant 1.5.1.) - -DISTRIBUTION - -The JHOVE distribution package includes: - - jhove/ # JHOVE home directory - COPYING # GNU Lesser General Public License - LICENSE # JHOVE license information - README - RELEASENOTES # JHOVE release notes - bin/ - jhove.jar # JHOVE API package - jhove-handler.jar # Standard output handler package - jhove-module.jar # Standard module package - JhoveApp.jar # JHOVE command line application - JhoveView.jar # JHOVE with Swing GUI front-end - build.xml # Ant configuration file - classes/ - build.xml # Ant configuration file - edu/ ... # JHOVE API packages - ADump.* # AIFF dump utility class - GDump.* # GIF dump utility class - Jhove.* # JHOVE main class - JDump.* # JPEG dump utility class - J2Dump.* # JPEG 2000 dump utility class - PDump.* # PDF dump utility class - TDump.* # TIFF dump utility class - UserHome.* # user.home property utility class - WDump.* # WAVE dump utility class - conf/ - jhove.conf # JHOVE configuration file - jhove.xsd # JHOVE output schema - jhoveConfig.xsd # JHOVE configuration file schema - doc/ - *.html # API documentation - ... - examples/ # Sample files - ascii/ ... - gif/ ... - jpeg/ ... - jpeg2000/ ... - pdf/ ... - tiff/ ... - utf-8/ ... 
- adump* # AIFF dump Bourne shell driver - adump.bat* # AIFF dump DOS shell driver script - gdump* # GIF dump Bourne shell driver - gdump.bat* # GIF dump DOS shell driver script - jdump* # JPEG dump Bourne shell driver - jdump.bat* # JPEG dump DOS shell driver script - j2dump* # JPEG 2000 dump Bourne shell driver - j2dump.bat* # JPEG 2000 dump DOS shell driver - jhove.tmpl* # Template for JHOVE Bourne shell driver script - jhove_bat.tmpl* # Template for JHOVE DOS shell driver script - pdump* # PDF dump Bourne shell driver - pdump.bat* # PDF dump DOS shell driver script - tdump* # TIFF dump Bourne shell driver - tdump.bat* # TIFF dump DOS shell driver script - userhome* # user.home Bourne shell driver - userhome.bat* # user.home DOS shell driver script - wdump* # WAVE dump Bourne shell driver - wdump.bat* # WAVE dump DOS shell driver script - -INSTALLATION - -Edit the configuration file, jhove/conf/jhove.conf, and set the absolute -pathname of the JHOVE home directory and the temporary directory (in which -temporary files are created): - - jhove-home-directory - temporary-directory - -The JHOVE home directory is the top-most directory in the distribution TAR -or ZIP file. On Unix systems, "/var/tmp" is an appropriate temporary -directory; on Windows, "C:\Temp". 
For example, if the distribution TAR -file is disaggregated on a Unix system in the directory "/users/stephen/ -projects", then the configuration file should read: - - /users/stephen/projects/jhove - /var/tmp - -In the JHOVE home directory, copy the JHOVE Bourne shell driver script -template, "jhove.tmpl", to "jhove" (or the equivalent Windows shell -script, "jhove_bat.tmpl" to "jhove.bat"), and set the -JHOVE home directory, Java home directory, and Java interpreter: - - JHOVE_HOME=jhove-home-directory - JAVA_HOME=java-home-directory - JAVA=java-interpreter - -The JAVA_HOME property should provide the absolute pathname of the Java -runtime or SDK installation; JAVA should provide the absolute pathname of the -Java interpreter. For example: - - JHOVE_HOME=/users/stephen/projects/jhove - JAVA_HOME=/usr/local/j2re1.4.1_02 - JAVA=$JAVA_HOME/bin/java - -In the DOS shell driver script, jhove.bat, the equivalent three -variables are: - - SET JHOVE_HOME=jhove-home-directory - SET JAVA_HOME=java-home-directory - SET JAVA=%JAVA_HOME%\bin\java - -For example: - - SET JHOVE_HOME="C:\Program Files\jhove" - SET JAVA_HOME="C:\Program Files\java\j2re1.4.1_02" - SET JAVA=%JAVA_HOME%\bin\java - -The quotation marks are necessary because of the embedded space characters. -On Windows platforms it may also be necessary to add the Java bin subdirectory -to the System PATH environment variable: - - PATH=C:\Program Files\java\j2re1.4.1_02\bin;... - -(For information on setting a Windows environment variable, consult your local -documentation or system administrator.) 
- -USAGE - - java Jhove [-c config] [-m module] [-h handler] [-e encoding] [-H handler] - [-o output] [-x saxclass] [-t tempdir] [-b bufsize] - [-l loglevel] [[-krs] dir-file-or-uri [...]] - -where -c config Configuration file pathname - -m module Module name - -h handler Output handler name (defaults to TEXT) - -e encoding Character encoding used by output handler (defaults to UTF-8) - -H handler About handler name - -o output Output file pathname (defaults to standard output) - -x saxclass SAX parser class (defaults to J2SE default) - -t tempdir Temporary directory in which to create temporary files - -b bufsize Buffer size for buffered I/O (defaults to J2SE 1.4 default) - -l loglevel Logging level - -k Calculate CRC32, MD5, and SHA-1 checksums - -r Display raw data flags, not textual equivalents - -s Format identification based on internal signatures only - dir-file-or-uri Directory or file pathname or URI of formatted content - stream - -All named modules and output handlers must be found on the Java CLASSPATH at -the time of invocation. The JHOVE driver script, jhove/jhove, automatically -sets the CLASSPATH and invokes the Jhove main class: - - jhove [-c config] [-m module] [-h handler] [-e encoding] [-H handler] - [-o output] [-x saxclass] [-t tempdir] [-b bufsize] [-l loglevel] - [[-krs] dir-file-or-uri [...]] - -The following additional programs are available, primarily for testing -and debugging purposes. 
They display a minimally processed, human-readable -version of the contents of AIFF, GIF, JPEG, JPEG 2000, PDF, TIFF, and WAVE -files: - - java ADump aiff-file - java GDump gif-file - java JDump jpeg-file - java J2Dump jpeg2000-file - java PDump pdf-file - java TDump tiff-file - java WDump wave-file - -For convenience, the following driver scripts are also available: - - adump aiff-file - gdump gif-file - jdump jpeg-file - j2dump jpeg2000-file - pdump pdf-file - tdump tiff-file - wdump wave-file - -The JHOVE Swing-based GUI interface can be invoked from a command shell from -the jhove/bin sub-directory: - - java -jar JhoveView.jar -c - -where is the pathname of the JHOVE configuration file. diff --git a/ocrmypdf/jhove/RELEASENOTES b/ocrmypdf/jhove/RELEASENOTES deleted file mode 100644 index f283981a..00000000 --- a/ocrmypdf/jhove/RELEASENOTES +++ /dev/null @@ -1,1755 +0,0 @@ -RELEASENOTES -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003-2009 by JSTOR and the President and Fellows of Harvard College -JHOVE is made available under the GNU Lesser General Public License (LGPL; -see the file LICENSE for details) - -Versions 1.7 and beyond of JHOVE are no longer under the control of Harvard. - -RELEASE NOTES FOR JHOVE 1.11 - -GENERAL - -1. I've added lots of logging code. Calls at the FINE level and lower - don't show up no matter what I do, so I've put them at the INFO level. - The level is set in JhoveBase.java. - -2. All .bat and _bat.tmpl files now have CR-LF line endings. That is, they - do in the gzip and zip archives you download. I'm not sure how - SourceForge will treat files that you download individually, - but hopefully it will have the sense to keep CR-LF when downloading - to a Windows system. - -3. All .bat files now assume JHOVE_HOME is the directory from which they're - run. 
They no longer try to set JAVA_HOME (which was still stuck in - Java 1.4 and probably wasn't working for many people), instead assuming - that the JAVA command is available on the command line. - -4. All javac commands in build.xml files now specify source=1.5 for - compatibility with more recent compilers. - -5. gdumpwin.bat is deleted. It's redundant with gdump.bat and has bugs - of its own. - -PDF MODULE - -1. Fix to PDF module, submitted by willp-bl, may reduce tendency - to run out of heap space on some files. - -RELEASE NOTES FOR JHOVE 1.10 - -GENERAL - -1. The amount of logging code has been increased, mostly at the - DEBUG level. - -2. Further work on generics in Java code. - -3. JhoveView now checks for Java 1.5. Was previously allowing 1.4 even - though it wouldn't work. - -HTML MODULE - -1. XHTML files are processed by the HTML module, which invokes the XML - modules. In this case, the XML module doesn't have the parameters - specified in the JHOVE configuration file and so won't use local - copies of schemas. Starting with this version, the parameters of - the HTML module are passed to the XML module when invoking it. - However, this doesn't work properly (in either module) for a DTD - that invokes additional DTDs by relative URLs. Such DTDs should - be edited to use only absolute URLs. - -PDF MODULE - -1. Failure to get a page object number wasn't being handled cleanly, - resulting in a report of an invalid document without an error message - to explain it (SourceForge bug 49). This has been fixed. - -2. The PDF module unnecessarily uses huge amounts of memory to build - complex structure trees, when it doesn't need to keep the whole - tree in memory to validate it. In the new version, it uses memory - more economically. This should result in the successful processing - of some PDF files that ran out of memory or took hours to process before. - -3. If an annotation isn't a dictionary object, report that explicitly. 
- This happens with some otherwise good files; I can't find any warrant - for it in the PDF spec. - -4. Some efficiency improvements to PDF parser. Increased buffer size from 4K - to 64K. Made Parser.collapseObjectVector more efficient. Parser now - returns pseudo-objects for array and dictionary end instead of throwing - an exception. - -5. Minor cleanup of error reporting. - -6. If an object uses a compression scheme which JHOVE can't deal with, JHOVE - will try to give a specific error message. - -RELEASE NOTES FOR JHOVE 1.9 - -GENERAL - -1. Jhove.java and JhoveView.java now get their version information from - JhoveBase.java. Before it was redundantly kept in three places, and - sometimes they didn't all get updated for a new release. Like in 1.8. - -2. ConfigWriter was in the package edu.harvard.hul.ois.jhove.viewer, which - caused a NoClassDefFoundError if non-GUI configurations didn't include - JhoveViewer.jar in the classpath. It's been moved to - edu.harvard.hul.ois.jhove. - -3. Added script packagejhove.sh and made md5.pl part of the CVS repository - to make packaging for delivery easier. - -4. jhove.bat now simply uses the Java command rather than requiring - the user to set up the Java path. - -5. JhoveView.jar and jhove (the top level shell script) are now forced - by ant to be executable so there are no mistakes. - -6. Warning message given on invalid buffer size string, and minimum - buffer size is 1024. - -7. Configuration file code for adding handlers and giving init strings - to modules was an awful mess that never could have worked. - Major repairs done. - -AIFF MODULE - -1. If an AIFF file was found to be little-endian, the module instance - would stay in little-endian mode for all subsequent files. This - has been fixed. - -TIFF MODULE - -1. TIFF files that had strip or tile offsets but no corresponding byte - counts were throwing an exception all the way to the top level. Now - they're correctly being reported as invalid. - -XML MODULE - -1. 
Cleaned up reporting of schemas. Added some small classes to replace - the use of string arrays for information structures. Made URI comparison - for local schema parameter case-independent. Resolved conflict between - "s" and "schema" parameters. - -WAVE MODULE - -1. Some uncaught exceptions caused the module to throw all the way - back to JhoveBase and not report any result for certain defective - files. These now report the file as not well-formed. - - -RELEASE NOTES FOR JHOVE 1.8 - -GENERAL - -1. If JHOVE doesn't find a configuration file, it creates a default one. - -2. Generics widely added to clean up the code. - -3. build.xml files fixed to force compilation to Java 1.5. - -4. Shell script "jhove" no longer makes you figure out where JAVA_HOME is. - -PDF MODULE - -1. Several errors in checking for PDF-A compliance were corrected. Aside from - fixing some outright bugs, the Contents key for non-text Annotations is - no longer checked, as its presence is only recommended and not required. - -2. Improved code by Håkan Svenson is now used for finding the trailer. - -TIFF MODULE - -1. TIFF tag 700 (XMP) now accepts field type 7 (UNDEFINED) as well as 1 - (BYTE), on the basis of Adobe's XMP spec, part 3. - -2. If compression scheme 6 is used in a file, an InfoMessage will report - that the file uses deprecated compression. - -WAVE MODULE - -1. The Originator Reference property, found in the Broadcast Wave Extension - (BEXT) chunk, is now reported. - -RELEASE NOTES FOR JHOVE 1.7 -2012-08-12 - -GENERAL - -1. JHOVE 1.7, as well as future releases unless noted otherwise, is - released independently of Harvard under the GNU General Public License. - -2. JHOVE now will tell you where it was looking for the config file if it - can't open it. This should help debug configuration problems. - -XML HANDLER - -1. Changes to XmlHandler.java and NisoImageMetadata.java to correct invalid MIX - 2.0 XML output in the value of grayResponseUnit. 
It was previously writing - integers (as in 1.0) rather than the expected enumerated strings. - -PDF MODULE - -1. A situation that caused an infinite loop and eventual memory exhaustion - processing in some PDF files with malformed literals has been fixed. - -RELEASE NOTES FOR JHOVE 1.6 -2011-01-03 - -XML HANDLER AND TEXT HANDLER - -1. The default version of MIX is now 2.0. In earlier versions it was 0.2. - However, MIX 2.0 still isn't supported in the text handler, so it will - produce 1.0 output by default. The XML handler will produce MIX 2.0 - output. - -TIFF MODULE - -1. JHOVE returned a \"String index out of range: 4\" exceptions during - TIFF validation for a tiff contains an empty (not NULL) date/time - field. This has been corrected so that a date/time field with - the wrong length won't be parsed but will report an error instead. - -2. If text tags contain characters which aren't printable ASCII, these - are now output as escape sequences so that invalid XML isn't - output. - -UTF-8 MODULE - -1. Updated to Unicode 6.0.0. - -RELEASE NOTES FOR JHOVE 1.5 -2009-12-17 - -PDF MODULE - -1. An ArrayIndexOutOfBoundsException was thrown on a PDF with an invalid - object number in the cross-reference stream. In JHOVE 1.5, this is - correctly reported as a violation of well-formedness. - -UTF-8 MODULE - -1. With some very simple UTF-8 files, JHOVE handlers would throw an exception - processing them, and the GUI would fail silently. This happened with files - using no UTF-8 blocks. This has been fixed. - -TEXTMD (multiple modules) - -1. TextMD metadata can now optionally be reported. To get this, it's - necessary to edit jhove.conf. TextMD can be enabled on a per-module - basis for HtmlModule, AsciiModule, Utf8Module, and XmlModule. - The element for each chosen module must contain the element - withtextmd=true (no spaces). - -2. The TextMD feature was added by Thomas Ledoux. - - - -RELEASE NOTES FOR JHOVE 1.4 -2009-07-30 - -PDF MODULE - - 1. 
The PDF/A profile has been updated to the final version of - 19005-1:2005(E) and made more thorough. Among the changes: - - a. The set-state and no-op actions disqualify a PDF/A candidate. - - b. The ASCIIHexDecode and ASCII85Decode filters no longer - disqualify a candidate. - - c. Checking of outlines has been added. - - d. Additional checking of Type 1 fonts and symbolic fonts. - - e. Bug fix in checking type 2 subfonts. - - f. An LZW filter in an image object disqualifies a candidate. - - g. The xpacket processing instruction is checked for attributes - which disqualify from PDF/A. - - h. Conformity to implementation limits is checked as a condition - of PDF/A conformity. - -JPEG2000 MODULE - - 1. The pathological case of an image with no components is checked so - it won't cause a crash. - -XML HANDLER - - 1. A reset() function has been added so that if the handler is reused, - it will return to a valid initial state. - -RELEASE NOTES FOR JHOVE 1.3 -2009-06-04 - -GENERAL - - 1. The build.xml files now force compilation to Java 1.4, preventing - accidental distributions that aren't 1.4-compatible. - 2. Spaces are allowed in file paths on Windows, if the path is - enclosed in quotes. This fix had been in version 1.1i, and had been - lost since then. - -PDF MODULE - - 1. According to the PDF 1.6 specification, table 3.4, parameters for a - stream filter can be either a dictionary or the null object. The null - object was treated as an error; it is now allowed. - 2. Object stream handling was seriously buggy, causing rejection of - well-formed and valid files; it's better now. - 3. In PDF 1.4, an outline dictionary unconditionally must have a "First" - and a "Last" entry. JHOVE follows this requirement, declaring a file - invalid if it isn't met. However, PDF 1.6 relaxes the requirement, - applying it only "if there are any open or closed outline entries." - Thus, an empty outline dictionary with no "First" or "Last" entry - is valid. 
It is now accepted (for all PDF versions). - 4. If a page number tree in a PDF file is missing an expected "Nums" - entry, this was being reported as an invalid date. A more appropriate - error message is now given. - -TIFF MODULE - - 1. TIFF tag 33723 (IPTC-NAA) was considered valid only if the data - type is ASCII or LONG. But according to Aware Systems, the valid - types are UNDEFINED and BYTE. All four types are now accepted. - -XML HANDLER - - 1. Omissions in MIX 1.0 and 2.0 output have been fixed. - -RELEASE NOTES FOR JHOVE 1.2 -2009-02-10 - -GENERAL - - 1. A bug has been fixed in CountedInputStream, which could potentially - have caused infinite recursion in some modules. - -HTML MODULE - - 1. An incompatibility with Java 1.6 has been fixed. - -PDF MODULE - - 1. A null pointer exception would be thrown for PDF documents without a - document root tree. This has been fixed. - 2. A source of possible false positives in PDF profiles has been fixed. - 3. Certain checks weren't being done to Type 2 fonts, and some PDF/A - profile violations might have been missed as a result. This has - been fixed. - -WAVE MODULE - - 1. Sub-chunks of the 'adtl' chunk are now constrained to even byte - boundaries. - -XML HANDLER - - 1. MIX 2.0 is now supported. - 2. The URL for the MIX 0.2 schema has changed to reflect the change - on the LOC MIX site. - 3. The handler was sometimes incorrectly reporting whether the - AESAudioMetadata property had an empty value or not. This has - been fixed. - - -RELEASE NOTES FOR JHOVE 1.1 -Rev. 2008-02-22 - -COMMAND-LINE INTERFACE - - 1. Allow filenames with internal spaces if they are quoted on the - command line. - 2. Corrected error setting the Classpath in the Windows Shell script - (jhove.bat) - 3. Corrected error opening the configuration file using the default - GCJ parser in the GNU Java Runtime Environment. - -GUI (SWING) INTERFACE (JHOVE VIEW) - - 1. 
AES metadata properties displayed in the RepInfo window rearranged - slightly to make their ordering consistent with the Text and XML - handlers. - 2. The JhoveView.main() method will now accept a "-c configFile" option - on the command line. The GUI interface can now be invoked by: - - java -jar bin/JhoveView.jar -c configFile - - 3. Corrected error opening the configuration file using the default - GCJ parser in the GNU Java Runtime Environment. - 4. Correct recurrent problems with reading the configuration file on - Windows installations. - -AIFF MODULE - - 1. Correct value for first sample offset by included non-zero offset - defined in the SSND chunk. - 2. Do not report bitrate reduction data for PCM data. - 3. All non-final instance fields and methods are protected, rather than - private. - -ASCII MODULE - - 1. A minimal file containing no line-end characters now does not - produce an empty ASCIIMetadata property, which is invalid against - the JHOVE schema. - 2. Zero-length files are considered not well-formed. - 3. Issue informative message if file contains no printable characters. - 4. All non-final instance fields and methods are protected, rather than - private. - -BYTESTREAM MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - -GIF MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - -HTML MODULE - - 1. The HTMLMetadata block in the module output is only produced if - there is at least one actual metadata property to report. - 2. All non-final instance fields and methods are protected, rather than - private. - -JPEG MODULE - - 1. The JPEG module reports the X and Y sampling frequency for files - meeting the JFIF profile. - 2. The JPEG module reports the pixel aspect ratio for JFIF profile - files for which it is defined. - 3. File handles were not being properly closed when processing embedded - EXIF metadata. 
In cases where JHOVE was invoked against large - numbers of objects this was causing a premature crash due to the - resource leak. - 4. All non-final instance fields and methods are protected, rather than - private. - 5. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 6. Validation errors in embedded EXIF metadata were not being fully - reported. - -JPEG 2000 MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - 2. Files generated by the LuraWave codec are no longer incorrectly identified - as having unrecognized QCC marker segments. - -PDF MODULE - - 1. Date strings are now parsed with strict conformance to the ASN.1 - syntax. - 2. Destinations defined by indirect references to non-existent objects - are assumed to have the value "null". Files containing such - destinations are reported as "well-formed, but not valid". - 3. No attempt is made to display encrypted outline item title - strings. - 4. Catch error if the Info key of the trailer dictionary is not an - indirect reference. - 5. Read entire page tree structure, regardless of its internal - organization. This error may have caused the under-reporting of - page resources, such as fonts and images. - 6. The NISO Compression Scheme for all images using the CCITTFaxDecode - compression filter is now reported properly; previously, the scheme - was always reported as CCITT 1D even if the actual compression - algorithm was CCITT Group 3 or 4. - 7. Properly parse UTF-16 escape characters encoded in double-byte form. - 8. The module properly stops looking for the header comment after 1024 - bytes. - 9. All non-final instance fields and methods are protected, rather than - private. - 10. The number of incremental updates is now reported correctly, rather than - the total number of file trailers, which is one greater than the number - of updates. - 11. Only up to 1000 fonts will be reported. 
After that, an informative - message will be generated. The limit can be set using the parameter - "nxxxx" in the module-specific section of the configuration file: - - - edu.harvard.hul.ois.jhove.module.PdfModule - n2000 - - - 12. Subfonts of Type 0 are now being properly reported. - 13. PDF/A-1b profile is now being properly reported. - 14. Permit trailer info key to be optional. - 15. Additional correction for outline recursion. - 16. Fix treatment of indirect object of Actions. - 17. Correctly handle trailer dictionary without Info entry. - 18. Ignore comments within dictionaries. - -TIFF MODULE - - 1. Corrected error parsing pyramidal TIFF using the SubIFDs tag with a - type of IFD (13) rather than LONG (4). - 7. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 2. All sub-IFDs of a pyramidal TIFF are now properly parsed. - 3. The EXIF GainControl tag (41991) is now correctly identified as - a SHORT, not a RATIONAL, value. - 4. Corrected error in which valid files were reported as being only - well-formed due to an incorrect parsing of the DateTime (306) tag. - 5. Byte-aligned offsets can be considered well-formed if the module - parameter "byteoffset=true" is set in the configuration file: - - - edu.harvard.hul.ois.jhove.module.TiffModule - byteoffset=true - - - 6. All non-final instance fields and methods are protected, rather than - private. - 7. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 8. Using the "-s" option, the TIFF module was incorrectly reporting - signature matches for text files starting with "II". - 9. Validation errors in embedded EXIF metadata were not being fully - reported. - -UTF8 MODULE - - 1. Corrected error under which malformed UTF-8 files containing encoding - sequences starting with a byte value in the range 0xF8 through 0xFF - were reported as well-formed and valid. - 2. 
Zero-length files are considered not well-formed. - 3. Issue informative message if file contains no printable characters. - 4. All non-final instance fields and methods are protected, rather than - private. - -WAVE MODULE - - 1. BWF files now set the correct start time in the AES metadata. - 2. All non-final instance fields and methods are protected, rather than - private. - 3. "cue " and "adtl" chunks are now properly read. - -XML MODULE - - 1. The DTD is assumed to be the first DOCTYPE system ID in the file with an - ".dtd" extension. - 2. All non-final instance fields and methods are protected, rather than - private. - 3. The module correctly handles schemaLocation attributes that do not - provide two whitespace-separated URIs. - -TEXT HANDLER - - 1. AES audio metadata properties rearranged slightly to make their - ordering consistent with the XML schema. - -XML HANDLER - - 1. Correct sample rate formatting in AES Time Code Format (TCF) - temporal references. - 2. Correct face IDREF in AES metadata. - 3. Disallowed control characters are removed from content. - 4. Null property values no longer generate empty elements. - 5. Image technical metadata can be reported in terms of the MIX 1.0 schema, - as opposed to the default reporting against MIX 0.2. To specify the - 1.0 schema include the directive: - - 1.0 - - if the configuration file. - -JHOVE API - - 1. The process() and processFile() methods of the JhoveBase class are now - public, to permit direct access to the API by applications. - 2. Checksum calculations now use buffered I/O uniformly for improved - performance. - 3. All non-final fields and methods in the JhoveBase class are - protected, rather than private. - 4. When invoked with the "-s" option JHOVE now reports the signature - matched format and MIME type. - 5. The processing of files in a directory is now performed in an - alphabetically sorted order. - -ADUMP UTILITY - - 1. Display the field values of known chunks. - -TDUMP UTILITY - - 1. 
New format that sorts all tag definitions by their byte offset and - also displays the byte ranges for image data. - 2. Command line flags permit the suppression of BYTE data display (-b) and - subIFD parsing (-s). - -USERHOME UTILITY - - 1. A new utility program, UserHome, is available to determine the value - of the Java user.home property needed to know where to place the - configuration file. This utility can be invoked by the driver scripts - "userhome" (Bourne shell) or "userhome.bat" (Windows). - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 -Rev. 2005-05-26 - -GENERAL - - 1. Zero length files are now handled properly in all modules. - - 2. Missing start time in audio files is now handled properly in all - audio modules. - - 3. Miscellaneous bug fixes, enhancements, and documentation updates. - -AIFF MODULE - - 1. Corrected error causing BitrateReduction to be incorrectly reported - for uncompressed PCM audio. - - -JPEG2000 MODULE - - 1. The module now validates the enumerated ICC profile types in the - Color Specification Box. In the JP2 profile, an unrecognized ICC - profile type marks the file as not well formed; in the JPX, the file - is merely not valid. - - 2. In the beta 3 release certain invalid JPEG 2000 files were - reported as well formed in the JP2 profile. This has been corrected. - -PDF MODULE - - 1. Following the practice of Acrobat, the PDF module will accept - the "%PDF-1.n" header comment anywhere in the first 1024 bytes of a - file (with appropriate notification via an information message), - rather than requiring that it start at byte offset 0. - - 2. The requirements for the PDF/A profile have been brought into - conformance with the most recent version of the PDF/A specification, - ISO/DIS 19005-1 of 2004-12-22. - - 3. Corrected bug that prevented valid PDF/X-1 files from being - recognized as such. - -WAVE MODULE - - 1. 
Corrected error causing BitrateReduction to be incorrectly reported - for uncompressed PCM audio. - -XML HANDLER - - 1. Dates reported for the NISO Z39.87 - element are now canonicalized to be in proper ISO 8601 form. - - 2. The NISO Z39.87 element is now - reported, if known. - -AUDIT HANDLER - - 1. The current working directory is reported as the "home" - attribute of the element and individual files are reported - as relative pathnames - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 (beta 3) -Rev. 2005-02-04 - -1 GENERAL - - 1. The architecture has been modified to simplify the use of JHOVE - with new "front ends." The new JhoveBase class is used in - conjunction with the App class to incorporate nearly all the - work of setting up a JHOVE instance. The main Jhove class and the App - class are now smaller than before. - - 2. Checksums were often being reported with incorrect values due to - an output formatting error that dropped zeroes. This has been fixed. - - 3. New utilities GDUMP and JDUMP created for GIF and JPEG documents. - - 4. Error messages are more consistently factored into submessages. - This allows messages indicating the same type of error to - be more readily grouped. - - 5. Some modules were reporting a MIME type for a document that is - not well-formed. This no longer occurs. - - 6. Duplicate reporting of AES BitDepth has been suppressed. - - 7. New module for HTML format. Be sure to update the configuration - file, jhove/conf/jhove.conf, to include the module: - - ... - - edu.harvard.hul.ois.jhove.module.HtmlModule - - ... - - 8. The AES audio metadata representation has been updated to - conform with schema version 1.02b (pre-release). - - 9. New property, sigMatches, has been added to RepInfo. This - records which module(s) regarded the signature of the document as a - match, even if the document was not well-formed. 
This is useful in - identifying broken documents that are reported as ASCII or Bytestream. - - 10. The logging API is supported, permitting the generation of - debugging messages. - - 11. All modules are now non-final, so that they can be subclassed by - adventurous users. - - 12. The -p and -P arguments of the command line are no longer - supported. Instead, the equivalent parameters can be - provided to all variants of JHOVE (including those which - don't take a command line) by specifying a element - within the element of the configuration file. - Example: - - - edu.harvard.hul.ois.jhove.module.PdfModule - a - f - p - - -2 JHOVE COMMAND-LINE INTERFACE - - 1. The JHOVE command-line interface can now accept directory names, - as well as file pathnames and URIs: - - java Jhove [-c config] [-m module] [-h handler] [-e encoding] - [-H handler] [-o output] [-x saxclass] [-t tempdir] - [-b bufsize] [-l loglevel] [[-krs] dir-file-or-uri [...]] - - All of the files in the directories are processed in a - depth-first recursive descent. - -3 JHOVEVIEWER (SWING GUI) INTERFACE - - 1. The JhoveViewer class now allows dragging of a directory or of - multiple files, and the output for all files is presented in a single - window. This significantly reduces the window clutter. - - 2. The JhoveViewer presents the module menu in alphabetical order - rather than configuration file order. - - 3. The JhoveViewer was failing to report some submessages. This is fixed. - - 4. The JhoveViewer was failing silently on certain URL errors; it - now puts up an error alert. - - 5. If an empty module class name is added in the Configuration - dialog, it is ignored. - -4 AIFF MODULE - - 1. Descriptive properties added. - - 2. Checksum was sometimes missing; fixed. - - 3. Specification URL added to descriptive information. - - 4. Reported MIME type changed to 'audio/x-aiff' from 'application/aiff'. - -5 GIF MODULE - - 1. BitsPerSample is now reported. - -6 JPEG MODULE - - 1. 
Errors occurring when parsing an optional EXIF segment were not - being reported. This problem manifested itself by incorrectly - reporting that the JPEG file is not well-formed. - - 2. Array size bug in BitsPerSample fixed. - -7 JPEG2000 MODULE - - 1. Specification information added for ITU. - - 2. Errors in parsing of an EXIF segment are now reported. - -8 PDF MODULE - - 1. In certain instances the module was inappropriately reporting - well-formed PDF files as being non-well-formed, indicating - (incorrectly) that the file does not contain a trailer. - - 2. Fixed a NullPointerException being thrown with a defective page - root tree. - - 3. Certain broken cross-reference tables would throw the module - into a loop. This is fixed. - - 4. Problems in XMP data that triggered a SAX error were being - reported to standard output as a "fatal error." They are now properly - reported. - - 5. Error in offset reporting fixed. - - 6. Now reports FontFile2 and FontFile3. - - 7. File trailers are now found more reliably. - - 8. PDF/A profile updated to latest draft proposal, ISO/CD 19005-1 - (2004-09-20). - - 9. Parameters that would have been specified by the -p argument - of the command line are now specified by the element - in the configuration file. The sense of these parameters - has been reversed; by default, the PDF module presents - the maximum amount of information unless suppressed by - including the characters a, p, f, or o in the parameter value(s). - -9 TIFF MODULE - - 1. Adobe DNG tags are recognized, and a DNG profile has been added. - - 2. Bug in DATETIME checking fixed. - - 3. Changes in validity tests for PhotometricInterpretation, - SamplesPerPixel and BitsPerSample. - - 4. Corrected spurious null values for some properties. - - 5. Tag data type checking was badly broken, now fixed. - -10 WAVE MODULE - - 1. Type 'exif' recognized in LIST chunk. - - 2. Format and signature information updated. - - 3. Checksum was sometimes missing; fixed. - - 4. 
Reported MIME type changed to 'audio/x-wave' from 'audio/x-wav'. - -11 XML MODULE - - 1. Now reports 1.0 and 1.1 as versions rather than profiles. - - 2. Reported MIME type changed to 'text/xml' from 'application/xml'. - - 3. A base URL for DTD's may now be specified using the - element. The URL must be preceded by the letter b - to distinguish it from potential future parameters, e.g., - - - edu.harvard.hul.ois.jhove.module.XmlModule - bhttp://www.example.com/ - - -12 XML HANDLER - - 1. The "xsi" namespace is now defined in the NISO Image Metadata - and AES Audio Metadata elements. This - allows these segments to validate when extracted from the JHOVE output - document. - - 2. The element is properly named; it - had been improperly displayed as . - - 3. X and YSamplingFrequency are reported as positive integers - ("600"), not ratios ("600/1"), for consistency with the MIX schema. - - 4. An empty Properties element in the XML handler is now suppressed. - -13 GDUMP UTILITY - - 1. New utility to dump GIF files in human-readable form. - -14 JDUMP UTILITY - - 1. New utility to dump JPEG files in human-readable form. - -15 TDUMP UTILITY - - 1. The output format has changed slightly, e.g. - - 00000000: "II" (little endian) 42 - 00000008: IFD 1 with 15 entries - 00000034: 254 (NewSubFileType) LONG 1 = 0 - 00000046: 256 (ImageWidth) LONG 1 = 2948 - 00000058: 257 (ImageLength) LONG 1 = 4620 - ... - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 (beta 2) -Rev. 2004-07-19 - -1. GENERAL - - 1.1 Multiple files can now be specified in command line. - - jhove ... [[-krs] file-or-uri ...] - - A single output document (XML or text) will be generated for a - set of files specified in a command line. - - 1.2 API version information is now available through methods in the - App class. - - 1.3 AESAudioMetadata property has been added for sound formats. 
The - new PropertyPath class facilitates the extraction of Properties - by applications that use the JHOVE API. - - 1.4 The ErrorMessage and InfoMessage classes now support a submessage - string for more flexible message factoring. - - 1.5 The SAX parser class may now be specified in the jhove.properties - file in the property "edu.harvard.hul.ois.jhove.saxClass". - -2. GRAPHIC USER INTERFACE (JhoveView) - - 2.1 Supports drag and drop of directories; subdirectories are - processed recursively. - - 2.2 The menu option "File > Close document windows" closes all document - windows. - -3. MODULES (GENERAL) - - 3.1 Performance has been improved in all modules. - - 3.2 New modules for JPEG 2000, AIFF, and WAVE formats. Be sure to - update the configuration file, jhove/conf/jhove.conf, to include - these modules: - - ... - - edu.harvard.hul.ois.jhove.module.AiffModule - - - edu.harvard.hul.ois.jhove.module.WaveModule - - - edu.harvard.hul.ois.jhove.module.Jpeg2000Module - - ... - - 3.3 Bug reading unsigned integers has been fixed. - -4. PDF MODULE - - 4.1 More information provided about encryption keys. - - 4.2 UserAccess property now shows "No permissions" if no bits are - set. - -5. GIF MODULE - - 5.1 Unexpected EOF is now handled cleanly. - -6. JPEG MODULE - - 6.1 Exif data exception properly thrown. - -7. TIFF MODULE - - 7.1 Identification of Exif profile has been improved. - - 7.2 Photoshop tags 34377 and 50255 are now recognized. - - 7.3 Bug in handling ExtraSamples tag fixed. - - 7.4 Bug in determining valid date/time formats; the range for hours was - incorrectly constrained to 1-24, rather than 0-24. - -8. XML MODULE - - 8.1 If no encoding is specified, encoding is now reported as UTF-8. - - 8.2 Catches and reports UTFDataFormatException. - - 8.3 A greater range of parsers (including Xerces) now will do - schema validation. - -9. XML HANDLER - - 9.1 Omitted values in NisoImageMetadata were being reported in XML - in some cases as default values (e.g., -1). 
These have been - suppressed. - - 9.2 element was inappropriately nested underneath - the element. - - 9.3 The "subMessage" attribute is now properly defined in the jhove.xsd - schema. -======= -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003-2009 by JSTOR and the President and Fellows of Harvard College -JHOVE is made available under the GNU Lesser General Public License (LGPL; -see the file LICENSE for details) - -RELEASE NOTES FOR JHOVE 1.5 -2009-12-17 - -PDF MODULE - -1. An ArrayIndexOutOfBoundsException was thrown on a PDF with an invalid - object number in the cross-reference stream. In JHOVE 1.5, this is - correctly reported as a violation of well-formedness. - -UTF-8 MODULE - -1. With some very simple UTF-8 files, JHOVE handlers would throw an exception - processing them, and the GUI would fail silently. This happened with files - using no UTF-8 blocks. This has been fixed. - -TEXTMD (multiple modules) - -1. TextMD metadata can now optionally be reported. To get this, it's - necessary to edit jhove.conf. TextMD can be enabled on a per-module - basis for HtmlModule, AsciiModule, Utf8Module, and XmlModule. - The element for each chosen module must contain the element - withtextmd=true (no spaces). - -2. The TextMD feature was added by Thomas Ledoux. - - - -RELEASE NOTES FOR JHOVE 1.4 -2009-07-30 - -PDF MODULE - - 1. The PDF/A profile has been updated to the final version of - 19005-1:2005(E) and made more thorough. Among the changes: - - a. The set-state and no-op actions disqualify a PDF/A candidate. - - b. The ASCIIHexDecode and ASCII85Decode filters no longer - disqualify a candidate. - - c. Checking of outlines has been added. - - d. Additional checking of Type 1 fonts and symbolic fonts. - - e. Bug fix in checking type 2 subfonts. - - f. An LZW filter in an image object disqualifies a candidate. - - g. The xpacket processing instruction is checked for attributes - which disqualify from PDF/A. - - h. 
Conformity to implementation limits is checked as a condition - of PDF/A conformity. - -JPEG2000 MODULE - - 1. The pathological case of an image with no components is checked so - it won't cause a crash. - -XML HANDLER - - 1. A reset() function has been added so that if the handler is reused, - it will return to a valid initial state. - -RELEASE NOTES FOR JHOVE 1.3 -2009-06-04 - -GENERAL - - 1. The build.xml files now force compilation to Java 1.4, preventing - accidental distributions that aren't 1.4-compatible. - 2. Spaces are allowed in file paths on Windows, if the path is - enclosed in quotes. This fix had been in version 1.1i, and had been - lost since then. - -PDF MODULE - - 1. According to the PDF 1.6 specification, table 3.4, parameters for a - stream filter can be either a dictionary or the null object. The null - object was treated as an error; it is now allowed. - 2. Object stream handling was seriously buggy, causing rejection of - well-formed and valid files; it's better now. - 3. In PDF 1.4, an outline dictionary unconditionally must have a "First" - and a "Last" entry. JHOVE follows this requirement, declaring a file - invalid if it isn't met. However, PDF 1.6 relaxes the requirement, - applying it only "if there are any open or closed outline entries." - Thus, an empty outline dictionary with no "First" or "Last" entry - is valid. It is now accepted (for all PDF versions). - 4. If a page number tree in a PDF file is missing an expected "Nums" - entry, this was being reported as an invalid date. A more appropriate - error message is now given. - -TIFF MODULE - - 1. TIFF tag 33723 (IPTC-NAA) was considered valid only if the data - type is ASCII or LONG. But according to Aware Systems, the valid - types are UNDEFINED and BYTE. All four types are now accepted. - -XML HANDLER - - 1. Omissions in MIX 1.0 and 2.0 output have been fixed. - -RELEASE NOTES FOR JHOVE 1.2 -2009-02-10 - -GENERAL - - 1. 
A bug has been fixed in CountedInputStream, which could potentially - have caused infinite recursion in some modules. - -HTML MODULE - - 1. An incompatibility with Java 1.6 has been fixed. - -PDF MODULE - - 1. A null pointer exception would be thrown for PDF documents without a - document root tree. This has been fixed. - 2. A source of possible false positives in PDF profiles has been fixed. - 3. Certain checks weren't being done to Type 2 fonts, and some PDF/A - profile violations might have been missed as a result. This has - been fixed. - -WAVE MODULE - - 1. Sub-chunks of the 'adtl' chunk are now constrained to even byte - boundaries. - -XML HANDLER - - 1. MIX 2.0 is now supported. - 2. The URL for the MIX 0.2 schema has changed to reflect the change - on the LOC MIX site. - 3. The handler was sometimes incorrectly reporting whether the - AESAudioMetadata property had an empty value or not. This has - been fixed. - - -RELEASE NOTES FOR JHOVE 1.1 -Rev. 2008-02-22 - -COMMAND-LINE INTERFACE - - 1. Allow filenames with internal spaces if they are quoted on the - command line. - 2. Corrected error setting the Classpath in the Windows Shell script - (jhove.bat) - 3. Corrected error opening the configuration file using the default - GCJ parser in the GNU Java Runtime Environment. - -GUI (SWING) INTERFACE (JHOVE VIEW) - - 1. AES metadata properties displayed in the RepInfo window rearranged - slightly to make their ordering consistent with the Text and XML - handlers. - 2. The JhoveView.main() method will now accept a "-c configFile" option - on the command line. The GUI interface can now be invoked by: - - java -jar bin/JhoveView.jar -c configFile - - 3. Corrected error opening the configuration file using the default - GCJ parser in the GNU Java Runtime Environment. - 4. Correct recurrent problems with reading the configuration file on - Windows installations. - -AIFF MODULE - - 1. 
Correct value for first sample offset by included non-zero offset - defined in the SSND chunk. - 2. Do not report bitrate reduction data for PCM data. - 3. All non-final instance fields and methods are protected, rather than - private. - -ASCII MODULE - - 1. A minimal file containing no line-end characters now does not - produce an empty ASCIIMetadata property, which is invalid against - the JHOVE schema. - 2. Zero-length files are considered not well-formed. - 3. Issue informative message if file contains no printable characters. - 4. All non-final instance fields and methods are protected, rather than - private. - -BYTESTREAM MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - -GIF MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - -HTML MODULE - - 1. The HTMLMetadata block in the module output is only produced if - there is at least one actual metadata property to report. - 2. All non-final instance fields and methods are protected, rather than - private. - -JPEG MODULE - - 1. The JPEG module reports the X and Y sampling frequency for files - meeting the JFIF profile. - 2. The JPEG module reports the pixel aspect ratio for JFIF profile - files for which it is defined. - 3. File handles were not being properly closed when processing embedded - EXIF metadata. In cases where JHOVE was invoked against large - numbers of objects this was causing a premature crash due to the - resource leak. - 4. All non-final instance fields and methods are protected, rather than - private. - 5. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 6. Validation errors in embedded EXIF metdata were not being fully - reported. - -JPEG 2000 MODULE - - 1. All non-final instance fields and methods are protected, rather than - private. - 2. 
Files generated by the LuraWave codec are no longer incorrecly identified - as having unrecognized QCC marker segments. - -PDF MODULE - - 1. Date strings are now parsed with strict conformance to the ASN.1 - syntax. - 2. Destinations defined by indirect references to non-existent objects - are assumed to have the value "null". Files containing such - destinations are reported as "well-formed, but not valid". - 3. No attempt is made to display encrypted outline item title strings are - not displayed. - 4. Catch error if the Info key of the trailer dictionary is not an - indirect reference. - 5. Read entire page tree structure, regardless of its internal - organization. This error may have caused the under reporting of - page resources, such as fonts and images. - 6. The NISO Compression Scheme for all images using the CCITTFaxDecode - compression filter is now reported properly; previously, the scheme - was always reported as CCITT 1D even if the actual compression - algorithm was CCITT Group 3 or 4. - 7. Properly parse UTF-16 escape characters encoded in double-byte form. - 8. The module properly stops looking for the header comment after 1024 - bytes. - 9. All non-final instance fields and methods are protected, rather than - private. - 10. The number of incremental updates is now reported correctly, rather than - the total number of file trailers, which is one greater than the number - of updates. - 11. Only up to 1000 fonts will be reported. After that, an informative - message will be generated. The limit can be set using the parameter - "nxxxx" in the module-specific section of the configuration file: - - - edu.harvard.hul.ois.jhove.module.PdfModule - n2000 - - - 12. Subfonts of Type 0 are now being properly reported. - 13. PDF/A-1b profile is now being properly reported. - 14. Permit trailer info key to be optional. - 15. Additional correction for outline recursion. - 16. Fix treatment of indirect object of Actions. - 17. 
Correctly handle trailer dictionary without Info entry. - 18. Ignore comments within dictionaries. - -TIFF MODULE - - 1. Corrected error parsing pyramidal TIFF using the SubIFDs tag with a - type of IFD (13) rather than LONG (4). - 7. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 2. All sub-IFDs of a pyramidal TIFF are now properly parsed. - 3. The EXIF GainControl tag (41991) is now correctly identified as - a SHORT, not a RATIONAL, value. - 4. Corrected error in which valid files were reported as being only - well-formed due to an incorrect parsing of the DateTime (306) tag. - 5. Byte-aligned offsets can be considered well-formed if the module - parameter "byteoffset=true" is set in the configuration file: - - - edu.harvard.hul.ois.jhove.module.TiffModule - byteoffset=true - - - 6. All non-final instance fields and methods are protected, rather than - private. - 7. Correct parsing of the EXIF "subsecTimeOriginal" (37251) and - "subsecTimeDigitized" (37522) properties. - 8. Using the "-s" option, the TIFF module was incorrectlly reporting - signature matches for text files starting with "II". - 9. Validation errors in embedded EXIF metdata were not being fully - reported. - -UTF8 MODULE - - 1. Corrected error under which malformed UTF-8 files containing encoding - sequences starting with a byte value in the range 0xF8 through 0xFF - were reported as well-formed and valid. - 2. Zero-length files are considered not well-formed. - 3. Issue informative message if file contains no printable characters. - 4. All non-final instance fields and methods are protected, rather than - private. - -WAVE MODULE - - 1. BWF files now set the correct start time in the AES metadata. - 2. All non-final instance fields and methods are protected, rather than - private. - 3. "cue " and "adtl" chunks are now properly read. - -XML MODULE - - 1. 
The DTD is assumed to be the first DOCTYPE system ID in the file with an - ".dtd" extension. - 2. All non-final instance fields and methods are protected, rather than - private. - 3. The module correctly handles schemaLocation attributes that do not - provide two whitespace-separated URIs. - -TEXT HANDLER - - 1. AES audio metadata properties rearranged slightly to make their - ordering consistent with the XML schema. - -XML HANDLER - - 1. Correct sample rate formatting in AES Time Code Format (TCF) - temporal references. - 2. Correct face IDREF in AES metadata. - 3. Disallowed control characters are removed from content. - 4. Null property values no longer generate empty elements. - 5. Image technical metadata can be reported in terms of the MIX 1.0 schema, - as opposed to the default reporting against MIX 0.2. To specify the - 1.0 schema include the directive: - - 1.0 - - if the configuration file. - -JHOVE API - - 1. The process() and processFile() methods of the JhoveBase class are now - public, to permit direct access to the API by applications. - 2. Checksum calculations now use buffered I/O uniformly for improved - performance. - 3. All non-final fields and methods in the JhoveBase class are - protected, rather than private. - 4. When invoked with the "-s" option JHOVE now reports the signature - matched format and MIME type. - 5. The processing of files in a directory is now performed in an - alphabetically sorted order. - -ADUMP UTILITY - - 1. Display the field values of known chunks. - -TDUMP UTILITY - - 1. New format that sorts all tag definitions by their byte offset and - also displays the byte ranges for image data. - 2. Command line flags permit the suppression of BYTE data display (-b) and - and subIFD parsing (-s). - -USERHOME UTILITY - - 1. A new utility program, UserHome, is available to determine the value - of the Java user.home property needed to know where to place the - configuration file. 
This utility can be invoked by the driver scripts - "userhome" (Bourne shell) or "userhome.bat" (Windows). - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 -Rev. 2005-05-26 - -GENERAL - - 1. Zero length files are now handled properly in all modules. - - 2. Missing start time in audio files is now handled property in all - audio modules. - - 3. Miscellaneous bug fixes, enhancements, and documentation updates. - -AIFF MODULE - - 1. Corrected error causing BitrateReduction to be incorrectly reported - for uncompressed PCM audio. - - -JPEG2000 MODULE - - 1. The module now validates the enumerated ICC profile types in the - Color Specification Box. In the JP2 profile, an unrecognized ICC - profile type marks the file as not well formed; in the JPX, the file - is merely not valid. - - 2. In the beta 3 release certain invalid JPEG 2000 files were - reported as well formed in the JP2 profile. This has been corrected. - -PDF MODULE - - 1. Following the practice of Acrobar, the PDF module will accept - the "%PDF-1.n" header comment anywhere in the first 1024 bytes of a - file (with appropriate notification via an information message), - rather than requiring that it start at byte offset 0. - - 2. The requirements for the PDF/A profile have been brought into - conformance with the most recent version of the PDF/A specification, - ISO/DIS 19005-1 of 2004-12-22. - - 3. Corrected bug that prevented valid PDF/X-1 files from being - recognized as such. - -WAVE MODULE - - 1. Corrected error causing BitrateReduction to be incorrectly reported - for uncompressed PCM audio. - -XML HANDLER - - 1. Dates reported for the NISO Z39.87 - element are now canonicalized to be in proper ISO 8601 form. - - 2. The NISO Z39.87 element is now - reported, if known. - -AUDIT HANDLER - - 1. 
The current working directory is reported as the "home" - attribute of the element and individual files are reported - as relative pathnames - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 (beta 3) -Rev. 2005-02-04 - -1 GENERAL - - 1. The architecture has been modified to simplify the use of JHOVE - with new "front ends." The new JhoveBase class is used in - conjunction with the App class to incorporate nearly all the - work of setting up a JHOVE instance. The main Jhove class and the App - class are now smaller than before. - - 2. Checksums were often being reported with incorrect values due to - an output formatting error that dropped zeroes. This has been fixed. - - 3. New utilities GDUMP and JDUMP created for GIF and JPEG documents. - - 4. Error messages are more consistently factored into submessages. - This allows messages indicating the same type of error to - be more readily grouped. - - 5. Some modules were reporting a MIME type for a document that is - not well-formed. This no longer occurs. - - 6. Duplicate reporting of AES BitDepth has been suppressed. - - 7. New module for HTML format. Be sure to update the configuration - file, jhove/conf/jhove.conf, to include the module: - - ... - - edu.harvard.hul.ois.jhove.module.HtmlModule - - ... - - 8. The AES audio metadata representation has been updated to - conform with schema version 1.02b (pre-release). - - 9. New property, sigMatches, has been added to RepInfo. This - records which module(s) regarded the signature of the document as a - match, even if the document was not well-formed. This is useful in - identifying broken documents that are reported as ASCII or Bytestream. - - 10. The logging API is supported, permitting the generation of - debugging messages. - - 11. All modules are now non-final, so that they can be subclassed by - adventurous users. - - 12. The -p and -P arguments of the command line are no longer - supported. 
Instead, the equivalent parameters can be - provided to all variants of JHOVE (including those which - don't take a command line) by specifying a element - within the element of the configuration file. - Example: - - - edu.harvard.hul.ois.jhove.module.PdfModule - a - f - p - - -2 JHOVE COMMAND-LINE INTERFACE - - 1. The JHOVE command-line interface can now accept directory names, - as well as file pathnames and URIs: - - java Jhove [-c config] [-m module] [-h handler] [-e encoding] - [-H handler] [-o output] [-x saxclass] [-t tempdir] - [-b bufsize] [-l loglevel] [[-krs] dir-file-or-uri [...]] - - All of the files in the directories are processed in a - depth-first recursive descent. - -3 JHOVEVIEWER (SWING GUI) INTERFACE - - 1. The JhoveViewer class now allows dragging of a directory or of - multiple files, and the output for all files is presented in a single - window. This significantly reduces the window clutter. - - 2. The JhoveViewer presents the module menu in alphabetical order - rather than configuration file order. - - 3. The JhoveViewer was failing to report some submessages. This is fixed. - - 4. The JhoveViewer was failing silently on certain URL errors; it - now puts up an error alert. - - 5. If an empty module class name is added in the Configuration - dialog, it is ignored. - -4 AIFF MODULE - - 1. Descriptive properties added. - - 2. Checksum was sometimes missing; fixed. - - 3. Specification URL added to descriptive information. - - 4. Reported MIME type changed to 'audio/x-aiff' from 'application/aiff'. - -5 GIF MODULE - - 1. BitsPerSample is now reported. - -6 JPEG MODULE - - 1. Errors occurring when parsing an optional EXIF segment were not - being reported. This problem manifested itself by incorrectly - reporting that the JPEG file is not well-formed. - - 2. Array size bug in BitsPerSample fixed. - -7 JPEG2000 MODULE - - 1. Specification information added for ITU. - - 2. Errors in parsing of an EXIF segment are now reported. 
- -8 PDF MODULE - - 1. In certain instances the module was inappropriately reporting - well-formed PDF files as being non-well-formed, indicating - (incorrectly) that the file does not contain a trailer. - - 2. Fixed a NullPointerException being thrown with a defective page - root tree. - - 3. Certain broken cross-reference tables would throw the module - into a loop. This is fixed. - - 4. Problems in XMP data that triggered a SAX error were being - reported to standard output as a "fatal error." They are now properly - reported. - - 5. Error in offset reporting fixed. - - 6. Now reports FontFile2 and FontFile3. - - 7. File trailers are now found more reliably. - - 8. PDF/A profile updated to latest draft proposal, ISO/CD 19005-1 - (2004-09-20). - - 9. Parameters that would have been specified by the -p argument - of the command line are now specified by the element - in the configuration file. The sense of these parameters - has been reversed; by default, the PDF module presents - the maximum amount of information unless suppressed by - including the characters a, p, f, or o in the parameter value(s). - -9 TIFF MODULE - - 1. Adobe DNG tags are recognized, and a DNG profile has been added. - - 2. Bug in DATETIME checking fixed. - - 3. Changes in validity tests for PhotometricInterpretation, - SamplesPerPixel and BitsPerSample. - - 4. Corrected spurious null values for some properties. - - 5. Tag data type checking was badly broken, now fixed. - -10 WAVE MODULE - - 1. Type 'exif' recognized in LIST chunk. - - 2. Format and signature information updated. - - 3. Checksum was sometimes missing; fixed. - - 4. Reported MIME type changed to 'audio/x-wave' from 'audio/x-wav'. - -11 XML MODULE - - 1. Now reports 1.0 and 1.1 as versions rather than profiles. - - 2. Reported MIME type changed to 'text/xml' from 'application/xml'. - - 3. A base URL for DTD's may now be specified using the - element. 
The URL must be preceded by the letter b - to distinguish it from potential future parameters, e.g., - - - edu.harvard.hul.ois.jhove.module.XmlModule - bhttp://www.example.com/ - - -12 XML HANDLER - - 1. The "xsi" namespace is now defined in the NISO Image Metadata - and AES Audio Metadata elements. This - allows these segments to validate when extracted from the JHOVE output - document. - - 2. The element is properly named; it - had been improperly displayed as . - - 3. X and YSamplingFrequency are reported as positive integers - ("600"), not ratios ("600/1"), for consistency with the MIX schema. - - 4. An empty Properties element in the XML handler is now suppressed. - -13 GDUMP UTILITY - - 1. New utility to dump GIF files in human-readable form. - -14 JDUMP UTILITY - - 1. New utility to dump JPEG files in human-readable form. - -15 TDUMP UTILITY - - 1. The output format has changed slightly, e.g. - - 00000000: "II" (little endian) 42 - 00000008: IFD 1 with 15 entries - 00000034: 254 (NewSubFileType) LONG 1 = 0 - 00000046: 256 (ImageWidth) LONG 1 = 2948 - 00000058: 257 (ImageLength) LONG 1 = 4620 - ... - -************************************************************************ - -RELEASE NOTES FOR JHOVE 1.0 (beta 2) -Rev. 2004-07-19 - -1. GENERAL - - 1.1 Multiple files can now be specified in command line. - - jhove ... [[-krs] file-or-uri ...] - - A single output document (XML or text) will be generated for a - set of files specified in a command line. - - 1.2 API version information is now available through methods in the - App class. - - 1.3 AESAudioMetadata property has been added for sound formats. The - new PropertyPath class facilitates the extraction of Properties - by applications that use the JHOVE API. - - 1.4 The ErrorMessage and InfoMessage classes now support a submessage - string for more flexible message factoring. - - 1.5 The SAX parser class may now be specified in the jhove.properties - file in the property "edu.harvard.hul.ois.jhove.saxClass". 
- -2. GRAPHIC USER INTERFACE (JhoveView) - - 2.1 Supports drag and drop of directories; subdirectories are - processed recursively. - - 2.2 The menu option "File > Close document windows" closes all document - windows. - -3. MODULES (GENERAL) - - 3.1 Performance has been improved in all modules. - - 3.2 New modules for JPEG 2000, AIFF, and WAVE formats. Be sure to - update the configuration file, jhove/conf/jhove.conf, to include - these modules: - - ... - - edu.harvard.hul.ois.jhove.module.AiffModule - - - edu.harvard.hul.ois.jhove.module.WaveModule - - - edu.harvard.hul.ois.jhove.module.Jpeg2000Module - - ... - - 3.3 Bug reading unsigned integers has been fixed. - -4. PDF MODULE - - 4.1 More information provided about encryption keys. - - 4.2 UserAccess property now shows "No permissions" if no bits are - set. - -5. GIF MODULE - - 5.1 Unexpected EOF is now handled cleanly. - -6. JPEG MODULE - - 6.1 Exif data exception properly thrown. - -7. TIFF MODULE - - 7.1 Identification of Exif profile has been improved. - - 7.2 Photoshop tags 34377 and 50255 are now recognized. - - 7.3 Bug in handling ExtraSamples tag fixed. - - 7.4 Bug in determining valid date/time formats; the range for hours was - incorrectly constrained to 1-24, rather than 0-24. - -8. XML MODULE - - 8.1 If no encoding is specified, encoding is now reported as UTF-8. - - 8.2 Catches and reports UTFDataFormatException. - - 8.3 A greater range of parsers (including Xerces) now will do - schema validation. - -9. XML HANDLER - - 9.1 Omitted values in NisoImageMetadata were being reported in XML - in some cases as default values (e.g., -1). These have been - suppressed. - - 9.2 element was inappropriately nested underneath - the element. - - 9.3 The "subMessage" attribute is now properly defined in the jhove.xsd - schema. 
- diff --git a/ocrmypdf/jhove/bin/JhoveApp.jar b/ocrmypdf/jhove/bin/JhoveApp.jar deleted file mode 100644 index d6136b74..00000000 Binary files a/ocrmypdf/jhove/bin/JhoveApp.jar and /dev/null differ diff --git a/ocrmypdf/jhove/bin/JhoveView.jar b/ocrmypdf/jhove/bin/JhoveView.jar deleted file mode 100644 index 8c57eff6..00000000 Binary files a/ocrmypdf/jhove/bin/JhoveView.jar and /dev/null differ diff --git a/ocrmypdf/jhove/bin/README b/ocrmypdf/jhove/bin/README deleted file mode 100644 index 892fd9c0..00000000 --- a/ocrmypdf/jhove/bin/README +++ /dev/null @@ -1,25 +0,0 @@ -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003 by JSTOR and the President and Fellows of Harvard College -JHOVE is made available under the GNU General Public License (see the file -LICENSE for details) - -Rev. 2003-11-25 - -The following jar files are meant to be used for embedding JHOVE functionality -into new applications or systems. - - jhove.jar Contains the JHOVE API interfaces and classes - jhove-module.jar Contains the standard JHOVE modules () - jhove-handler.jar Contains the standard JHOVE output handlers (TEXT and XML) - -The following jar file is meant to be used with the stand-alone JHOVE -application using a command-line interface. It contains the main Jhove class -and the contents of jhove.jar, jhove-module.jar, and jhove-handler.jar. - - JhoveApp.jar - -The following jar file is meant to be used with the stand-alone JHOVE -application using a Swing GUI interface. It contains the main JhoveView class -and the contents of jhove.jar, jhove-module.jar, and jhove-handler.jar. 
- - JhoveView.jar diff --git a/ocrmypdf/jhove/bin/jhove-handler.jar b/ocrmypdf/jhove/bin/jhove-handler.jar deleted file mode 100644 index 8d5509f4..00000000 Binary files a/ocrmypdf/jhove/bin/jhove-handler.jar and /dev/null differ diff --git a/ocrmypdf/jhove/bin/jhove-module.jar b/ocrmypdf/jhove/bin/jhove-module.jar deleted file mode 100644 index 1ba82296..00000000 Binary files a/ocrmypdf/jhove/bin/jhove-module.jar and /dev/null differ diff --git a/ocrmypdf/jhove/bin/jhove.jar b/ocrmypdf/jhove/bin/jhove.jar deleted file mode 100644 index 8fc6078e..00000000 Binary files a/ocrmypdf/jhove/bin/jhove.jar and /dev/null differ diff --git a/ocrmypdf/jhove/build.xml b/ocrmypdf/jhove/build.xml deleted file mode 100644 index 6e595955..00000000 --- a/ocrmypdf/jhove/build.xml +++ /dev/null @@ -1,78 +0,0 @@ - - Project build file - Jhove - JSTOR/Harvard Object Validation Environment - Version 1.0 2004-09-10 - Copyright 2004 by JSTOR and the President and Fellows of Harvard College - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/ocrmypdf/jhove/conf/README b/ocrmypdf/jhove/conf/README deleted file mode 100644 index c518ca28..00000000 --- a/ocrmypdf/jhove/conf/README +++ /dev/null @@ -1,63 +0,0 @@ -JHOVE - JSTOR/Harvard Object Validation Environment -Copyright 2003-2007 by JSTOR and the President and Fellows of Harvard College -JHOVE is made available under the GNU General Public License (see the file -LICENSE for details) - -Rev. 2007-08-30 - -Edit the configuration file, jhove.conf, and set the JHOVE home -directory: - - jhove-home-directory - -and temporary directory: - - temporary-directory - -On most Unix systems, a reasonable temporary directory is "/var/tmp"; -on Windows, "C:\temp". - -The optional - - buffer-size - -element defines the buffer size used for buffer I/O operations. 
- -The optional - - 1.0 - -element specifies that the XML output handler should conform to the -MIX 1.0 schema. The default behavior is for handler output to conform -to the MIX 0.2 schema. - -The optional - - n - -element specifies that JHOVE modules will look for format signatures -in the first bytes of the file. The default value is 1024. - -All class names must be fully qualified with their package name: - - - fully-package-qualified-class-name - optional-initialization-argument - optional-invocation-argument - - -The optional argument is passed to the module once at the time -its class is instantiated. See module-specific documentation for a -description of any initialization options. - -The optional argument is passed to the module every time it is -invoked. See module-specific documentation for a description of any -invocation options. - -The order in which format modules are defined is important; when -performing a format identification operation, JHOVE will search for a -matching module in the order in which the modules are defined in the -configuration file. In general, the modules for more generic formats -should come later in the list. For example, the standard module ASCII -should be defined before the UTF-8 module, since all ASCII objects -are, by definition, UTF-8 objects, but not vice versa. 
diff --git a/ocrmypdf/jhove/conf/jhove-byteoffset=true.conf b/ocrmypdf/jhove/conf/jhove-byteoffset=true.conf deleted file mode 100644 index 515f87b7..00000000 --- a/ocrmypdf/jhove/conf/jhove-byteoffset=true.conf +++ /dev/null @@ -1,47 +0,0 @@ - - - /users/stephen/projects/jhove - utf-8 - /var/tmp - 131072 - 1.0 - 1024 - - edu.harvard.hul.ois.jhove.module.AiffModule - - - edu.harvard.hul.ois.jhove.module.WaveModule - - - edu.harvard.hul.ois.jhove.module.PdfModule - - - edu.harvard.hul.ois.jhove.module.Jpeg2000Module - - - edu.harvard.hul.ois.jhove.module.JpegModule - - - edu.harvard.hul.ois.jhove.module.GifModule - - - edu.harvard.hul.ois.jhove.module.TiffModule - byteoffset=true - - - edu.harvard.hul.ois.jhove.module.XmlModule - - - edu.harvard.hul.ois.jhove.module.HtmlModule - - - edu.harvard.hul.ois.jhove.module.AsciiModule - - - edu.harvard.hul.ois.jhove.module.Utf8Module - - diff --git a/ocrmypdf/jhove/conf/jhove-withTextMD.conf b/ocrmypdf/jhove/conf/jhove-withTextMD.conf deleted file mode 100644 index 189897ed..00000000 --- a/ocrmypdf/jhove/conf/jhove-withTextMD.conf +++ /dev/null @@ -1,51 +0,0 @@ - - - /users/stephen/projects/jhove - utf-8 - /var/tmp - 131072 - 1.0 - 1024 - - edu.harvard.hul.ois.jhove.module.AiffModule - - - edu.harvard.hul.ois.jhove.module.WaveModule - - - edu.harvard.hul.ois.jhove.module.PdfModule - - - edu.harvard.hul.ois.jhove.module.Jpeg2000Module - - - edu.harvard.hul.ois.jhove.module.JpegModule - - - edu.harvard.hul.ois.jhove.module.GifModule - - - edu.harvard.hul.ois.jhove.module.TiffModule - byteoffset=true - - - edu.harvard.hul.ois.jhove.module.XmlModule - withTextMD=true - - - edu.harvard.hul.ois.jhove.module.HtmlModule - withTextMD=true - - - edu.harvard.hul.ois.jhove.module.AsciiModule - withTextMD=true - - - edu.harvard.hul.ois.jhove.module.Utf8Module - withTextMD=true - - diff --git a/ocrmypdf/jhove/conf/jhove.conf b/ocrmypdf/jhove/conf/jhove.conf deleted file mode 100644 index 6cc1672c..00000000 --- 
a/ocrmypdf/jhove/conf/jhove.conf +++ /dev/null @@ -1,45 +0,0 @@ - - - ./jhove/ - utf-8 - /var/tmp - 131072 - - edu.harvard.hul.ois.jhove.module.AiffModule - - - edu.harvard.hul.ois.jhove.module.WaveModule - - - edu.harvard.hul.ois.jhove.module.PdfModule - - - edu.harvard.hul.ois.jhove.module.Jpeg2000Module - - - edu.harvard.hul.ois.jhove.module.JpegModule - - - edu.harvard.hul.ois.jhove.module.GifModule - - - edu.harvard.hul.ois.jhove.module.TiffModule - - - edu.harvard.hul.ois.jhove.module.XmlModule - schema=http://www.example.com/schema;/home/schemas/exampleschema.xsd - - - edu.harvard.hul.ois.jhove.module.HtmlModule - - - edu.harvard.hul.ois.jhove.module.AsciiModule - - - edu.harvard.hul.ois.jhove.module.Utf8Module - - diff --git a/ocrmypdf/jhove/configure.pl b/ocrmypdf/jhove/configure.pl deleted file mode 100644 index 8f48277b..00000000 --- a/ocrmypdf/jhove/configure.pl +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/perl - -######################################################################## -# Jhove - JSTOR/Harvard Object Validation Environment -# Copyright 2004 by JSTOR and the President and Fellows of Harvard College -# -# A Perl script for plugging local path information into the -# various script files of JHOVE, as well as conf/jhove.conf. -# -# This is configured only for Unix (including OS X). -# -# Usage: configure.pl jhove_home_directory [java_home_directory [java_runtime_directory]] -# -# If invoked with no arguments, it will output a usage message. -# -######################################################################## -use File::Copy; - -sub mung { - my $f = $_[0]; - my $bak = $f . "~"; - #If there is no backup file, copy the file to the - #backup. Otherwise work from the backup. - if (!(-e $bak)) { - rename ($f, $bak); - } - open (INFILE, $bak); - open (OUTFILE, ">" . $f); - - #Walks through each line of file, making substitutions. - #Remember that the JAVA_HOME and JAVA arguments are optional. 
- while () { - s/^JHOVE_HOME=.*/JHOVE_HOME=$ARGV[0]/; - if ($narg >= 2) { - s/^JAVA_HOME=.*/JAVA_HOME=$ARGV[1]/; - } - if ($narg >= 3) { - s/^JAVA=.*/JAVA=$ARGV[2]/; - } - print OUTFILE; - } - close (INFILE); - close (OUTFILE); - if (-e $f) { - print ("Fixed " . $f . "\n"); - } -} - - -$narg = $#ARGV + 1; -if ($narg <= 0) { - print "Usage: configure.pl jhove_home_directory [java_home_directory [java_runtime_directory]]\n"; - exit; -} -print "JHOVE_HOME will be set to " . $ARGV[0] . "\n"; -if ($narg >= 2) { - print "JAVA_HOME will be set to " . $ARGV[1] . "\n"; -} -if ($narg >= 3) { - print "JAVA will be set to " . $ARGV[2] . "\n"; -} -mung ("jhove"); -mung ("adump"); -mung ("gdump"); -mung ("jdump"); -mung ("j2dump"); -mung ("pdump"); -mung ("tdump"); -mung ("wdump"); - -#Fix up the config file. We assume that the -#element is all on one line. -if (!(-e "conf/jhove.conf~")) { - rename ("conf/jhove.conf", "conf/jhove.conf~"); -} -open (INFILE, "conf/jhove.conf~"); -open (OUTFILE, ">conf/jhove.conf"); -while () { - s!.*!$ARGV[0]!; - print OUTFILE; -} -close (INFILE); -close (OUTFILE); -if (-e "conf/jhove.conf") { - print "Fixed conf/jhove.conf\n"; -} -exit; - diff --git a/ocrmypdf/jhove/gdump b/ocrmypdf/jhove/gdump deleted file mode 100644 index cb4135d3..00000000 --- a/ocrmypdf/jhove/gdump +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -######################################################################## -# gdump - JSTOR/Harvard Object Validation Environment -# Copyright 2004-2005 by the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Driver script for the GIF dump utility -# -# Usage: gdump file -# -# where file is a GIF file -# -# Configuration constants: - -JHOVE_HOME=/users/stephen/projects/jhove - -JAVA_HOME=/usr/java # Java JRE directory -JAVA=$JAVA_HOME/bin/java # Java interpreter - -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below 
this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. -${JAVA} -classpath $CP GDump $ARGS diff --git a/ocrmypdf/jhove/j2dump b/ocrmypdf/jhove/j2dump deleted file mode 100644 index fe9d377b..00000000 --- a/ocrmypdf/jhove/j2dump +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -######################################################################## -# j2dump - JSTOR/Harvard Object Validation Environment -# Copyright 2004-2005 by the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Driver script for the JPEG 2000 dump utility -# -# Usage: j2dump file -# -# where file is a JPEG file -# -# Configuration constants: - -JHOVE_HOME=/users/stephen/projects/jhove - -JAVA_HOME=/usr/java # Java JRE directory -JAVA=$JAVA_HOME/bin/java # Java interpreter - -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. 
-${JAVA} -classpath $CP J2Dump $ARGS diff --git a/ocrmypdf/jhove/jdump b/ocrmypdf/jhove/jdump deleted file mode 100644 index 9bd24b73..00000000 --- a/ocrmypdf/jhove/jdump +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -######################################################################## -# jdump - JSTOR/Harvard Object Validation Environment -# Copyright 2004-2005 by the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Driver script for the JPEG dump utility -# -# Usage: jdump file -# -# where file is a JPEG file -# -# Configuration constants: - -JHOVE_HOME=/users/stephen/projects/jhove - -JAVA_HOME=/usr/java # Java JRE directory -JAVA=$JAVA_HOME/bin/java # Java interpreter - -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. 
-${JAVA} -classpath $CP JDump $ARGS diff --git a/ocrmypdf/jhove/jhove b/ocrmypdf/jhove/jhove deleted file mode 100644 index f9d49437..00000000 --- a/ocrmypdf/jhove/jhove +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/sh - -######################################################################## -# JHOVE - JSTOR/Harvard Object Validation Environment -# Copyright 2003-2005 by JSTOR and the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Usage: jhove [-c config] [-m module] [-h handler] [-e encoding] [-H handler] -# [-o output] [-x saxclass] [-t tempdir] [-b bufsize] -# [-l loglevel] [[-krs] dir-file-or-uri [...]] -# -# where -c config Configuration file pathname -# -m module Module name -# -h handler Output handler name (defaults to TEXT) -# -e encoding Character encoding of output handler (defaults to UTF-8) -# -H handler About handler name -# -o output Output file pathname (defaults to standard output) -# -x saxclass SAX parser class (defaults to J2SE 1.4 default) -# -t tempdir Temporary directory in which to create temporary files -# -b bufsize Buffer size for buffered I/O (defaults to J2SE 1.4 default) -# -k Calculate CRC32, MD5, and SHA-1 checksums -# -r Display raw data flags, not textual equivalents -# -s Format identification based on internal signatures only -# dir-file-or-uri Directory, file pathname or URI of formatted content -# -# CHANGE for JHOVE 1.8: -# You no longer have to figure out where JAVA_HOME is; that's the -# operating system's job. If the OS tells you it can't find Java, -# adjust your shell's path or revert to the old way (commented out). 
-# Configuration constants: - -#JHOVE_HOME=/users/gary/dev/jhove -JHOVE_HOME=[fill in path to jhove directory] - -JAVA_HOME=/usr/java # Java JRE directory -- change to your local java home -JAVA=$JAVA_HOME/bin/java # Java interpreter -- usually won't need change - -#XTRA_JARS=/users/stephen/xercesImpl.jar -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. -#{JAVA} -classpath $CP Jhove $ARGS -x org.apache.xerces.parsers.SAXParser -#${JAVA} -classpath $CP Jhove $ARGS -# New way, doesn't require you to use JAVA_HOME. -java -classpath $CP Jhove $ARGS \ No newline at end of file diff --git a/ocrmypdf/jhove/jhove.tmpl b/ocrmypdf/jhove/jhove.tmpl deleted file mode 100644 index b48b035b..00000000 --- a/ocrmypdf/jhove/jhove.tmpl +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/sh - -######################################################################## -# JHOVE - JSTOR/Harvard Object Validation Environment -# Copyright 2003-2004 by JSTOR and the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Copy jhove.tmpl to jhove, and replace the value of JHOVE_HOME with -# the path to your jhove directory. 
-# -# Usage: jhove [-c config] [-m module [-p param]] [-h handler [-P param]] -# [-e encoding] [-H handler] [-o output] [-x saxclass] -# [-t tempdir] [-b bufsize] [[-krs] dir-file-or-uri [...]] -# -# where -c config Configuration file pathname -# -m module Module name -# -p param Module-specific parameter -# -h handler Output handler name (defaults to TEXT) -# -P param Handler-specific parameter -# -o output Output file pathname (defaults to standard output) -# -x saxclass SAX parser class (defaults to J2SE 1.4 default) -# -t tempdir Temporary directory in which to create temporary files -# -b bufsize Buffer size for buffered I/O (defaults to J2SE 1.4 default) -# -k Calculate CRC32, MD5, and SHA-1 checksums -# -r Display raw data flags, not textual equivalents -# -s Format identification based on internal signatures only -# dir-file-or-uri Directory, file pathname or URI of formatted content -# -# Configuration constants: - -JHOVE_HOME=[your directory path]/jhove - -JAVA_HOME=/usr/java -JAVA=/usr/bin/java - -#XTRA_JARS=/users/stephen/xercesImpl.jar -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. 
-#{JAVA} -classpath $CP Jhove $ARGS -x org.apache.xerces.parsers.SAXParser -${JAVA} -classpath $CP Jhove $ARGS diff --git a/ocrmypdf/jhove/jhove_bat.tmpl b/ocrmypdf/jhove/jhove_bat.tmpl deleted file mode 100644 index e0723c8a..00000000 --- a/ocrmypdf/jhove/jhove_bat.tmpl +++ /dev/null @@ -1,60 +0,0 @@ -@ECHO OFF -REM JHOVE - JSTOR/Harvard Object Validation Environment -REM Copyright 2003-2005 by JSTOR and the President and Fellows of Harvard College -REM JHOVE is made available under the GNU General Public License (see the -REM file LICENSE for details) -REM -REM Usage: jhove [-c config] [-m module] [-h handler] [-e encoding] -REM [-H handler] [-o output] [-x saxclass] [-t tempdir] -REM [-b bufsize] [-l loglevel] [[-krs] dir-file-or-uri [...]] -REM -REM For Windows systems, copy jhove_bat.tmpl to jhove.bat and change -REM the value of JHOVE_HOME to the path to your jhove directory. -REM -REM where -c config Configuration file pathname -REM -m module Module name -REM -h handler Output handler name (defaults to TEXT) -REM -e encoding Character encoding of output handler (defaults to UTF-8) -REM -H handler About handler name -REM -o output Output file pathname (defaults to standard output) -REM -x saxclass SAX parser class (defaults to J2SE 1.4 default) -REM -t tempdir Temporary directory in which to create temporary files -REM -b bufsize Buffer size for buffered I/O (defaults to J2SE default) -REM -l loglevel Logging level -REM -k Calculate CRC32, MD5, and SHA-1 checksums -REM -r Display raw data flags, not textual equivalents -REM -s Format identification based on internal signatures only -REM dir-file-or-uri Directory, file pathname, or URI of formatted content -REM -REM Configuration constants: -REM JHOVE_HOME Jhove installation directory -REM JAVA_HOME Java JRE directory -REM JAVA Java interpreter -REM EXTRA_JARS Extra jar files to add to CLASSPATH - -REM SET JHOVE_HOME="C:\Program Files\jhove" -SET JHOVE_HOME="[your directory path]\jhove" - -SET EXTRA_JARS= - 
-REM NOTE: Nothing below this line should be edited -REM ######################################################################### - - -SET CP=%JHOVE_HOME%\bin\JhoveApp.jar -IF "%EXTRA_JARS%"=="" GOTO FI - SET CP=%CP%:%EXTRA_JARS -:FI - -REM Retrieve a copy of all command line arguments to pass to the application - -SET ARGS= -:WHILE -IF "%1"=="" GOTO LOOP - SET ARGS=%ARGS% %1 - SHIFT - GOTO WHILE -:LOOP - -REM Set the CLASSPATH and invoke the Java loader -JAVA -classpath %CP% Jhove %ARGS% diff --git a/ocrmypdf/jhove/lib/OdfModule.jar b/ocrmypdf/jhove/lib/OdfModule.jar deleted file mode 100644 index 0f73d7dc..00000000 Binary files a/ocrmypdf/jhove/lib/OdfModule.jar and /dev/null differ diff --git a/ocrmypdf/jhove/lib/PngModule.jar b/ocrmypdf/jhove/lib/PngModule.jar deleted file mode 100644 index a226c21b..00000000 Binary files a/ocrmypdf/jhove/lib/PngModule.jar and /dev/null differ diff --git a/ocrmypdf/jhove/md5.pl b/ocrmypdf/jhove/md5.pl deleted file mode 100644 index 1695db78..00000000 --- a/ocrmypdf/jhove/md5.pl +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/perl -w - -# Generate MD5 checksum - -use Digest::MD5; - -die "usage: md5.pl file\n" if $#ARGV < 0; - -open (FILE, "<$ARGV[0]") or die "can't open file\"$ARGV[0]\"!\n"; -$ctx = Digest::MD5->new->addfile (*FILE); -close (FILE); -$digest = $ctx->hexdigest; - -print "$digest\n"; diff --git a/ocrmypdf/jhove/packagejhove.sh b/ocrmypdf/jhove/packagejhove.sh deleted file mode 100644 index 0afb0612..00000000 --- a/ocrmypdf/jhove/packagejhove.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/sh -#DO NOT RUN THIS ON A DEVELOPEMENT DIRECTORY, ONLY ON A -#CHECKED-OUT COPY TO BE PACKAGED! - -if [ "$1" = "" ]; then - echo "Usage: packagejhove.sh [version]" - echo "e.g., packagejhove.sh 1_8" - exit 1 -fi - -echo "This script will prepare your directory for uploading." -echo "DO NOT RUN IT unless you're a developer and know what" -echo "you are doing. " - -echo "Start in the top-level directory of the JHOVE checkout." 
-echo "Run ant and ant javadoc and do any necessary testing and" -echo "committing before running this script." -echo -echo - -echo "To continue, enter the secret phrase." -read OATH -if [ "$OATH" != "I solemnly swear that I am up to no good" ]; then - exit 1 -fi - -cd .. -cat >>CVSS << EOF -CVS -.cvsignore -EOF -tar cvfX jhove-$1.tar CVSS jhove -gzip jhove-$1.tar -cp -r jhove jhove-zip -cd jhove-zip -find . \( -name CVS -o -name .cvsignore \) -exec rm -r {} \; -cd .. -mv jhove jhove-ok -mv jhove-zip jhove -zip -r jhove-$1.zip jhove -rm -r jhove -mv jhove-ok jhove -jhove/md5.pl jhove-$1.tar.gz >jhove-$1.tar.gz.md5 -jhove/md5.pl jhove-$1.zip >jhove-$1.zip.md5 \ No newline at end of file diff --git a/ocrmypdf/jhove/pdump b/ocrmypdf/jhove/pdump deleted file mode 100644 index 34e08ef0..00000000 --- a/ocrmypdf/jhove/pdump +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/sh - -######################################################################## -# pdump - JSTOR/Harvard Object Validation Environment -# Copyright 2003-2005 by JSTOR and the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Driver script for the PDF dump utility -# -# Usage: pdump file -# -# where file is a PDF file -# -# Configuration constants: - -JHOVE_HOME=/users/stephen/projects/jhove - -JAVA_HOME=/usr/java # Java JRE directory -JAVA=$JAVA_HOME/bin/java # Java interpreter - -EXTRA_JARS= # Extra .jar files to add to CLASSPATH - -# NOTE: Nothing below this line should be edited -######################################################################## - -CP=${JHOVE_HOME}/bin/JhoveApp.jar:${EXTRA_JARS} - -# Retrieve a copy of all command line arguments to pass to the application. - -ARGS="" -for ARG do - ARGS="$ARGS $ARG" -done - -# Set the CLASSPATH and invoke the Java loader. 
-${JAVA} -classpath $CP PDump $ARGS diff --git a/ocrmypdf/jhove/userhome b/ocrmypdf/jhove/userhome deleted file mode 100644 index 3dd97474..00000000 --- a/ocrmypdf/jhove/userhome +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh - -######################################################################## -# userhome - JSTOR/Harvard Object Validation Environment -# Copyright 2004-2006 by the President and Fellows of Harvard College -# JHOVE is made available under the GNU General Public License (see the -# file LICENSE for details) -# -# Driver script to display the default Java user.home property -# -# Usage: userhome -# -# Configuration constants: - -JHOVE_HOME=/users/stephen/projects/jhove - -JAVA_HOME=/usr/java # Java JRE directory -JAVA=$JAVA_HOME/bin/java # Java interpreter - -# NOTE: Nothing below this line should be edited -######################################################################## - -# Set the CLASSPATH and invoke the Java loader. -${JAVA} -classpath ${JHOVE_HOME}/classes UserHome diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 17a058a7..490ced21 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -777,40 +777,28 @@ def validate_pdfa( input_file, log): - args_jhove = [ - 'java', - '-jar', JHOVE_JAR, - '-c', JHOVE_CFG, - '-m', 'PDF-hul', + args_qpdf = [ + 'qpdf', + '--check', input_file ] - p_jhove = Popen(args_jhove, close_fds=True, universal_newlines=True, - stdout=PIPE, stderr=DEVNULL) - stdout, _ = p_jhove.communicate() - log.debug(stdout) - if p_jhove.returncode != 0: - log.error(stdout) - raise RuntimeError( - "Unexpected error while checking compliance to PDF/A file.") + try: + check_output(args_qpdf, stderr=STDOUT, universal_newlines=True) + except CalledProcessError as e: + if e.returncode == 2: + print("{0}: not a valid PDF, and could not repair it.".format( + options.input_file)) + print("Details:") + print(e.output) + elif e.returncode == 3: + log.info("qpdf --check returned warnings:") + log.info(e.output) + else: + 
print(e.output) + return False - pdf_is_valid = True - if re.search(r'ErrorMessage', stdout, - re.IGNORECASE | re.MULTILINE): - pdf_is_valid = False - if re.search(r'^\s+Status.*not valid', stdout, - re.IGNORECASE | re.MULTILINE): - pdf_is_valid = False - if re.search(r'^\s+Status.*Not well-formed', stdout, - re.IGNORECASE | re.MULTILINE): - pdf_is_valid = False - - pdf_is_pdfa = False - if re.search(r'^\s+Profile:.*PDF/A-1', stdout, - re.IGNORECASE | re.MULTILINE): - pdf_is_pdfa = True - - return (pdf_is_valid, pdf_is_pdfa) + return True # @active_if(ocr_required and options.exact_image) @@ -867,21 +855,11 @@ def run_pipeline(): exc_value, {'ExitCode': ExitCode}, {'exc_value': exc_value}) - pdf_is_valid, pdf_is_pdfa = validate_pdfa(options.output_file, _log) - - returncode = ExitCode.other_error # Assume error - - if not pdf_is_valid: + if not validate_pdfa(options.output_file, _log): _log.warning('Output file: The generated PDF/A file is INVALID') - returncode = ExitCode.invalid_output_pdfa - elif pdf_is_valid and not pdf_is_pdfa: - _log.warning('Output file: Generated file is VALID PDF but not PDF/A') - returncode = ExitCode.invalid_output_pdfa - elif pdf_is_valid and pdf_is_pdfa: - _log.info('Output file: The generated PDF/A file is VALID') - returncode = 0 + return ExitCode.invalid_output_pdfa - return returncode + return ExitCode.ok if __name__ == '__main__':