Test case: TrueType font without Unicode mapping

This commit is contained in:
James R. Barlow 2018-11-15 16:22:53 -08:00
parent 622f2c4bab
commit d3b334c10f
4 changed files with 22 additions and 4 deletions

View File

@ -588,12 +588,12 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
return pageinfo
def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, log=None):
if not log:
log = Mock()
pdf = pikepdf.open(infile)
if detailed_page_analysis:
if detailed_analysis:
pages_xml = None
else:
pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)
@ -601,17 +601,18 @@ def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
pages = []
for n in range(len(pdf.pages)):
page_xml = pages_xml[n] if pages_xml else None
page = PageInfo(pdf, n, infile, page_xml)
page = PageInfo(pdf, n, infile, page_xml, detailed_analysis)
pages.append(page)
return pages, pdf
class PageInfo:
def __init__(self, pdf, pageno, infile, xmltext):
def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False):
self._pageno = pageno
self._infile = infile
self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext)
self._detailed_analysis = detailed_analysis
@property
def pageno(self):
@ -623,6 +624,8 @@ class PageInfo:
@property
def has_corrupt_text(self):
if not self._detailed_analysis:
raise NotImplementedError('Did not do detailed analysis')
return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes'])
@property

View File

@ -139,6 +139,11 @@ licensed under the specified license.
- @jbarlow83
- @jbarlow83
- CC-BY-SA 4.0
* - truetype_font_nomapping.pdf
- example of a PDF with an embedded subsetted TrueType font with no Unicode mapping
- @jbarlow83
- @jbarlow83
- CC-BY-SA 4.0
* - trivial.pdf
- smallest possible valid PDF-1.3 with all required fields
- @jbarlow83

Binary file not shown.

View File

@ -183,3 +183,13 @@ def test_ocr_detection(resources):
pdf = pdfinfo.PdfInfo(filename)
assert not pdf[0].has_vector
assert pdf[0].has_text
def test_corrupt_font_detection(resources):
    """Check corrupt-text detection on a font that has no Unicode mapping.

    The fixture ``truetype_font_nomapping.pdf`` is opened twice:

    * without detailed analysis, where reading ``has_corrupt_text`` is
      expected to raise ``NotImplementedError``;
    * with ``detailed_page_analysis=True``, where the first page is
      expected to report corrupt text.
    """
    filename = resources / 'truetype_font_nomapping.pdf'

    # Build the PdfInfo outside the ``raises`` block so that only the
    # property access can satisfy the expected exception; a failure while
    # opening or parsing the file must surface as a test error instead.
    pdf = pdfinfo.PdfInfo(filename)
    with pytest.raises(NotImplementedError):
        pdf[0].has_corrupt_text

    pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True)
    assert pdf[0].has_corrupt_text