mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-30 00:31:59 +00:00
Test case: TrueType font without Unicode mapping
This commit is contained in:
parent
622f2c4bab
commit
d3b334c10f
@ -588,12 +588,12 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
|
||||
return pageinfo
|
||||
|
||||
|
||||
def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
|
||||
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, log=None):
|
||||
if not log:
|
||||
log = Mock()
|
||||
|
||||
pdf = pikepdf.open(infile)
|
||||
if detailed_page_analysis:
|
||||
if detailed_analysis:
|
||||
pages_xml = None
|
||||
else:
|
||||
pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)
|
||||
@ -601,17 +601,18 @@ def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
|
||||
pages = []
|
||||
for n in range(len(pdf.pages)):
|
||||
page_xml = pages_xml[n] if pages_xml else None
|
||||
page = PageInfo(pdf, n, infile, page_xml)
|
||||
page = PageInfo(pdf, n, infile, page_xml, detailed_analysis)
|
||||
pages.append(page)
|
||||
|
||||
return pages, pdf
|
||||
|
||||
|
||||
class PageInfo:
|
||||
def __init__(self, pdf, pageno, infile, xmltext):
|
||||
def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False):
|
||||
self._pageno = pageno
|
||||
self._infile = infile
|
||||
self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext)
|
||||
self._detailed_analysis = detailed_analysis
|
||||
|
||||
@property
|
||||
def pageno(self):
|
||||
@ -623,6 +624,8 @@ class PageInfo:
|
||||
|
||||
@property
def has_corrupt_text(self):
    """Tell whether any text box recorded for this page is flagged corrupt.

    Raises:
        NotImplementedError: if this page was created without detailed
            analysis.
    """
    if self._detailed_analysis:
        # Stop at the first text box whose ``is_corrupt`` flag is truthy.
        for box in self._pageinfo['textboxes']:
            if box.is_corrupt:
                return True
        return False
    raise NotImplementedError('Did not do detailed analysis')
|
||||
|
||||
@property
|
||||
|
||||
@ -139,6 +139,11 @@ licensed under the specified license.
|
||||
- @jbarlow83
|
||||
- @jbarlow83
|
||||
- CC-BY-SA 4.0
|
||||
* - truetype_font_nomapping.pdf
|
||||
- example of a PDF with an embedded subsetted TrueType font with no Unicode mapping
|
||||
- @jbarlow83
|
||||
- @jbarlow83
|
||||
- CC-BY-SA 4.0
|
||||
* - trivial.pdf
|
||||
- smallest possible valid PDF-1.3 with all required fields
|
||||
- @jbarlow83
|
||||
|
||||
BIN
tests/resources/truetype_font_nomapping.pdf
Normal file
BIN
tests/resources/truetype_font_nomapping.pdf
Normal file
Binary file not shown.
@ -183,3 +183,13 @@ def test_ocr_detection(resources):
|
||||
pdf = pdfinfo.PdfInfo(filename)
|
||||
assert not pdf[0].has_vector
|
||||
assert pdf[0].has_text
|
||||
|
||||
|
||||
def test_corrupt_font_detection(resources):
    """Check corrupt-text detection on the TrueType no-Unicode-mapping sample.

    ``has_corrupt_text`` must raise ``NotImplementedError`` when the file is
    opened without detailed page analysis, and must report corrupt text for
    the sample once detailed page analysis is requested.
    """
    filename = resources / 'truetype_font_nomapping.pdf'

    # Build the PdfInfo outside the ``raises`` block so that only the property
    # access may raise; a NotImplementedError escaping from the constructor
    # would otherwise let the test pass for the wrong reason.
    pdf = pdfinfo.PdfInfo(filename)
    with pytest.raises(NotImplementedError):
        _ = pdf[0].has_corrupt_text

    pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True)
    assert pdf[0].has_corrupt_text
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user