From 32322a9fe9ed3b241baacec54654ba04a4b0ab22 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 9 Feb 2025 13:55:54 -0800 Subject: [PATCH] Fix broken test_hocrtransform_matches_sandwich Expect word similarity rather than exact match. Difference appears to be due to quote styles. Thanks @QuLogic for reporting. --- tests/test_hocrtransform.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_hocrtransform.py b/tests/test_hocrtransform.py index edde6dfe..0d5d7fe1 100644 --- a/tests/test_hocrtransform.py +++ b/tests/test_hocrtransform.py @@ -84,13 +84,19 @@ def test_hocrtransform_matches_sandwich(resources, outdir): def clean(s): s = re.sub(r'\s+', ' ', s) words = s.split(' ') - return '\n'.join(sorted(words)) + return set(words) - hocr_txt = clean(text_from_pdf(outdir / 'hocr.pdf')) - tess_txt = clean(text_from_pdf(outdir / 'tess.pdf')) + hocr_words = clean(text_from_pdf(outdir / 'hocr.pdf')) + tess_words = clean(text_from_pdf(outdir / 'tess.pdf')) + + similarity = len(hocr_words & tess_words) / len(hocr_words | tess_words) # from pathlib import Path - # Path('hocr.txt').write_text(hocr_txt) - # Path('tess.txt').write_text(tess_txt) - assert hocr_txt == tess_txt + # Path('hocr.txt').write_text('\n'.join(sorted(hocr_words))) + # Path('tess.txt').write_text('\n'.join(sorted(tess_words))) + # Path('mismatch.txt').write_text( + # '\n'.join(sorted(hocr_words ^ tess_words)), encoding='utf8' + # ) + + assert similarity > 0.99