From 648ec33b4446e7aa33bce3e547737f04b508e90d Mon Sep 17 00:00:00 2001 From: amadeusz-ds <165173689+amadeusz-ds@users.noreply.github.com> Date: Thu, 9 May 2024 10:01:08 +0200 Subject: [PATCH] feat(evaluation): skip accuracy calculation (#2977) Skip accuracy calculation for files for which output and ground truth sizes differ greatly. ~10% speed up on local machine, keeping the same metrics. --------- Co-authored-by: cragwolfe --- CHANGELOG.md | 1 + unstructured/metrics/evaluate.py | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 64f51d9f9..85c9d6a29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ * **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting. * **Extract OCRAgent.get_agent().** Generalize access to the configured OCRAgent instance beyond its use for PDFs. * **Add calculation of table related metrics which take into account colspans and rowspans** +* **Evaluation: skip accuracy calculation** for files for which output and ground truth sizes differ greatly ### Features diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 2b3900a5c..dd0062397 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -116,7 +116,13 @@ def measure_text_extraction_accuracy( except Exception: # if any of the output/source file is unable to open, skip the loop continue - accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3) + # NOTE(amadeusz): Levenshtein distance calculation takes too long + # skip it if file sizes differ wildly + if 0.5 < len(output_cct.encode()) / len(source_cct.encode()) < 2.0: + accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3) + else: + # 0.01 to distinguish it was set manually + accuracy = 0.01 percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3) rows.append([filename, doctype, connector, accuracy, percent_missing])