mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-18 21:10:01 +00:00
feat(evaluation): skip accuracy calculation (#2977)
Skip the accuracy calculation for files whose output and ground-truth sizes differ greatly. This gives a ~10% speedup on a local machine while keeping the same metrics. --------- Co-authored-by: cragwolfe <crag@unstructured.io>
This commit is contained in:
parent
e15adb418b
commit
648ec33b44
@ -15,6 +15,7 @@
|
|||||||
* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
|
* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
|
||||||
* **Extract OCRAgent.get_agent().** Generalize access to the configured OCRAgent instance beyond its use for PDFs.
|
* **Extract OCRAgent.get_agent().** Generalize access to the configured OCRAgent instance beyond its use for PDFs.
|
||||||
* **Add calculation of table related metrics which take into account colspans and rowspans**
|
* **Add calculation of table related metrics which take into account colspans and rowspans**
|
||||||
|
* **Evaluation: skip accuracy calculation** for files whose output and ground truth sizes differ greatly (byte-size ratio outside 0.5–2.0); accuracy is reported as 0.01 for those files
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
@ -116,7 +116,13 @@ def measure_text_extraction_accuracy(
|
|||||||
except Exception:
|
except Exception:
|
||||||
# if any of the output/source file is unable to open, skip the loop
|
# if any of the output/source file is unable to open, skip the loop
|
||||||
continue
|
continue
|
||||||
|
# NOTE(amadeusz): Levenshtein distance calculation takes too long
|
||||||
|
# skip it if file sizes differ wildly
|
||||||
|
if 0.5 < len(output_cct.encode()) / len(source_cct.encode()) < 2.0:
|
||||||
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
|
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
|
||||||
|
else:
|
||||||
|
# 0.01 is a sentinel marking that accuracy was set manually, not computed
|
||||||
|
accuracy = 0.01
|
||||||
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
|
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
|
||||||
rows.append([filename, doctype, connector, accuracy, percent_missing])
|
rows.append([filename, doctype, connector, accuracy, percent_missing])
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user