Klaijan e65a44eabb
feat: update cct eval for text dir (#2299)
This change edits the `measure_text_extraction_accuracy` function
to allow a directory of txt files as well as json. The function also takes an
`output_type` input, which must be either "json" or "txt", and checks whether the
files under the given directory/list are all of the specified file type.

To test this feature, run the following code:

```PYTHONPATH=. python unstructured/ingest/evaluate.py measure-text-extraction-accuracy-command --output_dir <clean-text-path> --source_dir <cct-label-path> --output_type txt```
2024-01-05 23:34:53 +00:00

36 lines
925 B
Python

import pytest
from unstructured.metrics.utils import (
_mean,
_pstdev,
_stdev,
_uniquity_file,
)
@pytest.mark.parametrize(
    ("numbers", "expected_mean", "expected_stdev", "expected_pstdev"),
    [
        # Multi-element inputs: all three statistics have numeric expectations.
        ([2, 5, 6, 7], 5, 2.16, 1.871),
        ([1, 100], 50.5, 70.004, 49.5),
        # Single element: a mean is expected, both deviations are expected to be None.
        ([1], 1, None, None),
        # Empty input: every statistic is expected to be None.
        ([], None, None, None),
    ],
)
def test_stats(numbers, expected_mean, expected_stdev, expected_pstdev):
    """Check `_mean`, `_stdev` and `_pstdev` against the expected values.

    Each parametrized case supplies an input list together with the exact
    value each helper is expected to return for it.
    """
    # Evaluate all three helpers up front, before any assertion runs.
    actual = (_mean(numbers), _stdev(numbers), _pstdev(numbers))
    expected = (expected_mean, expected_stdev, expected_pstdev)

    # Compare in mean -> stdev -> pstdev order; the first mismatch fails the test.
    for got, want in zip(actual, expected):
        assert got == want
@pytest.mark.parametrize(
    "filenames",
    [
        # One case: the whole tuple of names is passed as `filenames`.
        (
            "filename.ext",
            "filename (1).ext",
            "randomfile.ext",
            "filename.txt",
            "filename (5).txt",
        ),
    ],
)
def test_uniquity_file(filenames):
    """Check the name `_uniquity_file` returns for the target "filename.ext".

    With the parametrized list of existing names, the expected result is
    "filename (2).ext".
    """
    # NOTE(review): presumably " (2)" is chosen because "filename.ext" and
    # "filename (1).ext" are already present -- confirm against `_uniquity_file`.
    assert _uniquity_file(filenames, "filename.ext") == "filename (2).ext"