mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

This change edits the `measure_text_extraction_accuracy` function so that it accepts a directory of txt files as well as json files. The function also takes an `output_type` input, which must be either "json" or "txt", and checks whether the files under the given directory/list contain only the specified file type. To test this feature, run the following command: ```PYTHONPATH=. python unstructured/ingest/evaluate.py measure-text-extraction-accuracy-command --output_dir <clean-text-path> --source_dir <cct-label-path> --output_type txt```
36 lines
925 B
Python
36 lines
925 B
Python
import pytest
|
|
|
|
from unstructured.metrics.utils import (
|
|
_mean,
|
|
_pstdev,
|
|
_stdev,
|
|
_uniquity_file,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("numbers", "expected_mean", "expected_stdev", "expected_pstdev"),
    [
        # Two or more values: all three statistics are expected to be defined.
        ([2, 5, 6, 7], 5, 2.16, 1.871),
        ([1, 100], 50.5, 70.004, 49.5),
        # A single value: a mean is expected, but no stdev/pstdev.
        ([1], 1, None, None),
        # No values: every statistic is expected to be None.
        ([], None, None, None),
    ],
)
def test_stats(numbers, expected_mean, expected_stdev, expected_pstdev):
    """Check `_mean`, `_stdev` and `_pstdev` against known values.

    Each helper is called with the same list of numbers and its result is
    compared for exact equality with the expected value.

    NOTE(review): the expected values carry at most three decimal places and
    are compared with `==`, so the helpers presumably round their results --
    confirm against `unstructured.metrics.utils`.
    """
    # Compute everything first so a raising helper fails the test before any
    # comparison is reported.
    actual_mean = _mean(numbers)
    actual_stdev = _stdev(numbers)
    actual_pstdev = _pstdev(numbers)

    assert actual_mean == expected_mean
    assert actual_stdev == expected_stdev
    assert actual_pstdev == expected_pstdev
|
|
|
|
|
|
@pytest.mark.parametrize(
    "filenames",
    [
        (
            "filename.ext",
            "filename (1).ext",
            "randomfile.ext",
            "filename.txt",
            "filename (5).txt",
        ),
    ],
)
def test_uniquity_file(filenames):
    """Check that `_uniquity_file` picks the next free numbered filename.

    The existing names already contain "filename.ext" and "filename (1).ext",
    so the name returned for "filename.ext" is expected to be
    "filename (2).ext". The ".txt" entries and "randomfile.ext" are expected
    not to influence the result.
    """
    deduplicated_name = _uniquity_file(filenames, "filename.ext")

    assert deduplicated_name == "filename (2).ext"
|