mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	This change updates the `measure_text_extraction_accuracy` function to accept a directory of txt files as well as json files. The function also takes an `output_type` input, which must be either "json" or "txt", and checks whether the files under the given directory/list contain only the specified file type. To test this feature, run the following command: ```PYTHONPATH=. python unstructured/ingest/evaluate.py measure-text-extraction-accuracy-command --output_dir <clean-text-path> --source_dir <cct-label-path> --output_type txt```
		
			
				
	
	
		
			36 lines
		
	
	
		
			925 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			36 lines
		
	
	
		
			925 B
		
	
	
	
		
			Python
		
	
	
	
	
	
import pytest
 | 
						|
 | 
						|
from unstructured.metrics.utils import (
 | 
						|
    _mean,
 | 
						|
    _pstdev,
 | 
						|
    _stdev,
 | 
						|
    _uniquity_file,
 | 
						|
)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    ("numbers", "expected_mean", "expected_stdev", "expected_pstdev"),
    [
        ([2, 5, 6, 7], 5, 2.16, 1.871),
        ([1, 100], 50.5, 70.004, 49.5),
        # A single value still has a mean, but both deviations are expected to be None.
        ([1], 1, None, None),
        # Empty input: every statistic is expected to be None.
        ([], None, None, None),
    ],
)
def test_stats(numbers, expected_mean, expected_stdev, expected_pstdev):
    """Check `_mean`, `_stdev` and `_pstdev` against known results for `numbers`.

    Each parametrized case supplies a list of numbers together with the value
    each helper is expected to return for it. Results are compared with exact
    equality, so the helpers must return precisely the listed figures
    (including `None` where no value is expected).
    """
    assert _mean(numbers) == expected_mean
    assert _stdev(numbers) == expected_stdev
    assert _pstdev(numbers) == expected_pstdev
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "filenames",
    [("filename.ext", "filename (1).ext", "randomfile.ext", "filename.txt", "filename (5).txt")],
)
def test_uniquity_file(filenames):
    """Check the name `_uniquity_file` picks for a target that already exists.

    The existing names include `filename.ext` and `filename (1).ext`, so for
    the target `filename.ext` the expected result is `filename (2).ext`.
    The list also holds an unrelated name and `.txt` names with the same
    stem; the expected value shows these must not change the chosen suffix.
    """
    final_filename = _uniquity_file(filenames, "filename.ext")
    assert final_filename == "filename (2).ext"
 |