mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 e65a44eabb
			
		
	
	
		e65a44eabb
		
			
		
	
	
	
	
		
			
			This change updates the `measure_text_extraction_accuracy` function to accept a directory of txt files as well as json files. The function also takes an `output_type` input, which must be either "json" or "txt", and checks whether the files under the given directory/list are all of the specified file type. To test this feature, run the following command: ```PYTHONPATH=. python unstructured/ingest/evaluate.py measure-text-extraction-accuracy-command --output_dir <clean-text-path> --source_dir <cct-label-path> --output_type txt```
		
			
				
	
	
		
			36 lines
		
	
	
		
			925 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			36 lines
		
	
	
		
			925 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| from unstructured.metrics.utils import (
 | |
|     _mean,
 | |
|     _pstdev,
 | |
|     _stdev,
 | |
|     _uniquity_file,
 | |
| )
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("numbers", "expected_mean", "expected_stdev", "expected_pstdev"),
    [
        ([2, 5, 6, 7], 5, 2.16, 1.871),
        ([1, 100], 50.5, 70.004, 49.5),
        ([1], 1, None, None),
        ([], None, None, None),
    ],
)
def test_stats(numbers, expected_mean, expected_stdev, expected_pstdev):
    """Check `_mean`, `_stdev` and `_pstdev` against the expected value for each input.

    The parametrized cases cover multi-element lists, a single-element list
    (where both deviation helpers are expected to return None) and an empty
    list (where all three helpers are expected to return None).
    """
    # Evaluate all three helpers first, in the same order as the expected columns,
    # so a failure in one comparison never hides an exception raised by another.
    computed = [helper(numbers) for helper in (_mean, _stdev, _pstdev)]
    wanted = [expected_mean, expected_stdev, expected_pstdev]

    for actual, expected in zip(computed, wanted):
        assert actual == expected
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    "filenames",
    [("filename.ext", "filename (1).ext", "randomfile.ext", "filename.txt", "filename (5).txt")],
)
def test_uniquity_file(filenames):
    """Check the name `_uniquity_file` returns for a target that already exists.

    The existing names already contain "filename.ext" and "filename (1).ext",
    so the expected result for the target "filename.ext" is "filename (2).ext".
    """
    assert _uniquity_file(filenames, "filename.ext") == "filename (2).ext"
 |