| 
									
										
										
										
											2023-01-11 12:40:50 -05:00
										 |  |  | import os | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  | import pathlib | 
					
						
							| 
									
										
										
										
											2023-01-11 12:40:50 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | import pandas as pd | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2023-01-11 12:40:50 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | from unstructured.file_utils import exploration | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  | from unstructured.file_utils.filetype import FileType | 
					
						
							|  |  |  | 
 | 
					
						
# Resolved path of the directory that contains this test module; the tests
# below join it with "test-file-contents.txt" to locate that fixture file.
DIRECTORY = pathlib.Path(__file__).parent.resolve()


# True when the path "/.dockerenv" exists. This module treats that as the
# sign of running inside a Docker container and uses the flag in
# ``pytest.mark.skipif`` to skip the filesystem-based tests there.
is_in_docker = os.path.exists("/.dockerenv")
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-01-11 12:40:50 -05:00
										 |  |  | def test_get_directory_file_info(tmpdir): | 
					
						
							|  |  |  |     file_info_test = os.path.join(tmpdir, "file_info_test") | 
					
						
							|  |  |  |     if not os.path.exists(file_info_test): | 
					
						
							|  |  |  |         os.mkdir(file_info_test) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     directory1 = os.path.join(file_info_test, "directory1") | 
					
						
							|  |  |  |     if not os.path.exists(directory1): | 
					
						
							|  |  |  |         os.mkdir(directory1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filename1 = os.path.join(directory1, "filename1.txt") | 
					
						
							|  |  |  |     with open(filename1, "w") as f: | 
					
						
							|  |  |  |         f.write("hello there!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     directory2 = os.path.join(file_info_test, "directory2") | 
					
						
							|  |  |  |     if not os.path.exists(directory2): | 
					
						
							|  |  |  |         os.mkdir(directory2) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filename2 = os.path.join(directory2, "filename2.txt") | 
					
						
							|  |  |  |     with open(filename2, "w") as f: | 
					
						
							|  |  |  |         f.write("hello there!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     file_info = exploration.get_directory_file_info(file_info_test) | 
					
						
							|  |  |  |     assert isinstance(file_info, pd.DataFrame) | 
					
						
							|  |  |  |     assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-26 09:26:06 -04:00
										 |  |  |     means = file_info.groupby("filetype").mean(numeric_only=True) | 
					
						
							| 
									
										
										
										
											2023-01-11 12:40:50 -05:00
										 |  |  |     assert means.columns.to_list() == ["filesize"] | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-10 14:52:25 -04:00
										 |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  | def test_get_file_info(tmpdir): | 
					
						
							|  |  |  |     file_info_test = os.path.join(tmpdir, "file_info_test") | 
					
						
							|  |  |  |     if not os.path.exists(file_info_test): | 
					
						
							|  |  |  |         os.mkdir(file_info_test) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     directory1 = os.path.join(file_info_test, "directory1") | 
					
						
							|  |  |  |     if not os.path.exists(directory1): | 
					
						
							|  |  |  |         os.mkdir(directory1) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filename1 = os.path.join(directory1, "filename1.txt") | 
					
						
							|  |  |  |     with open(filename1, "w") as f: | 
					
						
							|  |  |  |         f.write("hello there!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     directory2 = os.path.join(file_info_test, "directory2") | 
					
						
							|  |  |  |     if not os.path.exists(directory2): | 
					
						
							|  |  |  |         os.mkdir(directory2) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     filename2 = os.path.join(directory2, "filename2.txt") | 
					
						
							|  |  |  |     with open(filename2, "w") as f: | 
					
						
							|  |  |  |         f.write("hello there!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     file_info = exploration.get_file_info([filename1, filename2]) | 
					
						
							|  |  |  |     assert isinstance(file_info, pd.DataFrame) | 
					
						
							|  |  |  |     assert set(file_info["filename"].to_list()) == {"filename1.txt", "filename2.txt"} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-27 13:33:36 -04:00
										 |  |  |     means = file_info.groupby("filetype").mean(numeric_only=True) | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  |     assert means.columns.to_list() == ["filesize"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_get_file_info_from_file_contents(): | 
					
						
							|  |  |  |     file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(file_contents_filename) as f: | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  |         file_contents = [f.read()] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     file_info = exploration.get_file_info_from_file_contents( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         file_contents=file_contents, | 
					
						
							|  |  |  |         filenames=["test.eml"], | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  |     ) | 
					
						
							|  |  |  |     assert file_info.filetype[0] == FileType.EML | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_get_file_info_from_file_contents_raises_if_lists_no_equal(): | 
					
						
							|  |  |  |     file_contents_filename = os.path.join(DIRECTORY, "test-file-contents.txt") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(file_contents_filename) as f: | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  |         file_contents = [f.read()] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         exploration.get_file_info_from_file_contents( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |             file_contents=file_contents, | 
					
						
							|  |  |  |             filenames=["test.eml", "test2.eml"], | 
					
						
							| 
									
										
										
										
											2023-02-08 15:48:39 -05:00
										 |  |  |         ) |