mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 f21c853ade
			
		
	
	
		f21c853ade
		
			
		
	
	
	
	
		
			
			Fix disk space leaks and Windows errors when accessing file.name on a NamedTemporaryFile. Uses of `NamedTemporaryFile(..., delete=False)` and/or of `file.name` on a NamedTemporaryFile have been replaced with `tempfile.TemporaryDirectory` to avoid a known issue: - https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile - https://github.com/Unstructured-IO/unstructured/issues/3390 The first 7 commits each address an individual occurrence of the issue, for reviewers who want to review commit-by-commit.
		
			
				
	
	
		
			55 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			55 lines
		
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import pathlib
 | |
| import tempfile
 | |
| from unittest.mock import patch
 | |
| 
 | |
| import pypandoc
 | |
| import pytest
 | |
| 
 | |
| from test_unstructured.unit_utils import FixtureRequest, example_doc_path, stdlib_fn_mock
 | |
| from unstructured.file_utils.file_conversion import (
 | |
|     convert_file_to_html_text_using_pandoc,
 | |
|     convert_file_to_text,
 | |
| )
 | |
| 
 | |
| DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | |
| 
 | |
| 
 | |
def test_convert_file_to_text():
    """Converting the EPUB example document to HTML yields text that starts with a `<p>` tag."""
    # -- Resolve the example document through the shared `example_doc_path` helper, the same
    # -- way the other tests in this module do, instead of hand-building a relative path.
    epub_path = example_doc_path("winter-sports.epub")

    html_text = convert_file_to_text(epub_path, source_format="epub", target_format="html")

    assert html_text.startswith("<p>")
 | |
| 
 | |
| 
 | |
def test_convert_to_file_raises_if_pandoc_not_available():
    """`convert_file_to_text` raises `FileNotFoundError` when `pypandoc.convert_file` does.

    `pypandoc.convert_file` is patched to raise `FileNotFoundError`, standing in for the
    situation named by this test (pandoc not being available).
    """
    # -- Resolve the example document through the shared `example_doc_path` helper, the same
    # -- way the other tests in this module do, instead of hand-building a relative path.
    epub_path = example_doc_path("winter-sports.epub")

    with patch.object(
        pypandoc, "convert_file", side_effect=FileNotFoundError
    ), pytest.raises(FileNotFoundError):
        convert_file_to_text(epub_path, source_format="epub", target_format="html")
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("source_format", "filename"),
    [
        ("epub", "winter-sports.epub"),
        ("org", "README.org"),
        ("rst", "README.rst"),
        ("rtf", "fake-doc.rtf"),
    ],
)
def test_convert_file_to_html_text_using_pandoc(
    request: FixtureRequest, tmp_path: pathlib.Path, source_format: str, filename: str
):
    """Converting an open file returns a `str`, leaves one entry in the tempdir, and exits it.

    `tempfile.TemporaryDirectory` is replaced by a mock whose `__enter__` hands back the real
    pytest-provided `tmp_path`, so the directory contents can be inspected after the call and
    the mock's `__exit__` can be checked.
    """
    # -- Swap `tempfile.TemporaryDirectory` for a mock and bind its context-manager object
    temporary_directory_ = stdlib_fn_mock(request, tempfile, "TemporaryDirectory")
    context_manager_ = temporary_directory_.return_value
    # -- Entering the mocked context manager yields the real tempdir
    context_manager_.__enter__.return_value = tmp_path

    with open(example_doc_path(filename), "rb") as source_file:
        html_text = convert_file_to_html_text_using_pandoc(
            file=source_file, source_format=source_format
        )

    assert isinstance(html_text, str)
    # -- Exactly one entry is present in the tempdir after the conversion
    tempdir_entries = list(tmp_path.iterdir())
    assert len(tempdir_entries) == 1
    # -- The context manager was exited exactly once
    context_manager_.__exit__.assert_called_once()
 |