enhancement: make tempfiles windows friendly (#3108)

### Summary

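Updates the handling of temporary files so that they work on Windows. On Windows, a file created with `tempfile.NamedTemporaryFile()` cannot be reopened by name while it is still open, so the temp file is now created with `delete=False` and removed explicitly once OCR processing has finished.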

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
Frederic Marvin Abraham 2024-06-17 19:28:48 +02:00 committed by GitHub
parent 2815226b54
commit 6220633d3f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 14 additions and 5 deletions

View File

@ -1,8 +1,9 @@
## 0.14.7-dev1
## 0.14.7-dev2
### Enhancements
* **Pull from `wolfi-base` image.** The amd64 image now pulls from the `unstructured` `wolfi-base` image to avoid duplication of dependency setup steps.
* **Fix Windows temp file.** Make the creation of a temp file in `unstructured/partition/pdf_image/ocr.py` Windows-compatible.
### Features

View File

@ -1 +1 @@
__version__ = "0.14.7-dev1" # pragma: no cover
__version__ = "0.14.7-dev2" # pragma: no cover

View File

@ -63,12 +63,16 @@ def process_data_with_ocr(
Returns:
DocumentLayout: The merged layout information obtained after OCR processing.
"""
with tempfile.NamedTemporaryFile() as tmp_file:
file_name = ""
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
data_bytes = data if isinstance(data, bytes) else data.read()
tmp_file.write(data_bytes)
tmp_file.flush()
file_name = tmp_file.name
try:
merged_layouts = process_file_with_ocr(
filename=tmp_file.name,
filename=file_name,
out_layout=out_layout,
extracted_layout=extracted_layout,
is_image=is_image,
@ -77,7 +81,11 @@ def process_data_with_ocr(
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
)
return merged_layouts
finally:
if os.path.isfile(file_name):
os.remove(file_name)
return merged_layouts
@requires_dependencies("unstructured_inference")