mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 03:23:25 +00:00
fix: treat yaml files as plain text (#2446)
### Summary

Closes #2412. Adds support for YAML MIME types and treats them as plain text. This is in response to the `500` errors that the API currently returns when the MIME type is `text/yaml`.
This commit is contained in:
parent
9fea85dc21
commit
4613e52e11
@ -10,6 +10,8 @@
|
||||
* **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.
|
||||
|
||||
### Fixes
|
||||
* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
|
||||
files as text.
|
||||
* **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` the destination directory, since it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on the bucket root, and this method is called on `initialize` of the destination connector.
|
||||
* **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates it to point to the correct path.
|
||||
* **Fix uploading None values to Chroma and Pinecone.** Removes keys with `None` values before uploading to the Pinecone and Chroma destinations. Pins the Pinecone dependency.
|
||||
|
||||
@ -4,6 +4,7 @@ import zipfile
|
||||
|
||||
import magic
|
||||
import pytest
|
||||
import yaml
|
||||
from PIL import Image
|
||||
|
||||
from unstructured.file_utils import filetype
|
||||
@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
|
||||
def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
    """A WAV document supplied as an open binary handle is detected as ``FileType.WAV``."""
    with open(filename, "rb") as wav_handle:
        detected = detect_filetype(file=wav_handle)

    assert detected == FileType.WAV
|
||||
|
||||
|
||||
def test_detect_yaml_as_text_from_filename(tmpdir):
    """A YAML file referenced by path is detected as ``FileType.TXT``."""
    data = {"hi": "there", "this is": "yaml"}
    # Keep the fixture file inside the per-test temporary directory (not its
    # parent) so it cannot collide with files written by other tests.
    filename = os.path.join(str(tmpdir), "test.yaml")
    with open(filename, "w") as f:
        yaml.dump(data, f)

    assert detect_filetype(filename=filename) == FileType.TXT
|
||||
|
||||
|
||||
def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch):
    """A YAML file supplied as an open binary handle is detected as ``FileType.TXT``."""
    # NOTE(review): only ``magic.from_file`` is stubbed here, yet the assertion
    # below goes through ``detect_filetype(file=...)``. If that code path sniffs
    # the handle via a different libmagic entry point, this stub is never
    # exercised -- confirm against ``detect_filetype``.
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml")
    data = {"hi": "there", "this is": "yaml"}
    # Keep the fixture file inside the per-test temporary directory (not its
    # parent) so it cannot collide with files written by other tests.
    filename = os.path.join(str(tmpdir), "test.yaml")
    with open(filename, "w") as f:
        yaml.dump(data, f)

    with open(filename, "rb") as f:
        assert detect_filetype(file=f) == FileType.TXT
|
||||
|
||||
@ -114,6 +114,13 @@ STR_TO_FILETYPE = {
|
||||
"image/png": FileType.PNG,
|
||||
"image/tiff": FileType.TIFF,
|
||||
"image/bmp": FileType.BMP,
|
||||
# NOTE(robinson) - https://mimetype.io/application/yaml
|
||||
# In the future, we may have special processing for YAML
|
||||
# files instead of treating them as plaintext
|
||||
"application/yaml": FileType.TXT,
|
||||
"application/x-yaml": FileType.TXT,
|
||||
"text/x-yaml": FileType.TXT,
|
||||
"text/yaml": FileType.TXT,
|
||||
"text/plain": FileType.TXT,
|
||||
"text/x-csv": FileType.CSV,
|
||||
"application/csv": FileType.CSV,
|
||||
@ -209,6 +216,8 @@ EXT_TO_FILETYPE = {
|
||||
".swift": FileType.TXT,
|
||||
".ts": FileType.TXT,
|
||||
".go": FileType.TXT,
|
||||
".yaml": FileType.TXT,
|
||||
".yml": FileType.TXT,
|
||||
None: FileType.UNK,
|
||||
}
|
||||
|
||||
@ -349,7 +358,7 @@ def detect_filetype(
|
||||
return FileType.EML
|
||||
|
||||
if extension in PLAIN_TEXT_EXTENSIONS:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
return EXT_TO_FILETYPE.get(extension, FileType.UNK)
|
||||
|
||||
# Safety catch
|
||||
if mime_type in STR_TO_FILETYPE:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user