Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-05 12:27:55 +00:00
fix: treat yaml files as plain text (#2446)
### Summary

Closes #2412. Adds support for YAML MIME types and treats them as plain text. This addresses the `500` errors that the API currently returns when the MIME type is `text/yaml`.
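For orientation, a minimal sketch of the behavior this change enables, assuming a local `config.yaml` file exists (the filename is hypothetical; `detect_filetype` and `FileType` come from the `unstructured.file_utils.filetype` module edited below):

```python
from unstructured.file_utils.filetype import FileType, detect_filetype

# With this change, YAML MIME types ("text/yaml", "application/yaml",
# "application/x-yaml", "text/x-yaml") and the ".yaml"/".yml" extensions
# resolve to FileType.TXT, so YAML documents go through the plain-text
# handling path instead of producing an error.
assert detect_filetype(filename="config.yaml") == FileType.TXT  # hypothetical file
```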
This commit is contained in:
parent 9fea85dc21 · commit 4613e52e11
CHANGELOG.md
@@ -10,6 +10,8 @@
 * **Add Databricks Volumes destination connector** Databricks Volumes connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data to a Databricks Volumes storage service.
 
 ### Fixes
+* **Treat YAML files as text.** Adds YAML MIME types to the file detection code and treats those
+  files as text.
 * **Fix FSSpec destination connectors check_connection.** FSSpec destination connectors did not use `check_connection`. There was an error when trying to `ls` destination directory - it may not exist at the moment of connector creation. Now `check_connection` calls `ls` on bucket root and this method is called on `initialize` of destination connector.
 * **Fix databricks-volumes extra location.** `setup.py` is currently pointing to the wrong location for the databricks-volumes extra requirements. This results in errors when trying to build the wheel for unstructured. This change updates to point to the correct path.
 * **Fix uploading None values to Chroma and Pinecone.** Removes keys with None values with Pinecone and Chroma destinations. Pins Pinecone dependency
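The FSSpec `check_connection` entry above describes listing the bucket root when the destination connector initializes, since the destination directory may not exist yet. A rough standalone sketch of that idea, not the connector's actual code (the protocol and bucket name are hypothetical):

```python
import fsspec

def check_connection(protocol: str, bucket_root: str) -> None:
    # List the bucket root rather than the destination directory, which may
    # not exist yet when the connector is created; auth or access problems
    # surface here as an exception instead of failing later during writes.
    fs = fsspec.filesystem(protocol)
    fs.ls(bucket_root)

check_connection("s3", "my-bucket")  # hypothetical bucket; requires s3fs + credentials
```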
test_unstructured/file_utils/test_filetype.py
@@ -4,6 +4,7 @@ import zipfile
 
 import magic
 import pytest
+import yaml
 from PIL import Image
 
 from unstructured.file_utils import filetype
@@ -481,3 +482,23 @@ def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
 def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
     with open(filename, "rb") as f:
         assert detect_filetype(file=f) == FileType.WAV
+
+
+def test_detect_yaml_as_text_from_filename(tmpdir):
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    assert detect_filetype(filename=filename) == FileType.TXT
+
+
+def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml")
+    data = {"hi": "there", "this is": "yaml"}
+    filename = os.path.join(tmpdir.dirname, "test.yaml")
+    with open(filename, "w") as f:
+        yaml.dump(data, f)
+
+    with open(filename, "rb") as f:
+        assert detect_filetype(file=f) == FileType.TXT
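To exercise only the new YAML detection tests, something like the following works from the repository root (a sketch; it assumes the test file path shown in the hunk above and a dev environment with `pytest` installed):

```python
import pytest

# Select just the two YAML detection tests added in this diff.
pytest.main(["test_unstructured/file_utils/test_filetype.py", "-k", "yaml"])
```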
unstructured/file_utils/filetype.py
@@ -114,6 +114,13 @@ STR_TO_FILETYPE = {
     "image/png": FileType.PNG,
     "image/tiff": FileType.TIFF,
     "image/bmp": FileType.BMP,
+    # NOTE(robinson) - https://mimetype.io/application/yaml
+    # In the future, we may have special processing for YAML
+    # files instead of treating them as plaintext
+    "application/yaml": FileType.TXT,
+    "application/x-yaml": FileType.TXT,
+    "text/x-yaml": FileType.TXT,
+    "text/yaml": FileType.TXT,
     "text/plain": FileType.TXT,
     "text/x-csv": FileType.CSV,
     "application/csv": FileType.CSV,
@@ -209,6 +216,8 @@ EXT_TO_FILETYPE = {
     ".swift": FileType.TXT,
     ".ts": FileType.TXT,
     ".go": FileType.TXT,
+    ".yaml": FileType.TXT,
+    ".yml": FileType.TXT,
     None: FileType.UNK,
 }
 
@@ -349,7 +358,7 @@ def detect_filetype(
         return FileType.EML
 
     if extension in PLAIN_TEXT_EXTENSIONS:
-        return EXT_TO_FILETYPE.get(extension)
+        return EXT_TO_FILETYPE.get(extension, FileType.UNK)
 
     # Safety catch
     if mime_type in STR_TO_FILETYPE:
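A side note on the one-line `detect_filetype` change above: supplying `FileType.UNK` as the default keeps the lookup from silently returning `None` for an extension missing from the mapping. A minimal standalone illustration of the pattern, not the library's code:

```python
from enum import Enum, auto

class FileType(Enum):
    TXT = auto()
    UNK = auto()

EXT_TO_FILETYPE = {".yaml": FileType.TXT, ".yml": FileType.TXT}

# dict.get returns None for an unknown key unless a default is given;
# passing FileType.UNK keeps the result a FileType in every case.
assert EXT_TO_FILETYPE.get(".yaml", FileType.UNK) is FileType.TXT
assert EXT_TO_FILETYPE.get(".weird", FileType.UNK) is FileType.UNK
```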