enhancement: file detection for .wav files (#2387)

### Summary

Adds filetype detection for `.wav` audio files.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype

filename = "example-docs/CantinaBand3.wav"
detect_filetype(filename=filename)  # Should be FileType.WAV
```
2025-06-27 02:30:08 +00:00 · 2024-01-15 11:50:49 -05:00 · 2024-01-15 11:50:49 -05:00 · 36faf677c0
commit 36faf677c0
parent d7980b3665
5 changed files with 21 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,7 @@

 ### Enhancements

+* **Add filetype detection for `.wav` files.** Add filetype detection for `.wav` files.
 * **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries.
 * **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks.
 * **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-file` has been renamed to `private-key`. Private key can be provided as path to file or file contents.
--- a/example-docs/CantinaBand3.wav
+++ b/example-docs/CantinaBand3.wav
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -445,3 +445,12 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
 def test_detect_filetype_from_octet_stream(filename="example-docs/emoji.xlsx"):
    with open(filename, "rb") as f:
        assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX
+
+
+def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
+    assert detect_filetype(filename=filename) == FileType.WAV
+
+
+def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
+    with open(filename, "rb") as f:
+        assert detect_filetype(file=f) == FileType.WAV
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -730,7 +730,7 @@ FILETYPE_TO_MODULE = {

@pytest.mark.parametrize("filetype", supported_filetypes)
 def test_file_specific_produces_correct_filetype(filetype: FileType):
-    if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.EMPTY):
+    if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.WAV, FileType.EMPTY):
        pytest.skip()
    extension = filetype.name.lower()
    filetype_module = (
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -97,6 +97,9 @@ class FileType(enum.Enum):
    # Open Office Types
    ODT = 70

+    # Audio Files
+    WAV = 80
+
    # NOTE(robinson) - This is to support sorting for pandas groupby functions
    def __lt__(self, other):
        return self.name < other.name
@ -136,6 +139,12 @@ STR_TO_FILETYPE = {
    "message/rfc822": FileType.EML,
    "application/x-ole-storage": FileType.MSG,
    "application/vnd.ms-outlook": FileType.MSG,
+    # NOTE(robinson) - https://mimetype.io/audio/wav
+    "audio/vnd.wav": FileType.WAV,
+    "audio/vnd.wave": FileType.WAV,
+    "audio/wave": FileType.WAV,
+    "audio/x-pn-wav": FileType.WAV,
+    "audio/x-wav": FileType.WAV,
    "inode/x-empty": FileType.EMPTY,
 }

@ -182,6 +191,7 @@ EXT_TO_FILETYPE = {
    ".tsv": FileType.TSV,
    ".tab": FileType.TSV,
    ".tiff": FileType.TIFF,
+    ".wav": FileType.WAV,
    # NOTE(robinson) - for now we are treating code files as plain text
    ".js": FileType.TXT,
    ".py": FileType.TXT,