enhancement: file detection for .wav files (#2387)

### Summary

Adds filetype detection for `.wav` audio files

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype

filename = "example-docs/CantinaBand3.wav"
detect_filetype(filename=filename) # Should be FileType.WAV
```
This commit is contained in:
Matt Robinson 2024-01-15 11:50:49 -05:00 committed by GitHub
parent d7980b3665
commit 36faf677c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 21 additions and 1 deletions

View File

@ -2,6 +2,7 @@
### Enhancements
* **Add filetype detection for `.wav` files.** Add filetype detection for `.wav` files.
* **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries.
* **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks.
* **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-file` has been renamed to `private-key`. Private key can be provided as path to file or file contents.

Binary file not shown.

View File

@ -445,3 +445,12 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
def test_detect_filetype_from_octet_stream(filename="example-docs/emoji.xlsx"):
    """Check that the octet-stream detector classifies the sample workbook as XLSX.

    The file is opened in binary mode and the open handle is passed to
    ``_detect_filetype_from_octet_stream``.
    """
    with open(filename, "rb") as stream:
        detected = _detect_filetype_from_octet_stream(file=stream)
        assert detected == FileType.XLSX
def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
    """Check that ``detect_filetype`` returns ``FileType.WAV`` when given the sample path."""
    detected = detect_filetype(filename=filename)
    assert detected == FileType.WAV
def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
    """Check that ``detect_filetype`` returns ``FileType.WAV`` when given an open handle.

    The sample is opened in binary mode and passed via the ``file`` keyword
    rather than by path.
    """
    with open(filename, "rb") as wav_stream:
        detected = detect_filetype(file=wav_stream)
        assert detected == FileType.WAV

View File

@ -730,7 +730,7 @@ FILETYPE_TO_MODULE = {
@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.EMPTY):
if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.WAV, FileType.EMPTY):
pytest.skip()
extension = filetype.name.lower()
filetype_module = (

View File

@ -97,6 +97,9 @@ class FileType(enum.Enum):
# Open Office Types
ODT = 70
# Audio Files
WAV = 80
# NOTE(robinson) - This is to support sorting for pandas groupby functions
def __lt__(self, other):
return self.name < other.name
@ -136,6 +139,12 @@ STR_TO_FILETYPE = {
"message/rfc822": FileType.EML,
"application/x-ole-storage": FileType.MSG,
"application/vnd.ms-outlook": FileType.MSG,
# NOTE(robinson) - https://mimetype.io/audio/wav
"audio/vnd.wav": FileType.WAV,
"audio/vnd.wave": FileType.WAV,
"audio/wave": FileType.WAV,
"audio/x-pn-wav": FileType.WAV,
"audio/x-wav": FileType.WAV,
"inode/x-empty": FileType.EMPTY,
}
@ -182,6 +191,7 @@ EXT_TO_FILETYPE = {
".tsv": FileType.TSV,
".tab": FileType.TSV,
".tiff": FileType.TIFF,
".wav": FileType.WAV,
# NOTE(robinson) - for now we are treating code files as plain text
".js": FileType.TXT,
".py": FileType.TXT,