mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
enhancement: file detection for .wav
files (#2387)
### Summary

Adds filetype detection for `.wav` audio files.

### Testing

```python
from unstructured.file_utils.filetype import detect_filetype

filename = "example-docs/CantinaBand3.wav"
detect_filetype(filename=filename)  # Should be FileType.WAV
```
This commit is contained in:
parent
d7980b3665
commit
36faf677c0
@ -2,6 +2,7 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Add filetype detection for `.wav` files.** Add filetype detection for `.wav` files.
|
||||
* **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries.
|
||||
* **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks.
|
||||
* **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-file` has been renamed to `private-key`. Private key can be provided as path to file or file contents.
|
||||
|
BIN
example-docs/CantinaBand3.wav
Normal file
BIN
example-docs/CantinaBand3.wav
Normal file
Binary file not shown.
@ -445,3 +445,12 @@ def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
|
||||
def test_detect_filetype_from_octet_stream(filename="example-docs/emoji.xlsx"):
|
||||
with open(filename, "rb") as f:
|
||||
assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX
|
||||
|
||||
|
||||
def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
|
||||
assert detect_filetype(filename=filename) == FileType.WAV
|
||||
|
||||
|
||||
def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
|
||||
with open(filename, "rb") as f:
|
||||
assert detect_filetype(file=f) == FileType.WAV
|
||||
|
@ -730,7 +730,7 @@ FILETYPE_TO_MODULE = {
|
||||
|
||||
@pytest.mark.parametrize("filetype", supported_filetypes)
|
||||
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.EMPTY):
|
||||
if filetype in (FileType.JPG, FileType.PNG, FileType.TIFF, FileType.WAV, FileType.EMPTY):
|
||||
pytest.skip()
|
||||
extension = filetype.name.lower()
|
||||
filetype_module = (
|
||||
|
@ -97,6 +97,9 @@ class FileType(enum.Enum):
|
||||
# Open Office Types
|
||||
ODT = 70
|
||||
|
||||
# Audio Files
|
||||
WAV = 80
|
||||
|
||||
# NOTE(robinson) - This is to support sorting for pandas groupby functions
|
||||
def __lt__(self, other):
|
||||
return self.name < other.name
|
||||
@ -136,6 +139,12 @@ STR_TO_FILETYPE = {
|
||||
"message/rfc822": FileType.EML,
|
||||
"application/x-ole-storage": FileType.MSG,
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
# NOTE(robinson) - https://mimetype.io/audio/wav
|
||||
"audio/vnd.wav": FileType.WAV,
|
||||
"audio/vnd.wave": FileType.WAV,
|
||||
"audio/wave": FileType.WAV,
|
||||
"audio/x-pn-wav": FileType.WAV,
|
||||
"audio/x-wav": FileType.WAV,
|
||||
"inode/x-empty": FileType.EMPTY,
|
||||
}
|
||||
|
||||
@ -182,6 +191,7 @@ EXT_TO_FILETYPE = {
|
||||
".tsv": FileType.TSV,
|
||||
".tab": FileType.TSV,
|
||||
".tiff": FileType.TIFF,
|
||||
".wav": FileType.WAV,
|
||||
# NOTE(robinson) - for now we are treating code files as plain text
|
||||
".js": FileType.TXT,
|
||||
".py": FileType.TXT,
|
||||
|
Loading…
x
Reference in New Issue
Block a user