Fix json bytes content type detection (#3941)

Fixes the order of content-type detection strategies for byte-encoded JSON.

Example
```
json_bytes = json.dumps([{"example": "data"}]).encode("utf-8")
file_buffer = io.BytesIO(json_bytes)
detect_filetype(file=file_buffer, metadata_file_path="filename.pdf") 
```

Before
PDF

Now
JSON
This commit is contained in:
Pluto 2025-03-07 11:33:33 +01:00 committed by GitHub
parent 961c8d5b11
commit 74b0647aa2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 317 additions and 260 deletions

View File

@ -1,3 +1,14 @@
## 0.16.25
### Enhancements
### Features
### Fixes
- **Fixes filetype detection for JSON passed as byte streams** - Filetype detection now prioritizes the magic MIME-type prediction over the file extension.
## 0.16.24
### Enhancements
@ -14,6 +25,7 @@
### Fixes
## 0.16.23
### Enhancements

View File

@ -2,14 +2,14 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile base.in
# pip-compile ./base.in
#
anyio==4.8.0
# via httpx
backoff==2.2.1
# via -r base.in
# via -r ./base.in
beautifulsoup4==4.13.3
# via -r base.in
# via -r ./base.in
certifi==2025.1.31
# via
# httpcore
@ -19,7 +19,7 @@ certifi==2025.1.31
cffi==1.17.1
# via cryptography
chardet==5.2.0
# via -r base.in
# via -r ./base.in
charset-normalizer==3.4.1
# via
# requests
@ -28,24 +28,24 @@ click==8.1.8
# via
# nltk
# python-oxmsg
cryptography==44.0.1
cryptography==44.0.2
# via unstructured-client
dataclasses-json==0.6.7
# via
# -r base.in
# -r ./base.in
# unstructured-client
deepdiff==8.2.0
deepdiff==8.3.0
# via unstructured-client
emoji==2.14.1
# via -r base.in
# via -r ./base.in
exceptiongroup==1.2.2
# via anyio
filetype==1.2.0
# via -r base.in
# via -r ./base.in
h11==0.14.0
# via httpcore
html5lib==1.1
# via -r base.in
# via -r ./base.in
httpcore==1.0.7
# via httpx
httpx==0.28.1
@ -61,9 +61,9 @@ joblib==1.4.2
jsonpath-python==1.0.6
# via unstructured-client
langdetect==1.0.9
# via -r base.in
# via -r ./base.in
lxml==5.3.1
# via -r base.in
# via -r ./base.in
marshmallow==3.26.1
# via
# dataclasses-json
@ -75,9 +75,9 @@ mypy-extensions==1.0.0
nest-asyncio==1.6.0
# via unstructured-client
nltk==3.9.1
# via -r base.in
# via -r ./base.in
numpy==1.26.4
# via -r base.in
# via -r ./base.in
olefile==0.47
# via python-oxmsg
orderly-set==5.3.0
@ -87,26 +87,26 @@ packaging==24.2
# marshmallow
# unstructured-client
psutil==7.0.0
# via -r base.in
# via -r ./base.in
pycparser==2.22
# via cffi
pypdf==5.3.0
pypdf==5.3.1
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client
python-iso639==2025.2.18
# via -r base.in
# via -r ./base.in
python-magic==0.4.27
# via -r base.in
# via -r ./base.in
python-oxmsg==0.0.2
# via -r base.in
rapidfuzz==3.12.1
# via -r base.in
# via -r ./base.in
rapidfuzz==3.12.2
# via -r ./base.in
regex==2024.11.6
# via nltk
requests==2.32.3
# via
# -r base.in
# -r ./base.in
# requests-toolbelt
# unstructured-client
requests-toolbelt==1.0.0
@ -123,11 +123,11 @@ soupsieve==2.6
# via beautifulsoup4
tqdm==4.67.1
# via
# -r base.in
# -r ./base.in
# nltk
typing-extensions==4.12.2
# via
# -r base.in
# -r ./base.in
# anyio
# beautifulsoup4
# pypdf
@ -140,14 +140,14 @@ typing-inspect==0.9.0
# unstructured-client
unstructured-client==0.25.9
# via
# -c ./deps/constraints.txt
# -r base.in
# -c ././deps/constraints.txt
# -r ./base.in
urllib3==1.26.20
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# requests
# unstructured-client
webencodings==0.5.1
# via html5lib
wrapt==1.17.2
# via -r base.in
# via -r ./base.in

View File

@ -6,6 +6,8 @@
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
weaviate-client>=3.26.7,<4.0.0
# TODO: Constraint due to multiple versions being installed during pip-compile
protobuf>=6.30.0
# TODO: Constraint due to multiple versions being installed during pip-compile
grpcio>=1.65.5
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)
tokenizers>=0.21,<0.22

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile dev.in
# pip-compile ./dev.in
#
build==1.2.2.post1
# via pip-tools
@ -10,48 +10,48 @@ cfgv==3.4.0
# via pre-commit
click==8.1.8
# via
# -c base.txt
# -c test.txt
# -c ./base.txt
# -c ./test.txt
# pip-tools
distlib==0.3.9
# via virtualenv
filelock==3.17.0
# via virtualenv
identify==2.6.7
identify==2.6.8
# via pre-commit
importlib-metadata==8.6.1
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# build
nodeenv==1.9.1
# via pre-commit
packaging==24.2
# via
# -c base.txt
# -c test.txt
# -c ./base.txt
# -c ./test.txt
# build
pip-tools==7.4.1
# via -r dev.in
# via -r ./dev.in
platformdirs==4.3.6
# via
# -c test.txt
# -c ./test.txt
# virtualenv
pre-commit==4.1.0
# via -r dev.in
# via -r ./dev.in
pyproject-hooks==1.2.0
# via
# build
# pip-tools
pyyaml==6.0.2
# via
# -c test.txt
# -c ./test.txt
# pre-commit
tomli==2.2.1
# via
# -c test.txt
# -c ./test.txt
# build
# pip-tools
virtualenv==20.29.2
virtualenv==20.29.3
# via pre-commit
wheel==0.45.1
# via pip-tools

View File

@ -2,23 +2,23 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-csv.in
# pip-compile ./extra-csv.in
#
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# pandas
pandas==2.2.3
# via -r extra-csv.in
# via -r ./extra-csv.in
python-dateutil==2.9.0.post0
# via
# -c base.txt
# -c ./base.txt
# pandas
pytz==2025.1
# via pandas
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# python-dateutil
tzdata==2025.1
# via pandas

View File

@ -2,15 +2,15 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-docx.in
# pip-compile ./extra-docx.in
#
lxml==5.3.1
# via
# -c base.txt
# -c ./base.txt
# python-docx
python-docx==1.1.2
# via -r extra-docx.in
# via -r ./extra-docx.in
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# python-docx

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-epub.in
# pip-compile ./extra-epub.in
#
pypandoc==1.15
# via -r extra-epub.in
# via -r ./extra-epub.in

View File

@ -2,13 +2,13 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-markdown.in
# pip-compile ./extra-markdown.in
#
importlib-metadata==8.6.1
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# markdown
markdown==3.7
# via -r extra-markdown.in
# via -r ./extra-markdown.in
zipp==3.21.0
# via importlib-metadata

View File

@ -2,17 +2,17 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-odt.in
# pip-compile ./extra-odt.in
#
lxml==5.3.1
# via
# -c base.txt
# -c ./base.txt
# python-docx
pypandoc==1.15
# via -r extra-odt.in
# via -r ./extra-odt.in
python-docx==1.1.2
# via -r extra-odt.in
# via -r ./extra-odt.in
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# python-docx

View File

@ -2,23 +2,23 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-paddleocr.in
# pip-compile ./extra-paddleocr.in
#
anyio==4.8.0
# via
# -c base.txt
# -c ./base.txt
# httpx
astor==0.8.1
# via paddlepaddle
certifi==2025.1.31
# via
# -c base.txt
# -c ./base.txt
# httpcore
# httpx
# requests
charset-normalizer==3.4.1
# via
# -c base.txt
# -c ./base.txt
# requests
contourpy==1.3.0
# via matplotlib
@ -26,29 +26,29 @@ cycler==0.12.1
# via matplotlib
cython==3.0.12
# via unstructured-paddleocr
decorator==5.1.1
decorator==5.2.1
# via paddlepaddle
exceptiongroup==1.2.2
# via
# -c base.txt
# -c ./base.txt
# anyio
fonttools==4.56.0
# via matplotlib
h11==0.14.0
# via
# -c base.txt
# -c ./base.txt
# httpcore
httpcore==1.0.7
# via
# -c base.txt
# -c ./base.txt
# httpx
httpx==0.28.1
# via
# -c base.txt
# -c ./base.txt
# paddlepaddle
idna==3.10
# via
# -c base.txt
# -c ./base.txt
# anyio
# httpx
# requests
@ -72,7 +72,7 @@ networkx==3.2.1
# scikit-image
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# contourpy
# imageio
# imgaug
@ -96,12 +96,12 @@ opt-einsum==3.3.0
# via paddlepaddle
packaging==24.2
# via
# -c base.txt
# -c ./base.txt
# lazy-loader
# matplotlib
# scikit-image
paddlepaddle==3.0.0b1
# via -r extra-paddleocr.in
# via -r ./extra-paddleocr.in
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==11.1.0
@ -113,25 +113,27 @@ pillow==11.1.0
# pdf2image
# scikit-image
# unstructured-paddleocr
protobuf==5.29.3
# via paddlepaddle
protobuf==6.30.0
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post6
# via unstructured-paddleocr
pyparsing==3.2.1
# via matplotlib
python-dateutil==2.9.0.post0
# via
# -c base.txt
# -c ./base.txt
# matplotlib
pyyaml==6.0.2
# via unstructured-paddleocr
rapidfuzz==3.12.1
rapidfuzz==3.12.2
# via
# -c base.txt
# -c ./base.txt
# unstructured-paddleocr
requests==2.32.3
# via
# -c base.txt
# -c ./base.txt
# unstructured-paddleocr
scikit-image==0.24.0
# via
@ -147,30 +149,30 @@ shapely==2.0.7
# unstructured-paddleocr
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# imgaug
# python-dateutil
sniffio==1.3.1
# via
# -c base.txt
# -c ./base.txt
# anyio
tifffile==2024.8.30
# via scikit-image
tqdm==4.67.1
# via
# -c base.txt
# -c ./base.txt
# unstructured-paddleocr
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# anyio
# paddlepaddle
unstructured-paddleocr==2.8.1.0
# via -r extra-paddleocr.in
# via -r ./extra-paddleocr.in
urllib3==1.26.20
# via
# -c ./deps/constraints.txt
# -c base.txt
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
zipp==3.21.0
# via importlib-resources

View File

@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-pandoc.in
# pip-compile ./extra-pandoc.in
#
pypandoc==1.15
# via -r extra-pandoc.in
# via -r ./extra-pandoc.in

View File

@ -2,39 +2,39 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-pdf-image.in
# pip-compile ./extra-pdf-image.in
#
antlr4-python3-runtime==4.9.3
# via omegaconf
cachetools==5.5.1
cachetools==5.5.2
# via google-auth
certifi==2025.1.31
# via
# -c base.txt
# -c ./base.txt
# requests
cffi==1.17.1
# via
# -c base.txt
# -c ./base.txt
# cryptography
charset-normalizer==3.4.1
# via
# -c base.txt
# -c ./base.txt
# pdfminer-six
# requests
coloredlogs==15.0.1
# via onnxruntime
contourpy==1.3.0
# via matplotlib
cryptography==44.0.1
cryptography==44.0.2
# via
# -c base.txt
# -c ./base.txt
# pdfminer-six
cycler==0.12.1
# via matplotlib
deprecated==1.2.18
# via pikepdf
effdet==0.4.1
# via -r extra-pdf-image.in
# via -r ./extra-pdf-image.in
filelock==3.17.0
# via
# huggingface-hub
@ -48,26 +48,26 @@ fsspec==2025.2.0
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.24.1
google-api-core[grpc]==2.8.0
# via google-cloud-vision
google-auth==2.38.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==3.10.0
# via -r extra-pdf-image.in
googleapis-common-protos==1.67.0
google-cloud-vision==2.7.2
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.56.1
# via
# google-api-core
# grpcio-status
grpcio==1.70.0
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.70.0
grpcio-status==1.62.3
# via google-api-core
huggingface-hub==0.28.1
huggingface-hub==0.29.2
# via
# timm
# tokenizers
@ -77,17 +77,17 @@ humanfriendly==10.0
# via coloredlogs
idna==3.10
# via
# -c base.txt
# -c ./base.txt
# requests
importlib-resources==6.5.2
# via matplotlib
jinja2==3.1.5
jinja2==3.1.6
# via torch
kiwisolver==1.4.7
# via matplotlib
lxml==5.3.1
# via
# -c base.txt
# -c ./base.txt
# pikepdf
markupsafe==3.0.2
# via jinja2
@ -101,7 +101,7 @@ networkx==3.2.1
# via torch
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# contourpy
# matplotlib
# onnx
@ -117,7 +117,7 @@ omegaconf==2.3.0
# via effdet
onnx==1.17.0
# via
# -r extra-pdf-image.in
# -r ./extra-pdf-image.in
# unstructured-inference
onnxruntime==1.19.2
# via unstructured-inference
@ -125,7 +125,7 @@ opencv-python==4.11.0.86
# via unstructured-inference
packaging==24.2
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# matplotlib
# onnxruntime
@ -135,15 +135,15 @@ packaging==24.2
pandas==2.2.3
# via unstructured-inference
pdf2image==1.17.0
# via -r extra-pdf-image.in
# via -r ./extra-pdf-image.in
pdfminer-six==20240706
# via
# -r extra-pdf-image.in
# -r ./extra-pdf-image.in
# unstructured-inference
pi-heif==0.21.0
# via -r extra-pdf-image.in
# via -r ./extra-pdf-image.in
pikepdf==9.5.2
# via -r extra-pdf-image.in
# via -r ./extra-pdf-image.in
pillow==11.1.0
# via
# matplotlib
@ -152,14 +152,12 @@ pillow==11.1.0
# pikepdf
# torchvision
# unstructured-pytesseract
proto-plus==1.26.0
proto-plus==1.20.4
# via google-cloud-vision
protobuf==6.30.0
# via
# -c ././deps/constraints.txt
# google-api-core
# google-cloud-vision
protobuf==5.29.3
# via
# google-api-core
# google-cloud-vision
# googleapis-common-protos
# grpcio-status
# onnx
@ -175,19 +173,19 @@ pycocotools==2.0.8
# via effdet
pycparser==2.22
# via
# -c base.txt
# -c ./base.txt
# cffi
pyparsing==3.2.1
# via matplotlib
pypdf==5.3.0
pypdf==5.3.1
# via
# -c base.txt
# -r extra-pdf-image.in
# -c ./base.txt
# -r ./extra-pdf-image.in
pypdfium2==4.30.1
# via unstructured-inference
python-dateutil==2.9.0.post0
# via
# -c base.txt
# -c ./base.txt
# matplotlib
# pandas
python-multipart==0.0.20
@ -200,23 +198,23 @@ pyyaml==6.0.2
# omegaconf
# timm
# transformers
rapidfuzz==3.12.1
rapidfuzz==3.12.2
# via
# -c base.txt
# -c ./base.txt
# unstructured-inference
regex==2024.11.6
# via
# -c base.txt
# -c ./base.txt
# transformers
requests==2.32.3
# via
# -c base.txt
# -c ./base.txt
# google-api-core
# huggingface-hub
# transformers
rsa==4.9
# via google-auth
safetensors==0.5.2
safetensors==0.5.3
# via
# timm
# transformers
@ -224,19 +222,19 @@ scipy==1.13.1
# via unstructured-inference
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# python-dateutil
sympy==1.13.1
# via
# onnxruntime
# torch
timm==1.0.14
timm==1.0.15
# via
# effdet
# unstructured-inference
tokenizers==0.21.0
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# transformers
torch==2.6.0
# via
@ -250,31 +248,31 @@ torchvision==0.21.0
# timm
tqdm==4.67.1
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# transformers
transformers==4.49.0
# via unstructured-inference
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# pypdf
# torch
tzdata==2025.1
# via pandas
unstructured-inference==0.8.7
# via -r extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r extra-pdf-image.in
unstructured-inference==0.8.9
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.15
# via -r ./extra-pdf-image.in
urllib3==1.26.20
# via
# -c ./deps/constraints.txt
# -c base.txt
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
wrapt==1.17.2
# via
# -c base.txt
# -c ./base.txt
# deprecated
zipp==3.21.0
# via importlib-resources

View File

@ -2,14 +2,14 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-pptx.in
# pip-compile ./extra-pptx.in
#
lxml==5.3.1
# via python-pptx
pillow==11.1.0
# via python-pptx
python-pptx==1.0.2
# via -r extra-pptx.in
# via -r ./extra-pptx.in
typing-extensions==4.12.2
# via python-pptx
xlsxwriter==3.2.2

View File

@ -2,31 +2,31 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile extra-xlsx.in
# pip-compile ./extra-xlsx.in
#
et-xmlfile==2.0.0
# via openpyxl
networkx==3.2.1
# via -r extra-xlsx.in
# via -r ./extra-xlsx.in
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# pandas
openpyxl==3.1.5
# via -r extra-xlsx.in
# via -r ./extra-xlsx.in
pandas==2.2.3
# via -r extra-xlsx.in
# via -r ./extra-xlsx.in
python-dateutil==2.9.0.post0
# via
# -c base.txt
# -c ./base.txt
# pandas
pytz==2025.1
# via pandas
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# python-dateutil
tzdata==2025.1
# via pandas
xlrd==2.0.1
# via -r extra-xlsx.in
# via -r ./extra-xlsx.in

View File

@ -2,19 +2,19 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile huggingface.in
# pip-compile ./huggingface.in
#
certifi==2025.1.31
# via
# -c base.txt
# -c ./base.txt
# requests
charset-normalizer==3.4.1
# via
# -c base.txt
# -c ./base.txt
# requests
click==8.1.8
# via
# -c base.txt
# -c ./base.txt
# sacremoses
filelock==3.17.0
# via
@ -25,24 +25,24 @@ fsspec==2025.2.0
# via
# huggingface-hub
# torch
huggingface-hub==0.28.1
huggingface-hub==0.29.2
# via
# tokenizers
# transformers
idna==3.10
# via
# -c base.txt
# -c ./base.txt
# requests
jinja2==3.1.5
jinja2==3.1.6
# via torch
joblib==1.4.2
# via
# -c base.txt
# -c ./base.txt
# sacremoses
langdetect==1.0.9
# via
# -c base.txt
# -r huggingface.in
# -c ./base.txt
# -r ./huggingface.in
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
@ -51,11 +51,11 @@ networkx==3.2.1
# via torch
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# transformers
packaging==24.2
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# transformers
pyyaml==6.0.2
@ -64,47 +64,47 @@ pyyaml==6.0.2
# transformers
regex==2024.11.6
# via
# -c base.txt
# -c ./base.txt
# sacremoses
# transformers
requests==2.32.3
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# transformers
sacremoses==0.1.1
# via -r huggingface.in
safetensors==0.5.2
# via -r ./huggingface.in
safetensors==0.5.3
# via transformers
sentencepiece==0.2.0
# via -r huggingface.in
# via -r ./huggingface.in
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# langdetect
sympy==1.13.1
# via torch
tokenizers==0.21.0
# via
# -c ./deps/constraints.txt
# -c ././deps/constraints.txt
# transformers
torch==2.6.0
# via -r huggingface.in
# via -r ./huggingface.in
tqdm==4.67.1
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# sacremoses
# transformers
transformers==4.49.0
# via -r huggingface.in
# via -r ./huggingface.in
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# huggingface-hub
# torch
urllib3==1.26.20
# via
# -c ./deps/constraints.txt
# -c base.txt
# -c ././deps/constraints.txt
# -c ./base.txt
# requests

View File

@ -2,46 +2,46 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile test.in
# pip-compile ./test.in
#
annotated-types==0.7.0
# via pydantic
anyio==4.8.0
# via
# -c base.txt
# -c ./base.txt
# httpx
appdirs==1.4.4
# via label-studio-sdk
argcomplete==3.5.3
argcomplete==3.6.0
# via datamodel-code-generator
attrs==25.1.0
# via
# jsonschema
# referencing
autoflake==2.3.1
# via -r test.in
# via -r ./test.in
black==25.1.0
# via
# -r test.in
# -r ./test.in
# datamodel-code-generator
certifi==2025.1.31
# via
# -c base.txt
# -c ./base.txt
# httpcore
# httpx
# requests
charset-normalizer==3.4.1
# via
# -c base.txt
# -c ./base.txt
# requests
click==8.1.8
# via
# -c base.txt
# -c ./base.txt
# black
# nltk
coverage[toml]==7.6.12
# via
# -r test.in
# -r ./test.in
# pytest-cov
datamodel-code-generator==0.26.1
# via label-studio-sdk
@ -51,40 +51,40 @@ email-validator==2.2.0
# via pydantic
exceptiongroup==1.2.2
# via
# -c base.txt
# -c ./base.txt
# anyio
# pytest
faker==36.1.1
faker==36.2.2
# via jsf
flake8==7.1.2
# via
# -r test.in
# -r ./test.in
# flake8-print
flake8-print==5.0.0
# via -r test.in
# via -r ./test.in
freezegun==1.5.1
# via -r test.in
# via -r ./test.in
genson==1.3.0
# via datamodel-code-generator
grpcio==1.70.0
# via
# -c ./deps/constraints.txt
# -r test.in
# -c ././deps/constraints.txt
# -r ./test.in
h11==0.14.0
# via
# -c base.txt
# -c ./base.txt
# httpcore
httpcore==1.0.7
# via
# -c base.txt
# -c ./base.txt
# httpx
httpx==0.28.1
# via
# -c base.txt
# -c ./base.txt
# label-studio-sdk
idna==3.10
# via
# -c base.txt
# -c ./base.txt
# anyio
# email-validator
# httpx
@ -98,11 +98,11 @@ iniconfig==2.0.0
# via pytest
isort==5.13.2
# via datamodel-code-generator
jinja2==3.1.5
jinja2==3.1.6
# via datamodel-code-generator
joblib==1.4.2
# via
# -c base.txt
# -c ./base.txt
# nltk
jsf==0.11.2
# via label-studio-sdk
@ -113,12 +113,12 @@ jsonschema==4.23.0
jsonschema-specifications==2024.10.1
# via jsonschema
label-studio-sdk==1.0.10
# via -r test.in
# via -r ./test.in
liccheck==0.9.2
# via -r test.in
# via -r ./test.in
lxml==5.3.1
# via
# -c base.txt
# -c ./base.txt
# label-studio-sdk
markupsafe==3.0.2
# via jinja2
@ -127,24 +127,24 @@ mccabe==0.7.0
multidict==6.1.0
# via yarl
mypy==1.15.0
# via -r test.in
# via -r ./test.in
mypy-extensions==1.0.0
# via
# -c base.txt
# -c ./base.txt
# black
# mypy
nltk==3.9.1
# via
# -c base.txt
# -c ./base.txt
# label-studio-sdk
numpy==1.26.4
# via
# -c base.txt
# -c ./base.txt
# label-studio-sdk
# pandas
packaging==24.2
# via
# -c base.txt
# -c ./base.txt
# black
# datamodel-code-generator
# pytest
@ -158,7 +158,7 @@ platformdirs==4.3.6
# via black
pluggy==1.5.0
# via pytest
propcache==0.2.1
propcache==0.3.0
# via yarl
pycodestyle==2.12.1
# via
@ -166,7 +166,7 @@ pycodestyle==2.12.1
# flake8-print
pydantic[email]==2.10.6
# via
# -r test.in
# -r ./test.in
# datamodel-code-generator
# jsf
# label-studio-sdk
@ -178,17 +178,17 @@ pyflakes==3.2.0
# via
# autoflake
# flake8
pytest==8.3.4
pytest==8.3.5
# via
# pytest-cov
# pytest-mock
pytest-cov==6.0.0
# via -r test.in
# via -r ./test.in
pytest-mock==3.14.0
# via -r test.in
# via -r ./test.in
python-dateutil==2.9.0.post0
# via
# -c base.txt
# -c ./base.txt
# freezegun
# pandas
pytz==2025.1
@ -203,35 +203,35 @@ referencing==0.36.2
# jsonschema-specifications
regex==2024.11.6
# via
# -c base.txt
# -c ./base.txt
# nltk
requests==2.32.3
# via
# -c base.txt
# -c ./base.txt
# label-studio-sdk
# requests-mock
# smart-open
requests-mock==1.12.1
# via label-studio-sdk
rpds-py==0.22.3
rpds-py==0.23.1
# via
# jsonschema
# referencing
rstr==3.2.2
# via jsf
ruff==0.9.6
# via -r test.in
ruff==0.9.9
# via -r ./test.in
semantic-version==2.10.0
# via liccheck
six==1.17.0
# via
# -c base.txt
# -c ./base.txt
# python-dateutil
smart-open[http]==7.1.0
# via jsf
sniffio==1.3.1
# via
# -c base.txt
# -c ./base.txt
# anyio
toml==0.10.2
# via
@ -246,21 +246,21 @@ tomli==2.2.1
# pytest
tqdm==4.67.1
# via
# -c base.txt
# -c ./base.txt
# nltk
types-click==7.1.8
# via -r test.in
# via -r ./test.in
types-markdown==3.7.0.20241204
# via -r test.in
# via -r ./test.in
types-requests==2.31.0.6
# via -r test.in
# via -r ./test.in
types-tabulate==0.9.0.20241207
# via -r test.in
# via -r ./test.in
types-urllib3==1.26.25.14
# via types-requests
typing-extensions==4.12.2
# via
# -c base.txt
# -c ./base.txt
# anyio
# black
# jsf
@ -278,15 +278,15 @@ ujson==5.10.0
# via label-studio-sdk
urllib3==1.26.20
# via
# -c ./deps/constraints.txt
# -c base.txt
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
# vcrpy
vcrpy==7.0.0
# via -r test.in
# via -r ./test.in
wrapt==1.17.2
# via
# -c base.txt
# -c ./base.txt
# smart-open
# vcrpy
xmljson==0.2.1

View File

@ -5,6 +5,7 @@
from __future__ import annotations
import io
import json
import os
import pytest
@ -944,3 +945,45 @@ class Describe_ZipFileDetector:
):
ctx = _FileTypeDetectionContext(example_doc_path(file_name))
assert _ZipFileDetector.file_type(ctx) is expected_value
def test_mimetype_magic_detection_is_used_before_filename_when_filetype_is_detected_for_json():
    """JSON bytes are reported as JSON even though the only filename hint says `.pdf`."""
    stream = io.BytesIO(json.dumps([{"example": "data"}]).encode("utf-8"))

    # -- misleading extension arrives via the `metadata_file_path` argument --
    assert detect_filetype(file=stream, metadata_file_path="filename.pdf") == FileType.JSON

    # -- misleading extension arrives via the `name` attribute of the same stream --
    stream.name = "filename.pdf"
    assert detect_filetype(file=stream) == FileType.JSON
def test_mimetype_magic_detection_is_used_before_filename_when_filetype_is_detected_for_ndjson():
    """NDJSON bytes are reported as NDJSON even though the only filename hint says `.pdf`."""
    records = [{"example": "data1"}, {"example": "data2"}, {"example": "data3"}]
    # -- one serialized object per line, each line newline-terminated --
    ndjson_text = "".join(json.dumps(record) + "\n" for record in records)
    stream = io.BytesIO(ndjson_text.encode("utf-8"))

    # -- misleading extension arrives via the `metadata_file_path` argument --
    assert detect_filetype(file=stream, metadata_file_path="filename.pdf") == FileType.NDJSON

    # -- misleading extension arrives via the `name` attribute of the same stream --
    stream.name = "filename.pdf"
    assert detect_filetype(file=stream) == FileType.NDJSON
def test_json_content_type_is_disambiguated_for_ndjson():
    """An asserted `application/json` content-type still resolves to NDJSON for NDJSON bytes."""
    records = [{"example": "data1"}, {"example": "data2"}, {"example": "data3"}]
    # -- one serialized object per line, each line newline-terminated --
    ndjson_text = "".join(json.dumps(record) + "\n" for record in records)
    stream = io.BytesIO(ndjson_text.encode("utf-8"))
    asserted_content_type = "application/json"

    # -- `.pdf` filename hint arrives via the `metadata_file_path` argument --
    detected = detect_filetype(
        file=stream, metadata_file_path="filename.pdf", content_type=asserted_content_type
    )
    assert detected == FileType.NDJSON

    # -- `.pdf` filename hint arrives via the `name` attribute of the same stream --
    stream.name = "filename.pdf"
    detected = detect_filetype(file=stream, content_type=asserted_content_type)
    assert detected == FileType.NDJSON

View File

@ -409,17 +409,17 @@ def test_auto_partition_json_from_file_preserves_original_elements():
assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)
def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
# NOTE(robinson) - This is unprocessable because it is not a list of dicts, per the
# Unstructured JSON serialization format
text = '{"hi": "there"}'
def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
text = '{"text": "hello", "type": "NarrativeText"}'
file_path = str(tmp_path / "unprocessable.json")
with open(file_path, "w") as f:
f.write(text)
with pytest.raises(ValueError, match="Detected a JSON file that does not conform to the Unst"):
partition(filename=file_path)
result = partition(filename=file_path)
assert len(result) == 1
assert isinstance(result[0], NarrativeText)
assert "hello" in result[0].text
# ================================================================================================

View File

@ -1 +1 @@
__version__ = "0.16.24" # pragma: no cover
__version__ = "0.16.25" # pragma: no cover

View File

@ -169,30 +169,31 @@ class _FileTypeDetector:
# -- accuracy. So start with binary types and only then consider an asserted content-type,
# -- generally as a last resort.
# -- strategy 1: most binary types can be detected with 100% accuracy --
if file_type := self._known_binary_file_type:
return file_type
if (
( # strategy 1: most binary types can be detected with 100% accuracy
predicted_file_type := self._known_binary_file_type
)
or ( # strategy 2: use content-type asserted by caller
predicted_file_type := self._file_type_from_content_type
)
or ( # strategy 3: guess MIME-type using libmagic and use that
predicted_file_type := self._file_type_from_guessed_mime_type
)
or ( # strategy 4: use filename-extension, like ".docx" -> FileType.DOCX
predicted_file_type := self._file_type_from_file_extension
)
):
result_file_type = predicted_file_type
else:
# give up and report FileType.UNK
result_file_type = FileType.UNK
# -- strategy 2: use content-type asserted by caller --
if file_type := self._file_type_from_content_type:
return file_type
if result_file_type == FileType.JSON:
# edge case where JSON/NDJSON content without file extension
# (magic lib can't distinguish them)
result_file_type = self._disambiguate_json_file_type
# -- strategy 3: guess MIME-type using libmagic and use that --
if file_type := self._file_type_from_guessed_mime_type:
return file_type
# -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
if file_type := self._file_type_from_file_extension:
return file_type
# -- strategy 5: edge case where JSON/NDJSON content without file extension --
if file_type := self._disambiguate_json_file_type:
return file_type
# -- strategy 6: give up and report FileType.UNK --
return FileType.UNK
# == STRATEGIES ============================================================
return result_file_type
@property
def _known_binary_file_type(self) -> FileType | None:

View File

@ -124,8 +124,7 @@ class FileType(enum.Enum):
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
`FileType` member or one of its alias MIME-types.
"""
if mime_type is None or mime_type == "application/json":
# application/json is ambiguous as it may point to JSON and NDJSON file types
if mime_type is None:
return None
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
# -- limitations on defining a class variable on an Enum.