mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Fix json bytes content type detection (#3941)
Fixes the order of content type detection strategies for byte-encoded JSON. Example: ``` json_bytes = json.dumps([{"example": "data"}]).encode("utf-8") file_buffer = io.BytesIO(json_bytes) detect_filetype(file=file_buffer, metadata_file_path="filename.pdf") ``` Before this change the result was PDF; now it is JSON.
This commit is contained in:
parent
961c8d5b11
commit
74b0647aa2
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,14 @@
|
||||
## 0.16.25
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
- **Fixes filetype detection for jsons passed as byte streams** - Now it prioritizes magic mimetype prediction over file extension when detecting filetypes
|
||||
|
||||
|
||||
## 0.16.24
|
||||
|
||||
### Enhancements
|
||||
@ -14,6 +25,7 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
## 0.16.23
|
||||
|
||||
### Enhancements
|
||||
|
@ -2,14 +2,14 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile base.in
|
||||
# pip-compile ./base.in
|
||||
#
|
||||
anyio==4.8.0
|
||||
# via httpx
|
||||
backoff==2.2.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
beautifulsoup4==4.13.3
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
certifi==2025.1.31
|
||||
# via
|
||||
# httpcore
|
||||
@ -19,7 +19,7 @@ certifi==2025.1.31
|
||||
cffi==1.17.1
|
||||
# via cryptography
|
||||
chardet==5.2.0
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
charset-normalizer==3.4.1
|
||||
# via
|
||||
# requests
|
||||
@ -28,24 +28,24 @@ click==8.1.8
|
||||
# via
|
||||
# nltk
|
||||
# python-oxmsg
|
||||
cryptography==44.0.1
|
||||
cryptography==44.0.2
|
||||
# via unstructured-client
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
# -r base.in
|
||||
# -r ./base.in
|
||||
# unstructured-client
|
||||
deepdiff==8.2.0
|
||||
deepdiff==8.3.0
|
||||
# via unstructured-client
|
||||
emoji==2.14.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
exceptiongroup==1.2.2
|
||||
# via anyio
|
||||
filetype==1.2.0
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
html5lib==1.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
httpcore==1.0.7
|
||||
# via httpx
|
||||
httpx==0.28.1
|
||||
@ -61,9 +61,9 @@ joblib==1.4.2
|
||||
jsonpath-python==1.0.6
|
||||
# via unstructured-client
|
||||
langdetect==1.0.9
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
lxml==5.3.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
marshmallow==3.26.1
|
||||
# via
|
||||
# dataclasses-json
|
||||
@ -75,9 +75,9 @@ mypy-extensions==1.0.0
|
||||
nest-asyncio==1.6.0
|
||||
# via unstructured-client
|
||||
nltk==3.9.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
numpy==1.26.4
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
olefile==0.47
|
||||
# via python-oxmsg
|
||||
orderly-set==5.3.0
|
||||
@ -87,26 +87,26 @@ packaging==24.2
|
||||
# marshmallow
|
||||
# unstructured-client
|
||||
psutil==7.0.0
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pypdf==5.3.0
|
||||
pypdf==5.3.1
|
||||
# via unstructured-client
|
||||
python-dateutil==2.9.0.post0
|
||||
# via unstructured-client
|
||||
python-iso639==2025.2.18
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
python-magic==0.4.27
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
python-oxmsg==0.0.2
|
||||
# via -r base.in
|
||||
rapidfuzz==3.12.1
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
rapidfuzz==3.12.2
|
||||
# via -r ./base.in
|
||||
regex==2024.11.6
|
||||
# via nltk
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -r base.in
|
||||
# -r ./base.in
|
||||
# requests-toolbelt
|
||||
# unstructured-client
|
||||
requests-toolbelt==1.0.0
|
||||
@ -123,11 +123,11 @@ soupsieve==2.6
|
||||
# via beautifulsoup4
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -r base.in
|
||||
# -r ./base.in
|
||||
# nltk
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -r base.in
|
||||
# -r ./base.in
|
||||
# anyio
|
||||
# beautifulsoup4
|
||||
# pypdf
|
||||
@ -140,14 +140,14 @@ typing-inspect==0.9.0
|
||||
# unstructured-client
|
||||
unstructured-client==0.25.9
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -r base.in
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./base.in
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# requests
|
||||
# unstructured-client
|
||||
webencodings==0.5.1
|
||||
# via html5lib
|
||||
wrapt==1.17.2
|
||||
# via -r base.in
|
||||
# via -r ./base.in
|
||||
|
@ -6,6 +6,8 @@
|
||||
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
|
||||
weaviate-client>=3.26.7,<4.0.0
|
||||
# TODO: Constraint due to multiple versions being installed during pip-compile
|
||||
protobuf>=6.30.0
|
||||
# TODO: Constraint due to multiple versions being installed during pip-compile
|
||||
grpcio>=1.65.5
|
||||
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)
|
||||
tokenizers>=0.21,<0.22
|
||||
|
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile dev.in
|
||||
# pip-compile ./dev.in
|
||||
#
|
||||
build==1.2.2.post1
|
||||
# via pip-tools
|
||||
@ -10,48 +10,48 @@ cfgv==3.4.0
|
||||
# via pre-commit
|
||||
click==8.1.8
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c test.txt
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
# pip-tools
|
||||
distlib==0.3.9
|
||||
# via virtualenv
|
||||
filelock==3.17.0
|
||||
# via virtualenv
|
||||
identify==2.6.7
|
||||
identify==2.6.8
|
||||
# via pre-commit
|
||||
importlib-metadata==8.6.1
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# build
|
||||
nodeenv==1.9.1
|
||||
# via pre-commit
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c test.txt
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
# build
|
||||
pip-tools==7.4.1
|
||||
# via -r dev.in
|
||||
# via -r ./dev.in
|
||||
platformdirs==4.3.6
|
||||
# via
|
||||
# -c test.txt
|
||||
# -c ./test.txt
|
||||
# virtualenv
|
||||
pre-commit==4.1.0
|
||||
# via -r dev.in
|
||||
# via -r ./dev.in
|
||||
pyproject-hooks==1.2.0
|
||||
# via
|
||||
# build
|
||||
# pip-tools
|
||||
pyyaml==6.0.2
|
||||
# via
|
||||
# -c test.txt
|
||||
# -c ./test.txt
|
||||
# pre-commit
|
||||
tomli==2.2.1
|
||||
# via
|
||||
# -c test.txt
|
||||
# -c ./test.txt
|
||||
# build
|
||||
# pip-tools
|
||||
virtualenv==20.29.2
|
||||
virtualenv==20.29.3
|
||||
# via pre-commit
|
||||
wheel==0.45.1
|
||||
# via pip-tools
|
||||
|
@ -2,23 +2,23 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-csv.in
|
||||
# pip-compile ./extra-csv.in
|
||||
#
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
pandas==2.2.3
|
||||
# via -r extra-csv.in
|
||||
# via -r ./extra-csv.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
pytz==2025.1
|
||||
# via pandas
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
tzdata==2025.1
|
||||
# via pandas
|
||||
|
@ -2,15 +2,15 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-docx.in
|
||||
# pip-compile ./extra-docx.in
|
||||
#
|
||||
lxml==5.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
python-docx==1.1.2
|
||||
# via -r extra-docx.in
|
||||
# via -r ./extra-docx.in
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-epub.in
|
||||
# pip-compile ./extra-epub.in
|
||||
#
|
||||
pypandoc==1.15
|
||||
# via -r extra-epub.in
|
||||
# via -r ./extra-epub.in
|
||||
|
@ -2,13 +2,13 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-markdown.in
|
||||
# pip-compile ./extra-markdown.in
|
||||
#
|
||||
importlib-metadata==8.6.1
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# markdown
|
||||
markdown==3.7
|
||||
# via -r extra-markdown.in
|
||||
# via -r ./extra-markdown.in
|
||||
zipp==3.21.0
|
||||
# via importlib-metadata
|
||||
|
@ -2,17 +2,17 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-odt.in
|
||||
# pip-compile ./extra-odt.in
|
||||
#
|
||||
lxml==5.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
pypandoc==1.15
|
||||
# via -r extra-odt.in
|
||||
# via -r ./extra-odt.in
|
||||
python-docx==1.1.2
|
||||
# via -r extra-odt.in
|
||||
# via -r ./extra-odt.in
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -2,23 +2,23 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-paddleocr.in
|
||||
# pip-compile ./extra-paddleocr.in
|
||||
#
|
||||
anyio==4.8.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
astor==0.8.1
|
||||
# via paddlepaddle
|
||||
certifi==2025.1.31
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.4.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
@ -26,29 +26,29 @@ cycler==0.12.1
|
||||
# via matplotlib
|
||||
cython==3.0.12
|
||||
# via unstructured-paddleocr
|
||||
decorator==5.1.1
|
||||
decorator==5.2.1
|
||||
# via paddlepaddle
|
||||
exceptiongroup==1.2.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
fonttools==4.56.0
|
||||
# via matplotlib
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
httpcore==1.0.7
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
httpx==0.28.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# paddlepaddle
|
||||
idna==3.10
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
@ -72,7 +72,7 @@ networkx==3.2.1
|
||||
# scikit-image
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# contourpy
|
||||
# imageio
|
||||
# imgaug
|
||||
@ -96,12 +96,12 @@ opt-einsum==3.3.0
|
||||
# via paddlepaddle
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# lazy-loader
|
||||
# matplotlib
|
||||
# scikit-image
|
||||
paddlepaddle==3.0.0b1
|
||||
# via -r extra-paddleocr.in
|
||||
# via -r ./extra-paddleocr.in
|
||||
pdf2image==1.17.0
|
||||
# via unstructured-paddleocr
|
||||
pillow==11.1.0
|
||||
@ -113,25 +113,27 @@ pillow==11.1.0
|
||||
# pdf2image
|
||||
# scikit-image
|
||||
# unstructured-paddleocr
|
||||
protobuf==5.29.3
|
||||
# via paddlepaddle
|
||||
protobuf==6.30.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# paddlepaddle
|
||||
pyclipper==1.3.0.post6
|
||||
# via unstructured-paddleocr
|
||||
pyparsing==3.2.1
|
||||
# via matplotlib
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# matplotlib
|
||||
pyyaml==6.0.2
|
||||
# via unstructured-paddleocr
|
||||
rapidfuzz==3.12.1
|
||||
rapidfuzz==3.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
scikit-image==0.24.0
|
||||
# via
|
||||
@ -147,30 +149,30 @@ shapely==2.0.7
|
||||
# unstructured-paddleocr
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# imgaug
|
||||
# python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
tifffile==2024.8.30
|
||||
# via scikit-image
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# paddlepaddle
|
||||
unstructured-paddleocr==2.8.1.0
|
||||
# via -r extra-paddleocr.in
|
||||
# via -r ./extra-paddleocr.in
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c base.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
zipp==3.21.0
|
||||
# via importlib-resources
|
||||
|
@ -2,7 +2,7 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-pandoc.in
|
||||
# pip-compile ./extra-pandoc.in
|
||||
#
|
||||
pypandoc==1.15
|
||||
# via -r extra-pandoc.in
|
||||
# via -r ./extra-pandoc.in
|
||||
|
@ -2,39 +2,39 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-pdf-image.in
|
||||
# pip-compile ./extra-pdf-image.in
|
||||
#
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
cachetools==5.5.1
|
||||
cachetools==5.5.2
|
||||
# via google-auth
|
||||
certifi==2025.1.31
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
cffi==1.17.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# cryptography
|
||||
charset-normalizer==3.4.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
# requests
|
||||
coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cryptography==44.0.1
|
||||
cryptography==44.0.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
cycler==0.12.1
|
||||
# via matplotlib
|
||||
deprecated==1.2.18
|
||||
# via pikepdf
|
||||
effdet==0.4.1
|
||||
# via -r extra-pdf-image.in
|
||||
# via -r ./extra-pdf-image.in
|
||||
filelock==3.17.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
@ -48,26 +48,26 @@ fsspec==2025.2.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-api-core[grpc]==2.24.1
|
||||
google-api-core[grpc]==2.8.0
|
||||
# via google-cloud-vision
|
||||
google-auth==2.38.0
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
google-cloud-vision==3.10.0
|
||||
# via -r extra-pdf-image.in
|
||||
googleapis-common-protos==1.67.0
|
||||
google-cloud-vision==2.7.2
|
||||
# via -r ./extra-pdf-image.in
|
||||
googleapis-common-protos==1.56.1
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.70.0
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio-status==1.70.0
|
||||
grpcio-status==1.62.3
|
||||
# via google-api-core
|
||||
huggingface-hub==0.28.1
|
||||
huggingface-hub==0.29.2
|
||||
# via
|
||||
# timm
|
||||
# tokenizers
|
||||
@ -77,17 +77,17 @@ humanfriendly==10.0
|
||||
# via coloredlogs
|
||||
idna==3.10
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
importlib-resources==6.5.2
|
||||
# via matplotlib
|
||||
jinja2==3.1.5
|
||||
jinja2==3.1.6
|
||||
# via torch
|
||||
kiwisolver==1.4.7
|
||||
# via matplotlib
|
||||
lxml==5.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pikepdf
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
@ -101,7 +101,7 @@ networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# contourpy
|
||||
# matplotlib
|
||||
# onnx
|
||||
@ -117,7 +117,7 @@ omegaconf==2.3.0
|
||||
# via effdet
|
||||
onnx==1.17.0
|
||||
# via
|
||||
# -r extra-pdf-image.in
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
onnxruntime==1.19.2
|
||||
# via unstructured-inference
|
||||
@ -125,7 +125,7 @@ opencv-python==4.11.0.86
|
||||
# via unstructured-inference
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# matplotlib
|
||||
# onnxruntime
|
||||
@ -135,15 +135,15 @@ packaging==24.2
|
||||
pandas==2.2.3
|
||||
# via unstructured-inference
|
||||
pdf2image==1.17.0
|
||||
# via -r extra-pdf-image.in
|
||||
# via -r ./extra-pdf-image.in
|
||||
pdfminer-six==20240706
|
||||
# via
|
||||
# -r extra-pdf-image.in
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
pi-heif==0.21.0
|
||||
# via -r extra-pdf-image.in
|
||||
# via -r ./extra-pdf-image.in
|
||||
pikepdf==9.5.2
|
||||
# via -r extra-pdf-image.in
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==11.1.0
|
||||
# via
|
||||
# matplotlib
|
||||
@ -152,14 +152,12 @@ pillow==11.1.0
|
||||
# pikepdf
|
||||
# torchvision
|
||||
# unstructured-pytesseract
|
||||
proto-plus==1.26.0
|
||||
proto-plus==1.20.4
|
||||
# via google-cloud-vision
|
||||
protobuf==6.30.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
protobuf==5.29.3
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
# googleapis-common-protos
|
||||
# grpcio-status
|
||||
# onnx
|
||||
@ -175,19 +173,19 @@ pycocotools==2.0.8
|
||||
# via effdet
|
||||
pycparser==2.22
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# cffi
|
||||
pyparsing==3.2.1
|
||||
# via matplotlib
|
||||
pypdf==5.3.0
|
||||
pypdf==5.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -r extra-pdf-image.in
|
||||
# -c ./base.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
pypdfium2==4.30.1
|
||||
# via unstructured-inference
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# matplotlib
|
||||
# pandas
|
||||
python-multipart==0.0.20
|
||||
@ -200,23 +198,23 @@ pyyaml==6.0.2
|
||||
# omegaconf
|
||||
# timm
|
||||
# transformers
|
||||
rapidfuzz==3.12.1
|
||||
rapidfuzz==3.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# unstructured-inference
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# google-api-core
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
rsa==4.9
|
||||
# via google-auth
|
||||
safetensors==0.5.2
|
||||
safetensors==0.5.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
@ -224,19 +222,19 @@ scipy==1.13.1
|
||||
# via unstructured-inference
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
sympy==1.13.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==1.0.14
|
||||
timm==1.0.15
|
||||
# via
|
||||
# effdet
|
||||
# unstructured-inference
|
||||
tokenizers==0.21.0
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.6.0
|
||||
# via
|
||||
@ -250,31 +248,31 @@ torchvision==0.21.0
|
||||
# timm
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
transformers==4.49.0
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# pypdf
|
||||
# torch
|
||||
tzdata==2025.1
|
||||
# via pandas
|
||||
unstructured-inference==0.8.7
|
||||
# via -r extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.13
|
||||
# via -r extra-pdf-image.in
|
||||
unstructured-inference==0.8.9
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.15
|
||||
# via -r ./extra-pdf-image.in
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c base.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
wrapt==1.17.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# deprecated
|
||||
zipp==3.21.0
|
||||
# via importlib-resources
|
||||
|
@ -2,14 +2,14 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-pptx.in
|
||||
# pip-compile ./extra-pptx.in
|
||||
#
|
||||
lxml==5.3.1
|
||||
# via python-pptx
|
||||
pillow==11.1.0
|
||||
# via python-pptx
|
||||
python-pptx==1.0.2
|
||||
# via -r extra-pptx.in
|
||||
# via -r ./extra-pptx.in
|
||||
typing-extensions==4.12.2
|
||||
# via python-pptx
|
||||
xlsxwriter==3.2.2
|
||||
|
@ -2,31 +2,31 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile extra-xlsx.in
|
||||
# pip-compile ./extra-xlsx.in
|
||||
#
|
||||
et-xmlfile==2.0.0
|
||||
# via openpyxl
|
||||
networkx==3.2.1
|
||||
# via -r extra-xlsx.in
|
||||
# via -r ./extra-xlsx.in
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
openpyxl==3.1.5
|
||||
# via -r extra-xlsx.in
|
||||
# via -r ./extra-xlsx.in
|
||||
pandas==2.2.3
|
||||
# via -r extra-xlsx.in
|
||||
# via -r ./extra-xlsx.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# pandas
|
||||
pytz==2025.1
|
||||
# via pandas
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
tzdata==2025.1
|
||||
# via pandas
|
||||
xlrd==2.0.1
|
||||
# via -r extra-xlsx.in
|
||||
# via -r ./extra-xlsx.in
|
||||
|
@ -2,19 +2,19 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile huggingface.in
|
||||
# pip-compile ./huggingface.in
|
||||
#
|
||||
certifi==2025.1.31
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
charset-normalizer==3.4.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
click==8.1.8
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
filelock==3.17.0
|
||||
# via
|
||||
@ -25,24 +25,24 @@ fsspec==2025.2.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface-hub==0.28.1
|
||||
huggingface-hub==0.29.2
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
idna==3.10
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
jinja2==3.1.5
|
||||
jinja2==3.1.6
|
||||
# via torch
|
||||
joblib==1.4.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
langdetect==1.0.9
|
||||
# via
|
||||
# -c base.txt
|
||||
# -r huggingface.in
|
||||
# -c ./base.txt
|
||||
# -r ./huggingface.in
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
mpmath==1.3.0
|
||||
@ -51,11 +51,11 @@ networkx==3.2.1
|
||||
# via torch
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# transformers
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
pyyaml==6.0.2
|
||||
@ -64,47 +64,47 @@ pyyaml==6.0.2
|
||||
# transformers
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
# transformers
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
sacremoses==0.1.1
|
||||
# via -r huggingface.in
|
||||
safetensors==0.5.2
|
||||
# via -r ./huggingface.in
|
||||
safetensors==0.5.3
|
||||
# via transformers
|
||||
sentencepiece==0.2.0
|
||||
# via -r huggingface.in
|
||||
# via -r ./huggingface.in
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# langdetect
|
||||
sympy==1.13.1
|
||||
# via torch
|
||||
tokenizers==0.21.0
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.6.0
|
||||
# via -r huggingface.in
|
||||
# via -r ./huggingface.in
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.49.0
|
||||
# via -r huggingface.in
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c base.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
|
@ -2,46 +2,46 @@
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile test.in
|
||||
# pip-compile ./test.in
|
||||
#
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.8.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
appdirs==1.4.4
|
||||
# via label-studio-sdk
|
||||
argcomplete==3.5.3
|
||||
argcomplete==3.6.0
|
||||
# via datamodel-code-generator
|
||||
attrs==25.1.0
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
autoflake==2.3.1
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
black==25.1.0
|
||||
# via
|
||||
# -r test.in
|
||||
# -r ./test.in
|
||||
# datamodel-code-generator
|
||||
certifi==2025.1.31
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.4.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
click==8.1.8
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# nltk
|
||||
coverage[toml]==7.6.12
|
||||
# via
|
||||
# -r test.in
|
||||
# -r ./test.in
|
||||
# pytest-cov
|
||||
datamodel-code-generator==0.26.1
|
||||
# via label-studio-sdk
|
||||
@ -51,40 +51,40 @@ email-validator==2.2.0
|
||||
# via pydantic
|
||||
exceptiongroup==1.2.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# pytest
|
||||
faker==36.1.1
|
||||
faker==36.2.2
|
||||
# via jsf
|
||||
flake8==7.1.2
|
||||
# via
|
||||
# -r test.in
|
||||
# -r ./test.in
|
||||
# flake8-print
|
||||
flake8-print==5.0.0
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.1
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
genson==1.3.0
|
||||
# via datamodel-code-generator
|
||||
grpcio==1.70.0
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -r test.in
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./test.in
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
httpcore==1.0.7
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
httpx==0.28.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
idna==3.10
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# email-validator
|
||||
# httpx
|
||||
@ -98,11 +98,11 @@ iniconfig==2.0.0
|
||||
# via pytest
|
||||
isort==5.13.2
|
||||
# via datamodel-code-generator
|
||||
jinja2==3.1.5
|
||||
jinja2==3.1.6
|
||||
# via datamodel-code-generator
|
||||
joblib==1.4.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
jsf==0.11.2
|
||||
# via label-studio-sdk
|
||||
@ -113,12 +113,12 @@ jsonschema==4.23.0
|
||||
jsonschema-specifications==2024.10.1
|
||||
# via jsonschema
|
||||
label-studio-sdk==1.0.10
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
liccheck==0.9.2
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
lxml==5.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
@ -127,24 +127,24 @@ mccabe==0.7.0
|
||||
multidict==6.1.0
|
||||
# via yarl
|
||||
mypy==1.15.0
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# mypy
|
||||
nltk==3.9.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
# pandas
|
||||
packaging==24.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# datamodel-code-generator
|
||||
# pytest
|
||||
@ -158,7 +158,7 @@ platformdirs==4.3.6
|
||||
# via black
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
propcache==0.2.1
|
||||
propcache==0.3.0
|
||||
# via yarl
|
||||
pycodestyle==2.12.1
|
||||
# via
|
||||
@ -166,7 +166,7 @@ pycodestyle==2.12.1
|
||||
# flake8-print
|
||||
pydantic[email]==2.10.6
|
||||
# via
|
||||
# -r test.in
|
||||
# -r ./test.in
|
||||
# datamodel-code-generator
|
||||
# jsf
|
||||
# label-studio-sdk
|
||||
@ -178,17 +178,17 @@ pyflakes==3.2.0
|
||||
# via
|
||||
# autoflake
|
||||
# flake8
|
||||
pytest==8.3.4
|
||||
pytest==8.3.5
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
pytest-cov==6.0.0
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
pytest-mock==3.14.0
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# freezegun
|
||||
# pandas
|
||||
pytz==2025.1
|
||||
@ -203,35 +203,35 @@ referencing==0.36.2
|
||||
# jsonschema-specifications
|
||||
regex==2024.11.6
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# label-studio-sdk
|
||||
# requests-mock
|
||||
# smart-open
|
||||
requests-mock==1.12.1
|
||||
# via label-studio-sdk
|
||||
rpds-py==0.22.3
|
||||
rpds-py==0.23.1
|
||||
# via
|
||||
# jsonschema
|
||||
# referencing
|
||||
rstr==3.2.2
|
||||
# via jsf
|
||||
ruff==0.9.6
|
||||
# via -r test.in
|
||||
ruff==0.9.9
|
||||
# via -r ./test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
six==1.17.0
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
smart-open[http]==7.1.0
|
||||
# via jsf
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
toml==0.10.2
|
||||
# via
|
||||
@ -246,21 +246,21 @@ tomli==2.2.1
|
||||
# pytest
|
||||
tqdm==4.67.1
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# nltk
|
||||
types-click==7.1.8
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
types-markdown==3.7.0.20241204
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
types-requests==2.31.0.6
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
types-tabulate==0.9.0.20241207
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# black
|
||||
# jsf
|
||||
@ -278,15 +278,15 @@ ujson==5.10.0
|
||||
# via label-studio-sdk
|
||||
urllib3==1.26.20
|
||||
# via
|
||||
# -c ./deps/constraints.txt
|
||||
# -c base.txt
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
# vcrpy
|
||||
vcrpy==7.0.0
|
||||
# via -r test.in
|
||||
# via -r ./test.in
|
||||
wrapt==1.17.2
|
||||
# via
|
||||
# -c base.txt
|
||||
# -c ./base.txt
|
||||
# smart-open
|
||||
# vcrpy
|
||||
xmljson==0.2.1
|
||||
|
@ -5,6 +5,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
@ -944,3 +945,45 @@ class Describe_ZipFileDetector:
|
||||
):
|
||||
ctx = _FileTypeDetectionContext(example_doc_path(file_name))
|
||||
assert _ZipFileDetector.file_type(ctx) is expected_value
|
||||
|
||||
|
||||
def test_mimetype_magic_detection_is_used_before_filename_when_filetype_is_detected_for_json():
|
||||
json_bytes = json.dumps([{"example": "data"}]).encode("utf-8")
|
||||
|
||||
file_buffer = io.BytesIO(json_bytes)
|
||||
predicted_type = detect_filetype(file=file_buffer, metadata_file_path="filename.pdf")
|
||||
assert predicted_type == FileType.JSON
|
||||
|
||||
file_buffer.name = "filename.pdf"
|
||||
predicted_type = detect_filetype(file=file_buffer)
|
||||
assert predicted_type == FileType.JSON
|
||||
|
||||
|
||||
def test_mimetype_magic_detection_is_used_before_filename_when_filetype_is_detected_for_ndjson():
|
||||
data = [{"example": "data1"}, {"example": "data2"}, {"example": "data3"}]
|
||||
ndjson_string = "\n".join(json.dumps(item) for item in data) + "\n"
|
||||
ndjson_bytes = ndjson_string.encode("utf-8")
|
||||
|
||||
file_buffer = io.BytesIO(ndjson_bytes)
|
||||
predicted_type = detect_filetype(file=file_buffer, metadata_file_path="filename.pdf")
|
||||
assert predicted_type == FileType.NDJSON
|
||||
|
||||
file_buffer.name = "filename.pdf"
|
||||
predicted_type = detect_filetype(file=file_buffer)
|
||||
assert predicted_type == FileType.NDJSON
|
||||
|
||||
|
||||
def test_json_content_type_is_disambiguated_for_ndjson():
|
||||
data = [{"example": "data1"}, {"example": "data2"}, {"example": "data3"}]
|
||||
ndjson_string = "\n".join(json.dumps(item) for item in data) + "\n"
|
||||
ndjson_bytes = ndjson_string.encode("utf-8")
|
||||
|
||||
file_buffer = io.BytesIO(ndjson_bytes)
|
||||
predicted_type = detect_filetype(
|
||||
file=file_buffer, metadata_file_path="filename.pdf", content_type="application/json"
|
||||
)
|
||||
assert predicted_type == FileType.NDJSON
|
||||
|
||||
file_buffer.name = "filename.pdf"
|
||||
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
|
||||
assert predicted_type == FileType.NDJSON
|
||||
|
@ -409,17 +409,17 @@ def test_auto_partition_json_from_file_preserves_original_elements():
|
||||
assert elements_to_dicts(partitioned_elements) == elements_to_dicts(original_elements)
|
||||
|
||||
|
||||
def test_auto_partition_json_raises_with_unprocessable_json(tmp_path: pathlib.Path):
|
||||
# NOTE(robinson) - This is unprocessable because it is not a list of dicts, per the
|
||||
# Unstructured JSON serialization format
|
||||
text = '{"hi": "there"}'
|
||||
def test_auto_partition_processes_simple_ndjson(tmp_path: pathlib.Path):
|
||||
text = '{"text": "hello", "type": "NarrativeText"}'
|
||||
|
||||
file_path = str(tmp_path / "unprocessable.json")
|
||||
with open(file_path, "w") as f:
|
||||
f.write(text)
|
||||
|
||||
with pytest.raises(ValueError, match="Detected a JSON file that does not conform to the Unst"):
|
||||
partition(filename=file_path)
|
||||
result = partition(filename=file_path)
|
||||
assert len(result) == 1
|
||||
assert isinstance(result[0], NarrativeText)
|
||||
assert "hello" in result[0].text
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.16.24" # pragma: no cover
|
||||
__version__ = "0.16.25" # pragma: no cover
|
||||
|
@ -169,30 +169,31 @@ class _FileTypeDetector:
|
||||
# -- accuracy. So start with binary types and only then consider an asserted content-type,
|
||||
# -- generally as a last resort.
|
||||
|
||||
# -- strategy 1: most binary types can be detected with 100% accuracy --
|
||||
if file_type := self._known_binary_file_type:
|
||||
return file_type
|
||||
if (
|
||||
( # strategy 1: most binary types can be detected with 100% accuracy
|
||||
predicted_file_type := self._known_binary_file_type
|
||||
)
|
||||
or ( # strategy 2: use content-type asserted by caller
|
||||
predicted_file_type := self._file_type_from_content_type
|
||||
)
|
||||
or ( # strategy 3: guess MIME-type using libmagic and use that
|
||||
predicted_file_type := self._file_type_from_guessed_mime_type
|
||||
)
|
||||
or ( # strategy 4: use filename-extension, like ".docx" -> FileType.DOCX
|
||||
predicted_file_type := self._file_type_from_file_extension
|
||||
)
|
||||
):
|
||||
result_file_type = predicted_file_type
|
||||
else:
|
||||
# give up and report FileType.UNK
|
||||
result_file_type = FileType.UNK
|
||||
|
||||
# -- strategy 2: use content-type asserted by caller --
|
||||
if file_type := self._file_type_from_content_type:
|
||||
return file_type
|
||||
if result_file_type == FileType.JSON:
|
||||
# edge case: JSON/NDJSON content arrives without a file extension
|
||||
# (magic lib can't distinguish them)
|
||||
result_file_type = self._disambiguate_json_file_type
|
||||
|
||||
# -- strategy 3: guess MIME-type using libmagic and use that --
|
||||
if file_type := self._file_type_from_guessed_mime_type:
|
||||
return file_type
|
||||
|
||||
# -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
|
||||
if file_type := self._file_type_from_file_extension:
|
||||
return file_type
|
||||
|
||||
# -- strategy 5: edge case where JSON/NDJSON content without file extension --
|
||||
if file_type := self._disambiguate_json_file_type:
|
||||
return file_type
|
||||
|
||||
# -- strategy 6: give up and report FileType.UNK --
|
||||
return FileType.UNK
|
||||
|
||||
# == STRATEGIES ============================================================
|
||||
return result_file_type
|
||||
|
||||
@property
|
||||
def _known_binary_file_type(self) -> FileType | None:
|
||||
|
@ -124,8 +124,7 @@ class FileType(enum.Enum):
|
||||
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
|
||||
`FileType` member or one of its alias MIME-types.
|
||||
"""
|
||||
if mime_type is None or mime_type == "application/json":
|
||||
# application/json is ambiguous as it may point to JSON and NDJSON file types
|
||||
if mime_type is None:
|
||||
return None
|
||||
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
|
||||
# -- limitations on defining a class variable on an Enum.
|
||||
|
Loading…
x
Reference in New Issue
Block a user