mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
resolve CVEs and HF issue (#4009)
update reqs to resolve CVEs and add the HF ENV to stop it from reaching out updated the Dockerfile with ENV HF_HUB_OFFLINE=1 to stop it from pinging HF. This was an issue for a gov customer. and updated requirements to resolve some open CVEs --------- Co-authored-by: cragwolfe <crag@unstructured.io> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: luke-kucing <luke-kucing@users.noreply.github.com>
This commit is contained in:
parent
3a048a5a02
commit
a7e90f7990
@ -1,3 +1,12 @@
|
||||
## 0.17.7
|
||||
|
||||
### Enhancements
|
||||
- **Updated Docker file with ENV HF_HUB_OFFLINE=1 to prevent the contianer from trying to access the internet
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.17.7-dev0
|
||||
|
||||
### Enhancements
|
||||
|
@ -31,4 +31,6 @@ RUN find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt
|
||||
$PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
|
||||
$PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
||||
|
||||
ENV HF_HUB_OFFLINE=1
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
|
@ -20,7 +20,7 @@ cffi==1.17.1
|
||||
# via cryptography
|
||||
chardet==5.2.0
|
||||
# via -r ./base.in
|
||||
charset-normalizer==3.4.1
|
||||
charset-normalizer==3.4.2
|
||||
# via
|
||||
# requests
|
||||
# unstructured-client
|
||||
@ -28,17 +28,17 @@ click==8.1.8
|
||||
# via
|
||||
# nltk
|
||||
# python-oxmsg
|
||||
cryptography==44.0.2
|
||||
cryptography==45.0.3
|
||||
# via unstructured-client
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
# -r ./base.in
|
||||
# unstructured-client
|
||||
deepdiff==8.4.2
|
||||
deepdiff==8.5.0
|
||||
# via unstructured-client
|
||||
emoji==2.14.1
|
||||
# via -r ./base.in
|
||||
exceptiongroup==1.2.2
|
||||
exceptiongroup==1.3.0
|
||||
# via anyio
|
||||
filetype==1.2.0
|
||||
# via -r ./base.in
|
||||
@ -56,7 +56,7 @@ idna==3.10
|
||||
# httpx
|
||||
# requests
|
||||
# unstructured-client
|
||||
joblib==1.4.2
|
||||
joblib==1.5.1
|
||||
# via nltk
|
||||
jsonpath-python==1.0.6
|
||||
# via unstructured-client
|
||||
@ -80,7 +80,7 @@ numpy==2.0.2
|
||||
# via -r ./base.in
|
||||
olefile==0.47
|
||||
# via python-oxmsg
|
||||
orderly-set==5.4.0
|
||||
orderly-set==5.4.1
|
||||
# via deepdiff
|
||||
packaging==25.0
|
||||
# via
|
||||
@ -90,7 +90,7 @@ psutil==7.0.0
|
||||
# via -r ./base.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pypdf==5.4.0
|
||||
pypdf==5.6.0
|
||||
# via unstructured-client
|
||||
python-dateutil==2.9.0.post0
|
||||
# via unstructured-client
|
||||
@ -125,11 +125,12 @@ tqdm==4.67.1
|
||||
# via
|
||||
# -r ./base.in
|
||||
# nltk
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -r ./base.in
|
||||
# anyio
|
||||
# beautifulsoup4
|
||||
# exceptiongroup
|
||||
# pypdf
|
||||
# python-oxmsg
|
||||
# typing-inspect
|
||||
|
@ -17,9 +17,9 @@ distlib==0.3.9
|
||||
# via virtualenv
|
||||
filelock==3.18.0
|
||||
# via virtualenv
|
||||
identify==2.6.10
|
||||
identify==2.6.12
|
||||
# via pre-commit
|
||||
importlib-metadata==8.6.1
|
||||
importlib-metadata==8.7.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# build
|
||||
@ -32,7 +32,7 @@ packaging==25.0
|
||||
# build
|
||||
pip-tools==7.4.1
|
||||
# via -r ./dev.in
|
||||
platformdirs==4.3.7
|
||||
platformdirs==4.3.8
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# virtualenv
|
||||
@ -49,11 +49,11 @@ tomli==2.2.1
|
||||
# -c ./test.txt
|
||||
# build
|
||||
# pip-tools
|
||||
virtualenv==20.30.0
|
||||
virtualenv==20.31.2
|
||||
# via pre-commit
|
||||
wheel==0.45.1
|
||||
# via pip-tools
|
||||
zipp==3.21.0
|
||||
zipp==3.22.0
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
|
@ -10,7 +10,7 @@ lxml==5.4.0
|
||||
# python-docx
|
||||
python-docx==1.1.2
|
||||
# via -r ./extra-docx.in
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -4,11 +4,11 @@
|
||||
#
|
||||
# pip-compile ./extra-markdown.in
|
||||
#
|
||||
importlib-metadata==8.6.1
|
||||
importlib-metadata==8.7.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# markdown
|
||||
markdown==3.8
|
||||
# via -r ./extra-markdown.in
|
||||
zipp==3.21.0
|
||||
zipp==3.22.0
|
||||
# via importlib-metadata
|
||||
|
@ -12,7 +12,7 @@ pypandoc==1.15
|
||||
# via -r ./extra-odt.in
|
||||
python-docx==1.1.2
|
||||
# via -r ./extra-odt.in
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
|
@ -4,11 +4,11 @@
|
||||
#
|
||||
# pip-compile ./extra-paddleocr.in
|
||||
#
|
||||
albucore==0.0.23
|
||||
albucore==0.0.24
|
||||
# via
|
||||
# albumentations
|
||||
# unstructured-paddleocr
|
||||
albumentations==2.0.5
|
||||
albumentations==2.0.8
|
||||
# via unstructured-paddleocr
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
@ -28,23 +28,23 @@ certifi==2025.4.26
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.4.1
|
||||
charset-normalizer==3.4.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
cython==3.0.12
|
||||
cython==3.1.1
|
||||
# via unstructured-paddleocr
|
||||
decorator==5.2.1
|
||||
# via paddlepaddle
|
||||
eval-type-backport==0.2.2
|
||||
# via albumentations
|
||||
exceptiongroup==1.2.2
|
||||
exceptiongroup==1.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
fire==0.7.0
|
||||
# via unstructured-paddleocr
|
||||
fonttools==4.57.0
|
||||
fonttools==4.58.1
|
||||
# via unstructured-paddleocr
|
||||
h11==0.16.0
|
||||
# via
|
||||
@ -115,15 +115,15 @@ pillow==11.2.1
|
||||
# paddlepaddle
|
||||
# scikit-image
|
||||
# unstructured-paddleocr
|
||||
protobuf==6.30.2
|
||||
protobuf==6.31.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# paddlepaddle
|
||||
pyclipper==1.3.0.post6
|
||||
# via unstructured-paddleocr
|
||||
pydantic==2.11.3
|
||||
pydantic==2.11.5
|
||||
# via albumentations
|
||||
pydantic-core==2.33.1
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
python-docx==1.1.2
|
||||
# via unstructured-paddleocr
|
||||
@ -147,7 +147,7 @@ scipy==1.13.1
|
||||
# scikit-image
|
||||
shapely==2.0.7
|
||||
# via unstructured-paddleocr
|
||||
simsimd==6.2.1
|
||||
simsimd==6.4.7
|
||||
# via albucore
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
@ -159,7 +159,7 @@ soupsieve==2.7
|
||||
# beautifulsoup4
|
||||
stringzilla==3.12.5
|
||||
# via albucore
|
||||
termcolor==3.0.1
|
||||
termcolor==3.1.0
|
||||
# via fire
|
||||
tifffile==2024.8.30
|
||||
# via scikit-image
|
||||
@ -167,19 +167,20 @@ tqdm==4.67.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# albucore
|
||||
# albumentations
|
||||
# anyio
|
||||
# beautifulsoup4
|
||||
# exceptiongroup
|
||||
# paddlepaddle
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# python-docx
|
||||
# typing-inspection
|
||||
typing-inspection==0.4.0
|
||||
typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
unstructured-paddleocr==2.10.0
|
||||
# via -r ./extra-paddleocr.in
|
||||
|
@ -4,6 +4,8 @@
|
||||
#
|
||||
# pip-compile ./extra-pdf-image.in
|
||||
#
|
||||
accelerate==1.7.0
|
||||
# via unstructured-inference
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
cachetools==5.5.2
|
||||
@ -16,7 +18,7 @@ cffi==1.17.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# cryptography
|
||||
charset-normalizer==3.4.1
|
||||
charset-normalizer==3.4.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
@ -25,7 +27,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cryptography==44.0.2
|
||||
cryptography==45.0.3
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
@ -42,15 +44,15 @@ filelock==3.18.0
|
||||
# transformers
|
||||
flatbuffers==25.2.10
|
||||
# via onnxruntime
|
||||
fonttools==4.57.0
|
||||
fonttools==4.58.1
|
||||
# via matplotlib
|
||||
fsspec==2025.3.2
|
||||
fsspec==2025.5.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-api-core[grpc]==2.24.2
|
||||
google-api-core[grpc]==2.25.0
|
||||
# via google-cloud-vision
|
||||
google-auth==2.39.0
|
||||
google-auth==2.40.2
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
@ -60,15 +62,18 @@ googleapis-common-protos==1.70.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.71.0
|
||||
grpcio==1.72.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio-status==1.62.3
|
||||
grpcio-status==1.72.1
|
||||
# via google-api-core
|
||||
huggingface-hub==0.30.2
|
||||
hf-xet==1.1.2
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.32.3
|
||||
# via
|
||||
# accelerate
|
||||
# timm
|
||||
# tokenizers
|
||||
# transformers
|
||||
@ -92,9 +97,7 @@ lxml==5.4.0
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
matplotlib==3.9.4
|
||||
# via
|
||||
# pycocotools
|
||||
# unstructured-inference
|
||||
# via unstructured-inference
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
networkx==3.2.1
|
||||
@ -102,6 +105,7 @@ networkx==3.2.1
|
||||
numpy==2.0.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# accelerate
|
||||
# contourpy
|
||||
# matplotlib
|
||||
# onnx
|
||||
@ -115,7 +119,7 @@ numpy==2.0.2
|
||||
# unstructured-inference
|
||||
omegaconf==2.3.0
|
||||
# via effdet
|
||||
onnx==1.17.0
|
||||
onnx==1.18.0
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
@ -128,6 +132,7 @@ opencv-python==4.11.0.86
|
||||
packaging==25.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# accelerate
|
||||
# huggingface-hub
|
||||
# matplotlib
|
||||
# onnxruntime
|
||||
@ -145,7 +150,7 @@ pdfminer-six==20250327
|
||||
# unstructured-inference
|
||||
pi-heif==0.22.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pikepdf==9.7.0
|
||||
pikepdf==9.8.1
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==11.2.1
|
||||
# via
|
||||
@ -159,7 +164,7 @@ proto-plus==1.26.1
|
||||
# via
|
||||
# google-api-core
|
||||
# google-cloud-vision
|
||||
protobuf==6.30.2
|
||||
protobuf==6.31.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
@ -169,13 +174,17 @@ protobuf==6.30.2
|
||||
# onnx
|
||||
# onnxruntime
|
||||
# proto-plus
|
||||
psutil==7.0.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# accelerate
|
||||
pyasn1==0.6.1
|
||||
# via
|
||||
# pyasn1-modules
|
||||
# rsa
|
||||
pyasn1-modules==0.4.2
|
||||
# via google-auth
|
||||
pycocotools==2.0.8
|
||||
pycocotools==2.0.9
|
||||
# via effdet
|
||||
pycparser==2.22
|
||||
# via
|
||||
@ -183,7 +192,7 @@ pycparser==2.22
|
||||
# cffi
|
||||
pyparsing==3.2.3
|
||||
# via matplotlib
|
||||
pypdf==5.4.0
|
||||
pypdf==5.6.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
@ -200,6 +209,7 @@ pytz==2025.2
|
||||
# via pandas
|
||||
pyyaml==6.0.2
|
||||
# via
|
||||
# accelerate
|
||||
# huggingface-hub
|
||||
# omegaconf
|
||||
# timm
|
||||
@ -222,6 +232,7 @@ rsa==4.9.1
|
||||
# via google-auth
|
||||
safetensors==0.5.3
|
||||
# via
|
||||
# accelerate
|
||||
# timm
|
||||
# transformers
|
||||
scipy==1.13.1
|
||||
@ -230,7 +241,7 @@ six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
sympy==1.13.3
|
||||
sympy==1.14.0
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
@ -244,6 +255,7 @@ tokenizers==0.21.1
|
||||
# transformers
|
||||
torch==2.7.0
|
||||
# via
|
||||
# accelerate
|
||||
# effdet
|
||||
# timm
|
||||
# torchvision
|
||||
@ -257,17 +269,18 @@ tqdm==4.67.1
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
transformers==4.51.3
|
||||
transformers==4.52.4
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
# onnx
|
||||
# pypdf
|
||||
# torch
|
||||
tzdata==2025.2
|
||||
# via pandas
|
||||
unstructured-inference==0.8.10
|
||||
unstructured-inference==1.0.2
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.15
|
||||
# via -r ./extra-pdf-image.in
|
||||
@ -280,5 +293,5 @@ wrapt==1.17.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# deprecated
|
||||
zipp==3.21.0
|
||||
zipp==3.22.0
|
||||
# via importlib-resources
|
||||
|
@ -10,7 +10,7 @@ pillow==11.2.1
|
||||
# via python-pptx
|
||||
python-pptx==1.0.2
|
||||
# via -r ./extra-pptx.in
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via python-pptx
|
||||
xlsxwriter==3.2.3
|
||||
# via python-pptx
|
||||
|
@ -8,7 +8,7 @@ certifi==2025.4.26
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
charset-normalizer==3.4.1
|
||||
charset-normalizer==3.4.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -21,11 +21,13 @@ filelock==3.18.0
|
||||
# huggingface-hub
|
||||
# torch
|
||||
# transformers
|
||||
fsspec==2025.3.2
|
||||
fsspec==2025.5.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface-hub==0.30.2
|
||||
hf-xet==1.1.2
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.32.3
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
@ -35,7 +37,7 @@ idna==3.10
|
||||
# requests
|
||||
jinja2==3.1.6
|
||||
# via torch
|
||||
joblib==1.4.2
|
||||
joblib==1.5.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# sacremoses
|
||||
@ -82,7 +84,7 @@ six==1.17.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# langdetect
|
||||
sympy==1.13.3
|
||||
sympy==1.14.0
|
||||
# via torch
|
||||
tokenizers==0.21.1
|
||||
# via
|
||||
@ -96,9 +98,9 @@ tqdm==4.67.1
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.51.3
|
||||
transformers==4.52.4
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# huggingface-hub
|
||||
|
@ -14,11 +14,11 @@ click==8.1.8
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# black
|
||||
coverage[toml]==7.8.0
|
||||
coverage[toml]==7.8.2
|
||||
# via
|
||||
# -r ./test.in
|
||||
# pytest-cov
|
||||
exceptiongroup==1.2.2
|
||||
exceptiongroup==1.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pytest
|
||||
@ -28,9 +28,9 @@ flake8==7.2.0
|
||||
# flake8-print
|
||||
flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.1
|
||||
freezegun==1.5.2
|
||||
# via -r ./test.in
|
||||
grpcio==1.71.0
|
||||
grpcio==1.72.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./test.in
|
||||
@ -40,7 +40,7 @@ liccheck==0.9.2
|
||||
# via -r ./test.in
|
||||
mccabe==0.7.0
|
||||
# via flake8
|
||||
mypy==1.15.0
|
||||
mypy==1.16.0
|
||||
# via -r ./test.in
|
||||
mypy-extensions==1.1.0
|
||||
# via
|
||||
@ -53,36 +53,40 @@ packaging==25.0
|
||||
# black
|
||||
# pytest
|
||||
pathspec==0.12.1
|
||||
# via
|
||||
# black
|
||||
# mypy
|
||||
platformdirs==4.3.8
|
||||
# via black
|
||||
platformdirs==4.3.7
|
||||
# via black
|
||||
pluggy==1.5.0
|
||||
pluggy==1.6.0
|
||||
# via pytest
|
||||
pycodestyle==2.13.0
|
||||
# via
|
||||
# flake8
|
||||
# flake8-print
|
||||
pydantic==2.11.3
|
||||
pydantic==2.11.5
|
||||
# via -r ./test.in
|
||||
pydantic-core==2.33.1
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pyflakes==3.3.2
|
||||
# via
|
||||
# autoflake
|
||||
# flake8
|
||||
pytest==8.3.5
|
||||
pygments==2.19.1
|
||||
# via pytest
|
||||
pytest==8.4.0
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
pytest-cov==6.1.1
|
||||
# via -r ./test.in
|
||||
pytest-mock==3.14.0
|
||||
pytest-mock==3.14.1
|
||||
# via -r ./test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# freezegun
|
||||
ruff==0.11.7
|
||||
ruff==0.11.12
|
||||
# via -r ./test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
@ -109,13 +113,14 @@ types-tabulate==0.9.0.20241207
|
||||
# via -r ./test.in
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.13.2
|
||||
typing-extensions==4.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# exceptiongroup
|
||||
# mypy
|
||||
# pydantic
|
||||
# pydantic-core
|
||||
# typing-inspection
|
||||
typing-inspection==0.4.0
|
||||
typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
|
@ -1,7 +1,11 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners import translate
|
||||
|
||||
IS_CI = os.getenv("CI") == "true"
|
||||
|
||||
|
||||
def test_get_opus_mt_model_name():
|
||||
model_name = translate._get_opus_mt_model_name("ru", "en")
|
||||
@ -24,27 +28,32 @@ def test_translate_returns_same_text_text_is_empty():
|
||||
assert translate.translate_text(text) == text
|
||||
|
||||
|
||||
@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline")
|
||||
def test_translate_with_language_specified():
|
||||
text = "Ich bin ein Berliner!"
|
||||
assert translate.translate_text(text, "de") == "I'm a Berliner!"
|
||||
|
||||
|
||||
@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline")
|
||||
def test_translate_with_no_language_specified():
|
||||
text = "Ich bin ein Berliner!"
|
||||
assert translate.translate_text(text) == "I'm a Berliner!"
|
||||
|
||||
|
||||
@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline")
|
||||
def test_translate_raises_with_bad_language():
|
||||
text = "Ich bin ein Berliner!"
|
||||
with pytest.raises(ValueError):
|
||||
translate.translate_text(text, "zz")
|
||||
|
||||
|
||||
@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline")
|
||||
def test_tranlate_works_with_russian():
|
||||
text = "Я тоже можно переводать русский язык!"
|
||||
assert translate.translate_text(text) == "I can also translate Russian!"
|
||||
|
||||
|
||||
@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline")
|
||||
def test_translate_works_with_chinese():
|
||||
text = "網站有中、英文版本"
|
||||
translate.translate_text(text) == "Website available in Chinese and English"
|
||||
|
@ -26,7 +26,7 @@
|
||||
Large Model
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
| Notes
|
||||
Notes
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
|
@ -168,21 +168,33 @@
|
||||
Dataset
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
| Base Model'|
|
||||
|
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
| Notes
|
||||
Base Model'|
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
Large Model |
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
Notes
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr style="border: 1px solid black;">
|
||||
<td style="border: 1px solid black;">
|
||||
PubLayNet B8]|
|
||||
PubLayNet
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
B8]|
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
F/M
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
M
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Layouts of modern scientific documents
|
||||
</td>
|
||||
@ -191,9 +203,14 @@
|
||||
<td style="border: 1px solid black;">
|
||||
PRImA
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
M
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
-
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Layouts of scanned modern magazines and scientific report
|
||||
</td>
|
||||
@ -202,9 +219,14 @@
|
||||
<td style="border: 1px solid black;">
|
||||
Newspaper
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
F
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
-
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Layouts of scanned US newspapers from the 20th century
|
||||
</td>
|
||||
@ -213,6 +235,11 @@
|
||||
<td style="border: 1px solid black;">
|
||||
TableBank
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
F
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
F
|
||||
</td>
|
||||
@ -224,9 +251,14 @@
|
||||
<td style="border: 1px solid black;">
|
||||
HJDataset
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
F/M
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
-
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Layouts of history Japanese documents
|
||||
</td>
|
||||
@ -316,10 +348,7 @@
|
||||
<thead>
|
||||
<tr style="border: 1px solid black;">
|
||||
<th style="border: 1px solid black;">
|
||||
block.pad(top, bottom,
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
right,
|
||||
block.pad(top, bottom, right,
|
||||
</th>
|
||||
<th style="border: 1px solid black;">
|
||||
left)
|
||||
@ -336,8 +365,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Scale the current block given the ratio in x and y direction
|
||||
</td>
|
||||
@ -348,8 +375,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Move the current block with the shift distances in x and y direction
|
||||
</td>
|
||||
@ -360,8 +385,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Whether block] is inside of block2
|
||||
</td>
|
||||
@ -372,8 +395,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
|
||||
</td>
|
||||
@ -384,8 +405,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
|
||||
</td>
|
||||
@ -396,8 +415,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Convert the absolute coordinates of block to relative coordinates to block2
|
||||
</td>
|
||||
@ -408,8 +425,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
|
||||
</td>
|
||||
@ -420,8 +435,6 @@
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
</td>
|
||||
<td style="border: 1px solid black;">
|
||||
Obtain the image segments in the block region
|
||||
</td>
|
||||
|
@ -48,7 +48,7 @@
|
||||
"element_id": "dddac446da6c93dc1449ecb5d997c423",
|
||||
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents",
|
||||
"metadata": {
|
||||
"text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
|
||||
"text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>Notes</th></tr></thead><tbody><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
|
||||
"filetype": "image/jpeg",
|
||||
"languages": [
|
||||
"eng"
|
||||
|
@ -1459,7 +1459,7 @@
|
||||
"start_index": 65
|
||||
}
|
||||
],
|
||||
"text_as_html": "<table><thead><tr><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></tr></thead><tbody><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
|
||||
"text_as_html": "<table><thead><tr><th>Dataset</th><th>|</th><th>Base Model'|</th><th>Large Model |</th><th>Notes</th></tr></thead><tbody><tr><td>PubLayNet</td><td>B8]|</td><td>F/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td></td><td>M</td><td>-</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td></td><td>F</td><td>-</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td></td><td>F</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td></td><td>F/M</td><td>-</td><td>Layouts of history Japanese documents</td></tr></tbody></table>",
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
@ -2153,7 +2153,7 @@
|
||||
"element_id": "64bc79d1132a89c71837f420d6e4e2dc",
|
||||
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
|
||||
"metadata": {
|
||||
"text_as_html": "<table><thead><tr><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></tr></thead><tbody><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></tbody></table>",
|
||||
"text_as_html": "<table><thead><tr><th>block.pad(top, bottom, right,</th><th>left)</th><th>Enlarge the current block according to the input</th></tr></thead><tbody><tr><td>block.scale(fx, fy)</td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td>Obtain the image segments in the block region</td></tr></tbody></table>",
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.17.7-dev0" # pragma: no cover
|
||||
__version__ = "0.17.7" # pragma: no cover
|
||||
|
Loading…
x
Reference in New Issue
Block a user