mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
build: drop remaining Python 3.9 refs (#4049)
Dropped variables that said we support Python 3.9 in `setup.py`, as well as any remaining references to Python 3.9. I also checked the pins and removed several that don't seem necessary any more.
This commit is contained in:
parent
92965fb286
commit
7764fb6fd4
@ -1,7 +1,9 @@
|
||||
## 0.18.5-dev0
|
||||
## 0.18.5-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left over when we dropped Python 3.9 support.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
@ -22,7 +22,7 @@ unstructured/nlp/patterns\.py
|
||||
|
||||
[tool.pyright]
|
||||
pythonPlatform = "Linux"
|
||||
pythonVersion = "3.9"
|
||||
pythonVersion = "3.10"
|
||||
reportUnnecessaryCast = true
|
||||
reportUnnecessaryTypeIgnoreComment = true
|
||||
stubPath = "./typings"
|
||||
@ -31,7 +31,7 @@ verboseOutput = true
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py39"
|
||||
target-version = "py310"
|
||||
|
||||
[tool.ruff.lint]
|
||||
ignore = [
|
||||
|
||||
@ -10,7 +10,7 @@ backoff==2.2.1
|
||||
# via -r ./base.in
|
||||
beautifulsoup4==4.13.4
|
||||
# via -r ./base.in
|
||||
certifi==2025.6.15
|
||||
certifi==2025.7.9
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
@ -28,7 +28,7 @@ click==8.2.1
|
||||
# via
|
||||
# nltk
|
||||
# python-oxmsg
|
||||
cryptography==45.0.4
|
||||
cryptography==45.0.5
|
||||
# via unstructured-client
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
@ -62,7 +62,7 @@ jsonpath-python==1.0.6
|
||||
# via unstructured-client
|
||||
langdetect==1.0.9
|
||||
# via -r ./base.in
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via -r ./base.in
|
||||
marshmallow==3.26.1
|
||||
# via
|
||||
@ -90,7 +90,7 @@ psutil==7.0.0
|
||||
# via -r ./base.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pypdf==5.6.0
|
||||
pypdf==5.7.0
|
||||
# via unstructured-client
|
||||
python-dateutil==2.9.0.post0
|
||||
# via unstructured-client
|
||||
@ -125,7 +125,7 @@ tqdm==4.67.1
|
||||
# via
|
||||
# -r ./base.in
|
||||
# nltk
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -r ./base.in
|
||||
# anyio
|
||||
@ -143,7 +143,7 @@ unstructured-client==0.25.9
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# -r ./base.in
|
||||
urllib3==2.4.0
|
||||
urllib3==2.5.0
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# requests
|
||||
|
||||
@ -5,19 +5,10 @@
|
||||
####################################################################################################
|
||||
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
|
||||
weaviate-client>=3.26.7,<4.0.0
|
||||
# TODO: Constraint due to multiple versions being installed during pip-compile
|
||||
protobuf>=6.30.0
|
||||
# TODO: Constraint due to multiple versions being installed during pip-compile
|
||||
grpcio>=1.65.5
|
||||
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)
|
||||
tokenizers>=0.21,<0.22
|
||||
# TODO: Constraint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
|
||||
# updated or we drop support for 3.9
|
||||
# NOTE(alan): Okay to drop pin once this version exists and we verify compatibility.
|
||||
urllib3<3.0.0
|
||||
# TODO: Constraint due to aiobotocore, remove when that gets updated:
|
||||
botocore<1.34.132
|
||||
# TODO: Constraint due to both 8.5.0 and 8.4.0 being installed during pip-compile
|
||||
importlib-metadata>=8.5.0
|
||||
# (austin): Versions below this have a different interface for passing parameters
|
||||
unstructured-client>=0.23.0,<0.26.0
|
||||
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file
|
||||
|
||||
@ -8,7 +8,7 @@ numpy==2.2.6
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pandas
|
||||
pandas==2.3.0
|
||||
pandas==2.3.1
|
||||
# via -r ./extra-csv.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
|
||||
@ -4,13 +4,13 @@
|
||||
#
|
||||
# pip-compile ./extra-docx.in
|
||||
#
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
python-docx==1.2.0
|
||||
# via -r ./extra-docx.in
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
|
||||
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./extra-markdown.in
|
||||
#
|
||||
markdown==3.8
|
||||
markdown==3.8.2
|
||||
# via -r ./extra-markdown.in
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./extra-odt.in
|
||||
#
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
@ -12,7 +12,7 @@ pypandoc==1.15
|
||||
# via -r ./extra-odt.in
|
||||
python-docx==1.2.0
|
||||
# via -r ./extra-odt.in
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
|
||||
@ -16,13 +16,11 @@ anyio==4.9.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# httpx
|
||||
astor==0.8.1
|
||||
# via paddlepaddle
|
||||
beautifulsoup4==4.13.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# unstructured-paddleocr
|
||||
certifi==2025.6.15
|
||||
certifi==2025.7.9
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# httpcore
|
||||
@ -42,7 +40,7 @@ exceptiongroup==1.3.0
|
||||
# anyio
|
||||
fire==0.7.0
|
||||
# via unstructured-paddleocr
|
||||
fonttools==4.58.4
|
||||
fonttools==4.58.5
|
||||
# via unstructured-paddleocr
|
||||
h11==0.16.0
|
||||
# via
|
||||
@ -66,7 +64,7 @@ imageio==2.37.0
|
||||
# via scikit-image
|
||||
lazy-loader==0.4
|
||||
# via scikit-image
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-docx
|
||||
@ -90,11 +88,11 @@ numpy==2.2.6
|
||||
# shapely
|
||||
# tifffile
|
||||
# unstructured-paddleocr
|
||||
opencv-contrib-python==4.11.0.86
|
||||
opencv-contrib-python==4.12.0.88
|
||||
# via unstructured-paddleocr
|
||||
opencv-python==4.11.0.86
|
||||
opencv-python==4.12.0.88
|
||||
# via unstructured-paddleocr
|
||||
opencv-python-headless==4.11.0.86
|
||||
opencv-python-headless==4.12.0.88
|
||||
# via
|
||||
# albucore
|
||||
# albumentations
|
||||
@ -105,7 +103,7 @@ packaging==25.0
|
||||
# -c requirements/base.txt
|
||||
# lazy-loader
|
||||
# scikit-image
|
||||
paddlepaddle==3.0.0
|
||||
paddlepaddle==3.1.0
|
||||
# via -r ./extra-paddleocr.in
|
||||
pillow==11.3.0
|
||||
# via
|
||||
@ -145,7 +143,7 @@ scipy==1.15.3
|
||||
# scikit-image
|
||||
shapely==2.1.1
|
||||
# via unstructured-paddleocr
|
||||
simsimd==6.4.9
|
||||
simsimd==6.5.0
|
||||
# via albucore
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
@ -165,7 +163,7 @@ tqdm==4.67.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# unstructured-paddleocr
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# anyio
|
||||
@ -180,7 +178,7 @@ typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
unstructured-paddleocr==2.10.0
|
||||
# via -r ./extra-paddleocr.in
|
||||
urllib3==2.4.0
|
||||
urllib3==2.5.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/deps/constraints.txt
|
||||
|
||||
@ -4,13 +4,13 @@
|
||||
#
|
||||
# pip-compile ./extra-pdf-image.in
|
||||
#
|
||||
accelerate==1.7.0
|
||||
accelerate==1.8.1
|
||||
# via unstructured-inference
|
||||
antlr4-python3-runtime==4.9.3
|
||||
# via omegaconf
|
||||
cachetools==5.5.2
|
||||
# via google-auth
|
||||
certifi==2025.6.15
|
||||
certifi==2025.7.9
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
@ -27,7 +27,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.3.2
|
||||
# via matplotlib
|
||||
cryptography==45.0.4
|
||||
cryptography==45.0.5
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pdfminer-six
|
||||
@ -44,7 +44,7 @@ filelock==3.18.0
|
||||
# transformers
|
||||
flatbuffers==25.2.10
|
||||
# via onnxruntime
|
||||
fonttools==4.58.4
|
||||
fonttools==4.58.5
|
||||
# via matplotlib
|
||||
fsspec==2025.5.1
|
||||
# via
|
||||
@ -62,16 +62,16 @@ googleapis-common-protos==1.70.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.73.0
|
||||
grpcio==1.73.1
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio-status==1.73.0
|
||||
grpcio-status==1.73.1
|
||||
# via google-api-core
|
||||
hf-xet==1.1.4
|
||||
hf-xet==1.1.5
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.33.0
|
||||
huggingface-hub==0.33.2
|
||||
# via
|
||||
# accelerate
|
||||
# timm
|
||||
@ -88,7 +88,7 @@ jinja2==3.1.6
|
||||
# via torch
|
||||
kiwisolver==1.4.8
|
||||
# via matplotlib
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pikepdf
|
||||
@ -125,7 +125,7 @@ onnxruntime==1.22.0
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
opencv-python==4.11.0.86
|
||||
opencv-python==4.12.0.88
|
||||
# via unstructured-inference
|
||||
packaging==25.0
|
||||
# via
|
||||
@ -137,7 +137,7 @@ packaging==25.0
|
||||
# pikepdf
|
||||
# transformers
|
||||
# unstructured-pytesseract
|
||||
pandas==2.3.0
|
||||
pandas==2.3.1
|
||||
# via unstructured-inference
|
||||
pdf2image==1.17.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
@ -146,9 +146,9 @@ pdfminer-six==20250327
|
||||
# -c requirements/deps/constraints.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
# unstructured-inference
|
||||
pi-heif==0.22.0
|
||||
pi-heif==1.0.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pikepdf==9.8.1
|
||||
pikepdf==9.9.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==11.3.0
|
||||
# via
|
||||
@ -190,7 +190,7 @@ pycparser==2.22
|
||||
# cffi
|
||||
pyparsing==3.2.3
|
||||
# via matplotlib
|
||||
pypdf==5.6.0
|
||||
pypdf==5.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
@ -243,11 +243,11 @@ sympy==1.14.0
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==1.0.15
|
||||
timm==1.0.16
|
||||
# via
|
||||
# effdet
|
||||
# unstructured-inference
|
||||
tokenizers==0.21.1
|
||||
tokenizers==0.21.2
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# transformers
|
||||
@ -267,9 +267,9 @@ tqdm==4.67.1
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
transformers==4.52.4
|
||||
transformers==4.53.1
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
@ -282,7 +282,7 @@ unstructured-inference==1.0.5
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.15
|
||||
# via -r ./extra-pdf-image.in
|
||||
urllib3==2.4.0
|
||||
urllib3==2.5.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/deps/constraints.txt
|
||||
|
||||
@ -4,13 +4,13 @@
|
||||
#
|
||||
# pip-compile ./extra-pptx.in
|
||||
#
|
||||
lxml==5.4.0
|
||||
lxml==6.0.0
|
||||
# via python-pptx
|
||||
pillow==11.3.0
|
||||
# via python-pptx
|
||||
python-pptx==1.0.2
|
||||
# via -r ./extra-pptx.in
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via python-pptx
|
||||
xlsxwriter==3.2.3
|
||||
xlsxwriter==3.2.5
|
||||
# via python-pptx
|
||||
|
||||
@ -14,7 +14,7 @@ numpy==2.2.6
|
||||
# pandas
|
||||
openpyxl==3.1.5
|
||||
# via -r ./extra-xlsx.in
|
||||
pandas==2.3.0
|
||||
pandas==2.3.1
|
||||
# via -r ./extra-xlsx.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./huggingface.in
|
||||
#
|
||||
certifi==2025.6.15
|
||||
certifi==2025.7.9
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
@ -25,9 +25,9 @@ fsspec==2025.5.1
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
hf-xet==1.1.4
|
||||
hf-xet==1.1.5
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.33.0
|
||||
huggingface-hub==0.33.2
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
@ -86,7 +86,7 @@ six==1.17.0
|
||||
# langdetect
|
||||
sympy==1.14.0
|
||||
# via torch
|
||||
tokenizers==0.21.1
|
||||
tokenizers==0.21.2
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# transformers
|
||||
@ -98,14 +98,14 @@ tqdm==4.67.1
|
||||
# huggingface-hub
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.52.4
|
||||
transformers==4.53.1
|
||||
# via -r ./huggingface.in
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
# torch
|
||||
urllib3==2.4.0
|
||||
urllib3==2.5.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/deps/constraints.txt
|
||||
|
||||
@ -14,7 +14,7 @@ click==8.2.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
coverage[toml]==7.9.1
|
||||
coverage[toml]==7.9.2
|
||||
# via
|
||||
# -r ./test.in
|
||||
# pytest-cov
|
||||
@ -24,7 +24,7 @@ exceptiongroup==1.3.0
|
||||
# pytest
|
||||
execnet==2.1.1
|
||||
# via pytest-xdist
|
||||
flake8==7.2.0
|
||||
flake8==7.3.0
|
||||
# via
|
||||
# -r ./test.in
|
||||
# flake8-print
|
||||
@ -32,7 +32,7 @@ flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.2
|
||||
# via -r ./test.in
|
||||
grpcio==1.73.0
|
||||
grpcio==1.73.1
|
||||
# via
|
||||
# -c requirements/deps/constraints.txt
|
||||
# -r ./test.in
|
||||
@ -64,7 +64,7 @@ pluggy==1.6.0
|
||||
# via
|
||||
# pytest
|
||||
# pytest-cov
|
||||
pycodestyle==2.13.0
|
||||
pycodestyle==2.14.0
|
||||
# via
|
||||
# flake8
|
||||
# flake8-print
|
||||
@ -72,13 +72,13 @@ pydantic==2.11.7
|
||||
# via -r ./test.in
|
||||
pydantic-core==2.33.2
|
||||
# via pydantic
|
||||
pyflakes==3.3.2
|
||||
pyflakes==3.4.0
|
||||
# via
|
||||
# autoflake
|
||||
# flake8
|
||||
pygments==2.19.1
|
||||
pygments==2.19.2
|
||||
# via pytest
|
||||
pytest==8.4.0
|
||||
pytest==8.4.1
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
@ -87,13 +87,13 @@ pytest-cov==6.2.1
|
||||
# via -r ./test.in
|
||||
pytest-mock==3.14.1
|
||||
# via -r ./test.in
|
||||
pytest-xdist==3.7.0
|
||||
pytest-xdist==3.8.0
|
||||
# via -r ./test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# freezegun
|
||||
ruff==0.11.13
|
||||
ruff==0.12.2
|
||||
# via -r ./test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
@ -112,13 +112,13 @@ tomli==2.2.1
|
||||
# pytest
|
||||
types-click==7.1.8
|
||||
# via -r ./test.in
|
||||
types-markdown==3.8.0.20250415
|
||||
types-markdown==3.8.0.20250708
|
||||
# via -r ./test.in
|
||||
types-requests==2.32.4.20250611
|
||||
# via -r ./test.in
|
||||
types-tabulate==0.9.0.20241207
|
||||
# via -r ./test.in
|
||||
typing-extensions==4.14.0
|
||||
typing-extensions==4.14.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
@ -129,7 +129,7 @@ typing-extensions==4.14.0
|
||||
# typing-inspection
|
||||
typing-inspection==0.4.1
|
||||
# via pydantic
|
||||
urllib3==2.4.0
|
||||
urllib3==2.5.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/deps/constraints.txt
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# python version must match lowest supported (3.9)
|
||||
# python version must match lowest supported (3.10)
|
||||
major=3
|
||||
minor=10
|
||||
if ! python -c "import sys; assert sys.version_info.major == $major and sys.version_info.minor == $minor"; then
|
||||
|
||||
3
setup.py
3
setup.py
@ -82,7 +82,7 @@ setup(
|
||||
long_description_content_type="text/markdown",
|
||||
keywords="NLP PDF HTML CV XML parsing preprocessing",
|
||||
url="https://github.com/Unstructured-IO/unstructured",
|
||||
python_requires=">=3.9.0",
|
||||
python_requires=">=3.10.0",
|
||||
classifiers=[
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
@ -91,7 +91,6 @@ setup(
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
|
||||
@ -401,9 +401,8 @@ def test_get_page_image_metadata_and_coordinate_system():
|
||||
assert isinstance(metadata, dict)
|
||||
|
||||
|
||||
def test_ocr_data_to_elements(
|
||||
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
|
||||
):
|
||||
def test_ocr_data_to_elements():
|
||||
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
|
||||
text_regions = [
|
||||
TextRegion.from_coords(
|
||||
163.0,
|
||||
|
||||
@ -133,9 +133,8 @@ def test_partition_image_local_raises_with_no_filename():
|
||||
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
|
||||
|
||||
|
||||
def test_partition_image_with_auto_strategy(
|
||||
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
|
||||
):
|
||||
def test_partition_image_with_auto_strategy():
|
||||
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
|
||||
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
|
||||
titles = [
|
||||
el for el in elements if el.category == ElementType.TITLE and len(el.text.split(" ")) > 10
|
||||
@ -147,9 +146,8 @@ def test_partition_image_with_auto_strategy(
|
||||
assert isinstance(elements[idx].metadata.detection_class_prob, float)
|
||||
|
||||
|
||||
def test_partition_image_with_table_extraction(
|
||||
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
|
||||
):
|
||||
def test_partition_image_with_table_extraction():
|
||||
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
|
||||
elements = image.partition_image(
|
||||
filename=filename,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
@ -161,17 +159,14 @@ def test_partition_image_with_table_extraction(
|
||||
assert "</thead><tbody><tr>" in table[0]
|
||||
|
||||
|
||||
def test_partition_image_with_multipage_tiff(
|
||||
filename=example_doc_path("img/layout-parser-paper-combined.tiff"),
|
||||
):
|
||||
def test_partition_image_with_multipage_tiff():
|
||||
filename = example_doc_path("img/layout-parser-paper-combined.tiff")
|
||||
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
|
||||
assert elements[-1].metadata.page_number == 2
|
||||
|
||||
|
||||
def test_partition_image_with_bmp(
|
||||
tmpdir,
|
||||
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
|
||||
):
|
||||
def test_partition_image_with_bmp(tmpdir):
|
||||
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
|
||||
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
|
||||
img = Image.open(filename)
|
||||
img.save(bmp_filename)
|
||||
@ -187,7 +182,8 @@ def test_partition_image_with_bmp(
|
||||
assert "</thead><tbody><tr>" in table[0]
|
||||
|
||||
|
||||
def test_partition_image_with_language_passed(filename=example_doc_path("img/example.jpg")):
|
||||
def test_partition_image_with_language_passed():
|
||||
filename = example_doc_path("img/example.jpg")
|
||||
with mock.patch.object(
|
||||
ocr,
|
||||
"process_file_with_ocr",
|
||||
@ -202,9 +198,8 @@ def test_partition_image_with_language_passed(filename=example_doc_path("img/exa
|
||||
assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
|
||||
|
||||
|
||||
def test_partition_image_from_file_with_language_passed(
|
||||
filename=example_doc_path("img/example.jpg"),
|
||||
):
|
||||
def test_partition_image_from_file_with_language_passed():
|
||||
filename = example_doc_path("img/example.jpg")
|
||||
with mock.patch.object(
|
||||
ocr,
|
||||
"process_data_with_ocr",
|
||||
@ -217,9 +212,8 @@ def test_partition_image_from_file_with_language_passed(
|
||||
|
||||
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
|
||||
@pytest.mark.skip(reason="Current catching too many tesseract errors")
|
||||
def test_partition_image_raises_with_invalid_language(
|
||||
filename=example_doc_path("img/example.jpg"),
|
||||
):
|
||||
def test_partition_image_raises_with_invalid_language():
|
||||
filename = example_doc_path("img/example.jpg")
|
||||
with pytest.raises(TesseractError):
|
||||
image.partition_image(
|
||||
filename=filename,
|
||||
@ -414,9 +408,8 @@ def test_partition_msg_with_json():
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
||||
|
||||
def test_partition_image_with_ocr_has_coordinates_from_filename(
|
||||
filename=example_doc_path("img/english-and-korean.png"),
|
||||
):
|
||||
def test_partition_image_with_ocr_has_coordinates_from_filename():
|
||||
filename = example_doc_path("img/english-and-korean.png")
|
||||
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
|
||||
int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points]
|
||||
assert int_coordinates == [(14, 16), (14, 37), (381, 37), (381, 16)]
|
||||
@ -467,9 +460,8 @@ def test_partition_image_warns_with_ocr_languages(caplog):
|
||||
assert "The ocr_languages kwarg will be deprecated" in caplog.text
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_image(
|
||||
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
|
||||
):
|
||||
def test_add_chunking_strategy_on_partition_image():
|
||||
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
|
||||
elements = image.partition_image(filename=filename)
|
||||
chunk_elements = image.partition_image(filename, chunking_strategy="by_title")
|
||||
chunks = chunk_by_title(elements)
|
||||
@ -477,9 +469,8 @@ def test_add_chunking_strategy_on_partition_image(
|
||||
assert chunk_elements == chunks
|
||||
|
||||
|
||||
def test_add_chunking_strategy_on_partition_image_hi_res(
|
||||
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
|
||||
):
|
||||
def test_add_chunking_strategy_on_partition_image_hi_res():
|
||||
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
|
||||
elements = image.partition_image(
|
||||
filename=filename,
|
||||
strategy=PartitionStrategy.HI_RES,
|
||||
@ -615,8 +606,8 @@ def test_partition_image_has_filename(inference_results):
|
||||
def test_partition_image_element_extraction(
|
||||
file_mode,
|
||||
extract_image_block_to_payload,
|
||||
filename=example_doc_path("img/embedded-images-tables.jpg"),
|
||||
):
|
||||
filename = example_doc_path("img/embedded-images-tables.jpg")
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@ -641,9 +632,8 @@ def test_partition_image_element_extraction(
|
||||
)
|
||||
|
||||
|
||||
def test_partition_image_works_on_heic_file(
|
||||
filename=example_doc_path("img/DA-1p.heic"),
|
||||
):
|
||||
def test_partition_image_works_on_heic_file():
|
||||
filename = example_doc_path("img/DA-1p.heic")
|
||||
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
|
||||
titles = [el.text for el in elements if el.category == ElementType.TITLE]
|
||||
assert "CREATURES" in titles
|
||||
|
||||
@ -226,8 +226,9 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
||||
starting_page_number,
|
||||
expected_page_numbers,
|
||||
origin,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"),
|
||||
):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf")
|
||||
|
||||
# Test that the partition_pdf function can handle filename
|
||||
def _test(result):
|
||||
# validate that the result is a non-empty list of dicts
|
||||
@ -270,8 +271,8 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
||||
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
|
||||
def test_partition_pdf_with_model_name_env_var(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
@ -286,8 +287,8 @@ def test_partition_pdf_with_model_name_env_var(
|
||||
def test_partition_pdf_with_model_name(
|
||||
monkeypatch,
|
||||
model_name,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
@ -315,10 +316,8 @@ def test_partition_pdf_with_model_name(
|
||||
assert mock_process.call_args[1]["model_name"] == model_name
|
||||
|
||||
|
||||
def test_partition_pdf_with_hi_res_model_name(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_hi_res_model_name(monkeypatch):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
@ -332,10 +331,8 @@ def test_partition_pdf_with_hi_res_model_name(
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_or_image_with_hi_res_model_name(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_or_image_with_hi_res_model_name(monkeypatch):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||
with mock.patch.object(
|
||||
layout,
|
||||
@ -349,9 +346,8 @@ def test_partition_pdf_or_image_with_hi_res_model_name(
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_with_auto_strategy(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_auto_strategy():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
|
||||
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
|
||||
assert elements[6].text == title
|
||||
@ -359,23 +355,20 @@ def test_partition_pdf_with_auto_strategy(
|
||||
assert elements[6].metadata.file_directory == os.path.dirname(filename)
|
||||
|
||||
|
||||
def test_partition_pdf_with_page_breaks(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_page_breaks():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
|
||||
assert "PageBreak" in [elem.category for elem in elements]
|
||||
|
||||
|
||||
def test_partition_pdf_with_no_page_breaks(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_no_page_breaks():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, url=None)
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_strategy(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_strategy():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
|
||||
)
|
||||
@ -394,9 +387,8 @@ def test_partition_pdf_with_fast_neg_coordinates():
|
||||
assert elements[0].metadata.coordinates.points[1][0] < 0
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_groups_text(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_groups_text():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
|
||||
|
||||
first_narrative_element = None
|
||||
@ -410,18 +402,15 @@ def test_partition_pdf_with_fast_groups_text(
|
||||
assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf"
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_strategy_from_file(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_strategy_from_file():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = pdf.partition_pdf(file=f, url=None, strategy=PartitionStrategy.FAST)
|
||||
assert len(elements) > 10
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_strategy_and_page_breaks(
|
||||
caplog,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_strategy_and_page_breaks(caplog):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename,
|
||||
url=None,
|
||||
@ -436,18 +425,15 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks(
|
||||
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
|
||||
|
||||
|
||||
def test_partition_pdf_raises_with_bad_strategy(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_raises_with_bad_strategy():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
with pytest.raises(ValueError):
|
||||
pdf.partition_pdf(filename=filename, url=None, strategy="made_up")
|
||||
|
||||
|
||||
def test_partition_pdf_falls_back_to_fast(
|
||||
monkeypatch,
|
||||
caplog,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_falls_back_to_fast(monkeypatch, caplog):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
def mock_exists(dep):
|
||||
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
|
||||
|
||||
@ -465,11 +451,9 @@ def test_partition_pdf_falls_back_to_fast(
|
||||
assert "unstructured_inference is not installed" in caplog.text
|
||||
|
||||
|
||||
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
|
||||
monkeypatch,
|
||||
caplog,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_falls_back_to_fast_from_ocr_only(monkeypatch, caplog):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
def mock_exists(dep):
|
||||
return dep not in ["unstructured_pytesseract"]
|
||||
|
||||
@ -491,11 +475,9 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
|
||||
assert "pytesseract is not installed" in caplog.text
|
||||
|
||||
|
||||
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
|
||||
monkeypatch,
|
||||
caplog,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(monkeypatch, caplog):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
def mock_exists(dep):
|
||||
return dep not in ["unstructured_pytesseract"]
|
||||
|
||||
@ -514,11 +496,9 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
|
||||
assert "pytesseract is not installed" in caplog.text
|
||||
|
||||
|
||||
def test_partition_pdf_falls_back_to_ocr_only(
|
||||
monkeypatch,
|
||||
caplog,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_falls_back_to_ocr_only(monkeypatch, caplog):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
def mock_exists(dep):
|
||||
return dep not in ["unstructured_inference"]
|
||||
|
||||
@ -633,7 +613,8 @@ def test_partition_pdf_with_dpi():
|
||||
assert mock_process.call_args[1]["pdf_image_dpi"] == 100
|
||||
|
||||
|
||||
def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("pdf/reliance.pdf")):
|
||||
def test_partition_pdf_requiring_recursive_text_grab():
|
||||
filename = example_doc_path("pdf/reliance.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
|
||||
assert len(elements) > 50
|
||||
assert elements[0].metadata.page_number == 1
|
||||
@ -646,10 +627,9 @@ def test_partition_pdf_text_not_extractable():
|
||||
assert len(elements) == 0
|
||||
|
||||
|
||||
def test_partition_pdf_fails_if_pdf_not_processable(
|
||||
monkeypatch,
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_fails_if_pdf_not_processable(monkeypatch):
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
|
||||
def mock_exists(dep):
|
||||
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
|
||||
|
||||
@ -700,9 +680,8 @@ def test_partition_pdf_fast_groups_text_in_text_box():
|
||||
assert elements[2] == Text("2.5", metadata=expected_elem_metadata_3)
|
||||
|
||||
|
||||
def test_partition_pdf_with_metadata_filename(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_metadata_filename():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename,
|
||||
url=None,
|
||||
@ -713,9 +692,8 @@ def test_partition_pdf_with_metadata_filename(
|
||||
assert element.metadata.filename == "test"
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = pdf.partition_pdf(
|
||||
file=f,
|
||||
@ -793,9 +771,8 @@ def test_partition_pdf_with_json(strategy: str):
|
||||
assert_round_trips_through_JSON(elements)
|
||||
|
||||
|
||||
def test_add_chunking_strategy_by_title_on_partition_pdf(
|
||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
def test_add_chunking_strategy_by_title_on_partition_pdf():
|
||||
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename)
|
||||
chunk_elements = pdf.partition_pdf(filename, chunking_strategy="by_title")
|
||||
chunks = chunk_by_title(elements)
|
||||
@ -920,9 +897,8 @@ def test_partition_pdf_uses_hi_res_model_name():
|
||||
assert mockpartition.call_args.kwargs["hi_res_model_name"]
|
||||
|
||||
|
||||
def test_partition_pdf_word_bbox_not_char(
|
||||
filename=example_doc_path("pdf/interface-config-guide-p93.pdf"),
|
||||
):
|
||||
def test_partition_pdf_word_bbox_not_char():
|
||||
filename = example_doc_path("pdf/interface-config-guide-p93.pdf")
|
||||
try:
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="fast")
|
||||
except Exception as e:
|
||||
@ -930,9 +906,8 @@ def test_partition_pdf_word_bbox_not_char(
|
||||
assert len(elements) == 17
|
||||
|
||||
|
||||
def test_partition_pdf_fast_no_mapping_errors(
|
||||
filename=example_doc_path("pdf/a1977-backus-p21.pdf"),
|
||||
):
|
||||
def test_partition_pdf_fast_no_mapping_errors():
|
||||
filename = example_doc_path("pdf/a1977-backus-p21.pdf")
|
||||
"""Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940,
|
||||
failing to map old parent_id's to new"""
|
||||
pdf.partition_pdf(filename=filename, strategy="fast")
|
||||
@ -1190,9 +1165,8 @@ def test_partition_pdf_with_bad_color_profile():
|
||||
assert pdf.partition_pdf(filename, strategy="fast")
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_finds_headers_footers(
|
||||
filename=example_doc_path("pdf/header-test-doc.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_fast_finds_headers_footers():
|
||||
filename = example_doc_path("pdf/header-test-doc.pdf")
|
||||
elements = pdf.partition_pdf(filename, strategy="fast")
|
||||
assert isinstance(elements[0], Header)
|
||||
assert isinstance(elements[-1], Footer)
|
||||
@ -1266,11 +1240,8 @@ def assert_element_extraction(
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_partition_pdf_element_extraction(
|
||||
file_mode,
|
||||
extract_image_block_to_payload,
|
||||
filename=example_doc_path("pdf/embedded-images-tables.pdf"),
|
||||
):
|
||||
def test_partition_pdf_element_extraction(file_mode, extract_image_block_to_payload):
|
||||
filename = example_doc_path("pdf/embedded-images-tables.pdf")
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@ -1299,9 +1270,8 @@ def test_partition_pdf_element_extraction(
|
||||
)
|
||||
|
||||
|
||||
def test_partition_pdf_always_keep_all_image_elements(
|
||||
filename=example_doc_path("pdf/embedded-images.pdf"),
|
||||
):
|
||||
def test_partition_pdf_always_keep_all_image_elements():
|
||||
filename = example_doc_path("pdf/embedded-images.pdf")
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename,
|
||||
strategy="hi_res",
|
||||
@ -1559,11 +1529,9 @@ def test_document_to_element_list_sets_category_depth_titles():
|
||||
PartitionStrategy.OCR_ONLY,
|
||||
],
|
||||
)
|
||||
def test_partition_pdf_with_password(
|
||||
file_mode,
|
||||
strategy,
|
||||
filename=example_doc_path("pdf/password.pdf"),
|
||||
):
|
||||
def test_partition_pdf_with_password(file_mode, strategy):
|
||||
filename = example_doc_path("pdf/password.pdf")
|
||||
|
||||
# Test that the partition_pdf function can handle filename
|
||||
def _test(result):
|
||||
# validate that the result is a non-empty list of dicts
|
||||
|
||||
@ -35,9 +35,8 @@ def test_write_image(image_type):
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
|
||||
@pytest.mark.parametrize("path_only", [True, False])
|
||||
def test_convert_pdf_to_image(
|
||||
file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf")
|
||||
):
|
||||
def test_convert_pdf_to_image(file_mode, path_only):
|
||||
filename = example_doc_path("pdf/embedded-images.pdf")
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
if file_mode == "filename":
|
||||
images = pdf_image_utils.convert_pdf_to_image(
|
||||
@ -61,7 +60,8 @@ def test_convert_pdf_to_image(
|
||||
assert isinstance(images[0], PILImg.Image)
|
||||
|
||||
|
||||
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
|
||||
def test_convert_pdf_to_image_raises_error():
|
||||
filename = example_doc_path("embedded-images.pdf")
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.18.5-dev0" # pragma: no cover
|
||||
__version__ = "0.18.5-dev1" # pragma: no cover
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user