build: drop remaining Python 3.9 refs (#4049)

Dropped variables that said we support Python 3.9 in `setup.py`, as well
as any remaining references to Python 3.9.

I also checked the pins and removed several that don't seem necessary
any more.
This commit is contained in:
qued 2025-07-10 11:43:15 -05:00 committed by GitHub
parent 92965fb286
commit 7764fb6fd4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 162 additions and 215 deletions

View File

@ -1,7 +1,9 @@
## 0.18.5-dev0
## 0.18.5-dev1
### Enhancements
- **Bump dependencies and remove lingering Python 3.9 artifacts** Cleaned up some references to 3.9 that were left when we dropped Python 3.9 support.
### Features
### Fixes

View File

@ -22,7 +22,7 @@ unstructured/nlp/patterns\.py
[tool.pyright]
pythonPlatform = "Linux"
pythonVersion = "3.9"
pythonVersion = "3.10"
reportUnnecessaryCast = true
reportUnnecessaryTypeIgnoreComment = true
stubPath = "./typings"
@ -31,7 +31,7 @@ verboseOutput = true
[tool.ruff]
line-length = 100
target-version = "py39"
target-version = "py310"
[tool.ruff.lint]
ignore = [

View File

@ -10,7 +10,7 @@ backoff==2.2.1
# via -r ./base.in
beautifulsoup4==4.13.4
# via -r ./base.in
certifi==2025.6.15
certifi==2025.7.9
# via
# httpcore
# httpx
@ -28,7 +28,7 @@ click==8.2.1
# via
# nltk
# python-oxmsg
cryptography==45.0.4
cryptography==45.0.5
# via unstructured-client
dataclasses-json==0.6.7
# via
@ -62,7 +62,7 @@ jsonpath-python==1.0.6
# via unstructured-client
langdetect==1.0.9
# via -r ./base.in
lxml==5.4.0
lxml==6.0.0
# via -r ./base.in
marshmallow==3.26.1
# via
@ -90,7 +90,7 @@ psutil==7.0.0
# via -r ./base.in
pycparser==2.22
# via cffi
pypdf==5.6.0
pypdf==5.7.0
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client
@ -125,7 +125,7 @@ tqdm==4.67.1
# via
# -r ./base.in
# nltk
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -r ./base.in
# anyio
@ -143,7 +143,7 @@ unstructured-client==0.25.9
# via
# -c requirements/deps/constraints.txt
# -r ./base.in
urllib3==2.4.0
urllib3==2.5.0
# via
# -c requirements/deps/constraints.txt
# requests

View File

@ -5,19 +5,10 @@
####################################################################################################
# we are using v3 client https://weaviate.io/developers/weaviate/client-libraries/python/python_v3
weaviate-client>=3.26.7,<4.0.0
# TODO: Constraint due to multiple versions being installed during pip-compile
protobuf>=6.30.0
# TODO: Constraint due to multiple versions being installed during pip-compile
grpcio>=1.65.5
# TODO: Pinned in transformers package, remove when that gets updated (https://github.com/huggingface/transformers/blob/main/setup.py)
tokenizers>=0.21,<0.22
# TODO: Constraint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets
# updated or we drop support for 3.9
# NOTE(alan): Okay to drop pin once this version exists and we verify compatibility.
urllib3<3.0.0
# TODO: Constraint due to aiobotocore, remove when that gets updated:
botocore<1.34.132
# TODO: Constraint due to both 8.5.0 and 8.4.0 being installed during pip-compile
importlib-metadata>=8.5.0
# (austin): Versions below this have a different interface for passing parameters
unstructured-client>=0.23.0,<0.26.0
# paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file

View File

@ -8,7 +8,7 @@ numpy==2.2.6
# via
# -c requirements/base.txt
# pandas
pandas==2.3.0
pandas==2.3.1
# via -r ./extra-csv.in
python-dateutil==2.9.0.post0
# via

View File

@ -4,13 +4,13 @@
#
# pip-compile ./extra-docx.in
#
lxml==5.4.0
lxml==6.0.0
# via
# -c requirements/base.txt
# python-docx
python-docx==1.2.0
# via -r ./extra-docx.in
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# python-docx

View File

@ -4,5 +4,5 @@
#
# pip-compile ./extra-markdown.in
#
markdown==3.8
markdown==3.8.2
# via -r ./extra-markdown.in

View File

@ -4,7 +4,7 @@
#
# pip-compile ./extra-odt.in
#
lxml==5.4.0
lxml==6.0.0
# via
# -c requirements/base.txt
# python-docx
@ -12,7 +12,7 @@ pypandoc==1.15
# via -r ./extra-odt.in
python-docx==1.2.0
# via -r ./extra-odt.in
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# python-docx

View File

@ -16,13 +16,11 @@ anyio==4.9.0
# via
# -c requirements/base.txt
# httpx
astor==0.8.1
# via paddlepaddle
beautifulsoup4==4.13.4
# via
# -c requirements/base.txt
# unstructured-paddleocr
certifi==2025.6.15
certifi==2025.7.9
# via
# -c requirements/base.txt
# httpcore
@ -42,7 +40,7 @@ exceptiongroup==1.3.0
# anyio
fire==0.7.0
# via unstructured-paddleocr
fonttools==4.58.4
fonttools==4.58.5
# via unstructured-paddleocr
h11==0.16.0
# via
@ -66,7 +64,7 @@ imageio==2.37.0
# via scikit-image
lazy-loader==0.4
# via scikit-image
lxml==5.4.0
lxml==6.0.0
# via
# -c requirements/base.txt
# python-docx
@ -90,11 +88,11 @@ numpy==2.2.6
# shapely
# tifffile
# unstructured-paddleocr
opencv-contrib-python==4.11.0.86
opencv-contrib-python==4.12.0.88
# via unstructured-paddleocr
opencv-python==4.11.0.86
opencv-python==4.12.0.88
# via unstructured-paddleocr
opencv-python-headless==4.11.0.86
opencv-python-headless==4.12.0.88
# via
# albucore
# albumentations
@ -105,7 +103,7 @@ packaging==25.0
# -c requirements/base.txt
# lazy-loader
# scikit-image
paddlepaddle==3.0.0
paddlepaddle==3.1.0
# via -r ./extra-paddleocr.in
pillow==11.3.0
# via
@ -145,7 +143,7 @@ scipy==1.15.3
# scikit-image
shapely==2.1.1
# via unstructured-paddleocr
simsimd==6.4.9
simsimd==6.5.0
# via albucore
sniffio==1.3.1
# via
@ -165,7 +163,7 @@ tqdm==4.67.1
# via
# -c requirements/base.txt
# unstructured-paddleocr
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# anyio
@ -180,7 +178,7 @@ typing-inspection==0.4.1
# via pydantic
unstructured-paddleocr==2.10.0
# via -r ./extra-paddleocr.in
urllib3==2.4.0
urllib3==2.5.0
# via
# -c requirements/base.txt
# -c requirements/deps/constraints.txt

View File

@ -4,13 +4,13 @@
#
# pip-compile ./extra-pdf-image.in
#
accelerate==1.7.0
accelerate==1.8.1
# via unstructured-inference
antlr4-python3-runtime==4.9.3
# via omegaconf
cachetools==5.5.2
# via google-auth
certifi==2025.6.15
certifi==2025.7.9
# via
# -c requirements/base.txt
# requests
@ -27,7 +27,7 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.3.2
# via matplotlib
cryptography==45.0.4
cryptography==45.0.5
# via
# -c requirements/base.txt
# pdfminer-six
@ -44,7 +44,7 @@ filelock==3.18.0
# transformers
flatbuffers==25.2.10
# via onnxruntime
fonttools==4.58.4
fonttools==4.58.5
# via matplotlib
fsspec==2025.5.1
# via
@ -62,16 +62,16 @@ googleapis-common-protos==1.70.0
# via
# google-api-core
# grpcio-status
grpcio==1.73.0
grpcio==1.73.1
# via
# -c requirements/deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.73.0
grpcio-status==1.73.1
# via google-api-core
hf-xet==1.1.4
hf-xet==1.1.5
# via huggingface-hub
huggingface-hub==0.33.0
huggingface-hub==0.33.2
# via
# accelerate
# timm
@ -88,7 +88,7 @@ jinja2==3.1.6
# via torch
kiwisolver==1.4.8
# via matplotlib
lxml==5.4.0
lxml==6.0.0
# via
# -c requirements/base.txt
# pikepdf
@ -125,7 +125,7 @@ onnxruntime==1.22.0
# via
# -r ./extra-pdf-image.in
# unstructured-inference
opencv-python==4.11.0.86
opencv-python==4.12.0.88
# via unstructured-inference
packaging==25.0
# via
@ -137,7 +137,7 @@ packaging==25.0
# pikepdf
# transformers
# unstructured-pytesseract
pandas==2.3.0
pandas==2.3.1
# via unstructured-inference
pdf2image==1.17.0
# via -r ./extra-pdf-image.in
@ -146,9 +146,9 @@ pdfminer-six==20250327
# -c requirements/deps/constraints.txt
# -r ./extra-pdf-image.in
# unstructured-inference
pi-heif==0.22.0
pi-heif==1.0.0
# via -r ./extra-pdf-image.in
pikepdf==9.8.1
pikepdf==9.9.0
# via -r ./extra-pdf-image.in
pillow==11.3.0
# via
@ -190,7 +190,7 @@ pycparser==2.22
# cffi
pyparsing==3.2.3
# via matplotlib
pypdf==5.6.0
pypdf==5.7.0
# via
# -c requirements/base.txt
# -r ./extra-pdf-image.in
@ -243,11 +243,11 @@ sympy==1.14.0
# via
# onnxruntime
# torch
timm==1.0.15
timm==1.0.16
# via
# effdet
# unstructured-inference
tokenizers==0.21.1
tokenizers==0.21.2
# via
# -c requirements/deps/constraints.txt
# transformers
@ -267,9 +267,9 @@ tqdm==4.67.1
# -c requirements/base.txt
# huggingface-hub
# transformers
transformers==4.52.4
transformers==4.53.1
# via unstructured-inference
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# huggingface-hub
@ -282,7 +282,7 @@ unstructured-inference==1.0.5
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.15
# via -r ./extra-pdf-image.in
urllib3==2.4.0
urllib3==2.5.0
# via
# -c requirements/base.txt
# -c requirements/deps/constraints.txt

View File

@ -4,13 +4,13 @@
#
# pip-compile ./extra-pptx.in
#
lxml==5.4.0
lxml==6.0.0
# via python-pptx
pillow==11.3.0
# via python-pptx
python-pptx==1.0.2
# via -r ./extra-pptx.in
typing-extensions==4.14.0
typing-extensions==4.14.1
# via python-pptx
xlsxwriter==3.2.3
xlsxwriter==3.2.5
# via python-pptx

View File

@ -14,7 +14,7 @@ numpy==2.2.6
# pandas
openpyxl==3.1.5
# via -r ./extra-xlsx.in
pandas==2.3.0
pandas==2.3.1
# via -r ./extra-xlsx.in
python-dateutil==2.9.0.post0
# via

View File

@ -4,7 +4,7 @@
#
# pip-compile ./huggingface.in
#
certifi==2025.6.15
certifi==2025.7.9
# via
# -c requirements/base.txt
# requests
@ -25,9 +25,9 @@ fsspec==2025.5.1
# via
# huggingface-hub
# torch
hf-xet==1.1.4
hf-xet==1.1.5
# via huggingface-hub
huggingface-hub==0.33.0
huggingface-hub==0.33.2
# via
# tokenizers
# transformers
@ -86,7 +86,7 @@ six==1.17.0
# langdetect
sympy==1.14.0
# via torch
tokenizers==0.21.1
tokenizers==0.21.2
# via
# -c requirements/deps/constraints.txt
# transformers
@ -98,14 +98,14 @@ tqdm==4.67.1
# huggingface-hub
# sacremoses
# transformers
transformers==4.52.4
transformers==4.53.1
# via -r ./huggingface.in
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# huggingface-hub
# torch
urllib3==2.4.0
urllib3==2.5.0
# via
# -c requirements/base.txt
# -c requirements/deps/constraints.txt

View File

@ -14,7 +14,7 @@ click==8.2.1
# via
# -c requirements/base.txt
# black
coverage[toml]==7.9.1
coverage[toml]==7.9.2
# via
# -r ./test.in
# pytest-cov
@ -24,7 +24,7 @@ exceptiongroup==1.3.0
# pytest
execnet==2.1.1
# via pytest-xdist
flake8==7.2.0
flake8==7.3.0
# via
# -r ./test.in
# flake8-print
@ -32,7 +32,7 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.2
# via -r ./test.in
grpcio==1.73.0
grpcio==1.73.1
# via
# -c requirements/deps/constraints.txt
# -r ./test.in
@ -64,7 +64,7 @@ pluggy==1.6.0
# via
# pytest
# pytest-cov
pycodestyle==2.13.0
pycodestyle==2.14.0
# via
# flake8
# flake8-print
@ -72,13 +72,13 @@ pydantic==2.11.7
# via -r ./test.in
pydantic-core==2.33.2
# via pydantic
pyflakes==3.3.2
pyflakes==3.4.0
# via
# autoflake
# flake8
pygments==2.19.1
pygments==2.19.2
# via pytest
pytest==8.4.0
pytest==8.4.1
# via
# pytest-cov
# pytest-mock
@ -87,13 +87,13 @@ pytest-cov==6.2.1
# via -r ./test.in
pytest-mock==3.14.1
# via -r ./test.in
pytest-xdist==3.7.0
pytest-xdist==3.8.0
# via -r ./test.in
python-dateutil==2.9.0.post0
# via
# -c requirements/base.txt
# freezegun
ruff==0.11.13
ruff==0.12.2
# via -r ./test.in
semantic-version==2.10.0
# via liccheck
@ -112,13 +112,13 @@ tomli==2.2.1
# pytest
types-click==7.1.8
# via -r ./test.in
types-markdown==3.8.0.20250415
types-markdown==3.8.0.20250708
# via -r ./test.in
types-requests==2.32.4.20250611
# via -r ./test.in
types-tabulate==0.9.0.20241207
# via -r ./test.in
typing-extensions==4.14.0
typing-extensions==4.14.1
# via
# -c requirements/base.txt
# black
@ -129,7 +129,7 @@ typing-extensions==4.14.0
# typing-inspection
typing-inspection==0.4.1
# via pydantic
urllib3==2.4.0
urllib3==2.5.0
# via
# -c requirements/base.txt
# -c requirements/deps/constraints.txt

View File

@ -1,6 +1,6 @@
#!/usr/bin/env bash
# python version must match lowest supported (3.9)
# python version must match lowest supported (3.10)
major=3
minor=10
if ! python -c "import sys; assert sys.version_info.major == $major and sys.version_info.minor == $minor"; then

View File

@ -82,7 +82,7 @@ setup(
long_description_content_type="text/markdown",
keywords="NLP PDF HTML CV XML parsing preprocessing",
url="https://github.com/Unstructured-IO/unstructured",
python_requires=">=3.9.0",
python_requires=">=3.10.0",
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
@ -91,7 +91,6 @@ setup(
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",

View File

@ -401,9 +401,8 @@ def test_get_page_image_metadata_and_coordinate_system():
assert isinstance(metadata, dict)
def test_ocr_data_to_elements(
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
def test_ocr_data_to_elements():
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
text_regions = [
TextRegion.from_coords(
163.0,

View File

@ -133,9 +133,8 @@ def test_partition_image_local_raises_with_no_filename():
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)
def test_partition_image_with_auto_strategy(
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
def test_partition_image_with_auto_strategy():
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
titles = [
el for el in elements if el.category == ElementType.TITLE and len(el.text.split(" ")) > 10
@ -147,9 +146,8 @@ def test_partition_image_with_auto_strategy(
assert isinstance(elements[idx].metadata.detection_class_prob, float)
def test_partition_image_with_table_extraction(
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
def test_partition_image_with_table_extraction():
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
elements = image.partition_image(
filename=filename,
strategy=PartitionStrategy.HI_RES,
@ -161,17 +159,14 @@ def test_partition_image_with_table_extraction(
assert "</thead><tbody><tr>" in table[0]
def test_partition_image_with_multipage_tiff(
filename=example_doc_path("img/layout-parser-paper-combined.tiff"),
):
def test_partition_image_with_multipage_tiff():
filename = example_doc_path("img/layout-parser-paper-combined.tiff")
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
assert elements[-1].metadata.page_number == 2
def test_partition_image_with_bmp(
tmpdir,
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
def test_partition_image_with_bmp(tmpdir):
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
img = Image.open(filename)
img.save(bmp_filename)
@ -187,7 +182,8 @@ def test_partition_image_with_bmp(
assert "</thead><tbody><tr>" in table[0]
def test_partition_image_with_language_passed(filename=example_doc_path("img/example.jpg")):
def test_partition_image_with_language_passed():
filename = example_doc_path("img/example.jpg")
with mock.patch.object(
ocr,
"process_file_with_ocr",
@ -202,9 +198,8 @@ def test_partition_image_with_language_passed(filename=example_doc_path("img/exa
assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
def test_partition_image_from_file_with_language_passed(
filename=example_doc_path("img/example.jpg"),
):
def test_partition_image_from_file_with_language_passed():
filename = example_doc_path("img/example.jpg")
with mock.patch.object(
ocr,
"process_data_with_ocr",
@ -217,9 +212,8 @@ def test_partition_image_from_file_with_language_passed(
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Current catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
filename=example_doc_path("img/example.jpg"),
):
def test_partition_image_raises_with_invalid_language():
filename = example_doc_path("img/example.jpg")
with pytest.raises(TesseractError):
image.partition_image(
filename=filename,
@ -414,9 +408,8 @@ def test_partition_msg_with_json():
assert_round_trips_through_JSON(elements)
def test_partition_image_with_ocr_has_coordinates_from_filename(
filename=example_doc_path("img/english-and-korean.png"),
):
def test_partition_image_with_ocr_has_coordinates_from_filename():
filename = example_doc_path("img/english-and-korean.png")
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points]
assert int_coordinates == [(14, 16), (14, 37), (381, 37), (381, 16)]
@ -467,9 +460,8 @@ def test_partition_image_warns_with_ocr_languages(caplog):
assert "The ocr_languages kwarg will be deprecated" in caplog.text
def test_add_chunking_strategy_on_partition_image(
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
):
def test_add_chunking_strategy_on_partition_image():
filename = example_doc_path("img/layout-parser-paper-fast.jpg")
elements = image.partition_image(filename=filename)
chunk_elements = image.partition_image(filename, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
@ -477,9 +469,8 @@ def test_add_chunking_strategy_on_partition_image(
assert chunk_elements == chunks
def test_add_chunking_strategy_on_partition_image_hi_res(
filename=example_doc_path("img/layout-parser-paper-with-table.jpg"),
):
def test_add_chunking_strategy_on_partition_image_hi_res():
filename = example_doc_path("img/layout-parser-paper-with-table.jpg")
elements = image.partition_image(
filename=filename,
strategy=PartitionStrategy.HI_RES,
@ -615,8 +606,8 @@ def test_partition_image_has_filename(inference_results):
def test_partition_image_element_extraction(
file_mode,
extract_image_block_to_payload,
filename=example_doc_path("img/embedded-images-tables.jpg"),
):
filename = example_doc_path("img/embedded-images-tables.jpg")
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
@ -641,9 +632,8 @@ def test_partition_image_element_extraction(
)
def test_partition_image_works_on_heic_file(
filename=example_doc_path("img/DA-1p.heic"),
):
def test_partition_image_works_on_heic_file():
filename = example_doc_path("img/DA-1p.heic")
elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
titles = [el.text for el in elements if el.category == ElementType.TITLE]
assert "CREATURES" in titles

View File

@ -226,8 +226,9 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
starting_page_number,
expected_page_numbers,
origin,
filename=example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf"),
):
filename = example_doc_path("pdf/layout-parser-paper-with-empty-pages.pdf")
# Test that the partition_pdf function can handle filename
def _test(result):
# validate that the result is a non-empty list of dicts
@ -270,8 +271,8 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name_env_var(
monkeypatch,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
layout,
@ -286,8 +287,8 @@ def test_partition_pdf_with_model_name_env_var(
def test_partition_pdf_with_model_name(
monkeypatch,
model_name,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
layout,
@ -315,10 +316,8 @@ def test_partition_pdf_with_model_name(
assert mock_process.call_args[1]["model_name"] == model_name
def test_partition_pdf_with_hi_res_model_name(
monkeypatch,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_hi_res_model_name(monkeypatch):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
layout,
@ -332,10 +331,8 @@ def test_partition_pdf_with_hi_res_model_name(
assert mock_process.call_args[1]["model_name"] == "checkbox"
def test_partition_pdf_or_image_with_hi_res_model_name(
monkeypatch,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_or_image_with_hi_res_model_name(monkeypatch):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
with mock.patch.object(
layout,
@ -349,9 +346,8 @@ def test_partition_pdf_or_image_with_hi_res_model_name(
assert mock_process.call_args[1]["model_name"] == "checkbox"
def test_partition_pdf_with_auto_strategy(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_auto_strategy():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[6].text == title
@ -359,23 +355,20 @@ def test_partition_pdf_with_auto_strategy(
assert elements[6].metadata.file_directory == os.path.dirname(filename)
def test_partition_pdf_with_page_breaks(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_page_breaks():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
assert "PageBreak" in [elem.category for elem in elements]
def test_partition_pdf_with_no_page_breaks(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_no_page_breaks():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(filename=filename, url=None)
assert "PageBreak" not in [elem.category for elem in elements]
def test_partition_pdf_with_fast_strategy(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_fast_strategy():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
)
@ -394,9 +387,8 @@ def test_partition_pdf_with_fast_neg_coordinates():
assert elements[0].metadata.coordinates.points[1][0] < 0
def test_partition_pdf_with_fast_groups_text(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_fast_groups_text():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
first_narrative_element = None
@ -410,18 +402,15 @@ def test_partition_pdf_with_fast_groups_text(
assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf"
def test_partition_pdf_with_fast_strategy_from_file(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_fast_strategy_from_file():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
with open(filename, "rb") as f:
elements = pdf.partition_pdf(file=f, url=None, strategy=PartitionStrategy.FAST)
assert len(elements) > 10
def test_partition_pdf_with_fast_strategy_and_page_breaks(
caplog,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_fast_strategy_and_page_breaks(caplog):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(
filename=filename,
url=None,
@ -436,18 +425,15 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks(
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
def test_partition_pdf_raises_with_bad_strategy(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_raises_with_bad_strategy():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
with pytest.raises(ValueError):
pdf.partition_pdf(filename=filename, url=None, strategy="made_up")
def test_partition_pdf_falls_back_to_fast(
monkeypatch,
caplog,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_falls_back_to_fast(monkeypatch, caplog):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
def mock_exists(dep):
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
@ -465,11 +451,9 @@ def test_partition_pdf_falls_back_to_fast(
assert "unstructured_inference is not installed" in caplog.text
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
monkeypatch,
caplog,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_falls_back_to_fast_from_ocr_only(monkeypatch, caplog):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
def mock_exists(dep):
return dep not in ["unstructured_pytesseract"]
@ -491,11 +475,9 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
monkeypatch,
caplog,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(monkeypatch, caplog):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
def mock_exists(dep):
return dep not in ["unstructured_pytesseract"]
@ -514,11 +496,9 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
assert "pytesseract is not installed" in caplog.text
def test_partition_pdf_falls_back_to_ocr_only(
monkeypatch,
caplog,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_falls_back_to_ocr_only(monkeypatch, caplog):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
def mock_exists(dep):
return dep not in ["unstructured_inference"]
@ -633,7 +613,8 @@ def test_partition_pdf_with_dpi():
assert mock_process.call_args[1]["pdf_image_dpi"] == 100
def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("pdf/reliance.pdf")):
def test_partition_pdf_requiring_recursive_text_grab():
filename = example_doc_path("pdf/reliance.pdf")
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
assert len(elements) > 50
assert elements[0].metadata.page_number == 1
@ -646,10 +627,9 @@ def test_partition_pdf_text_not_extractable():
assert len(elements) == 0
def test_partition_pdf_fails_if_pdf_not_processable(
monkeypatch,
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_fails_if_pdf_not_processable(monkeypatch):
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
def mock_exists(dep):
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
@ -700,9 +680,8 @@ def test_partition_pdf_fast_groups_text_in_text_box():
assert elements[2] == Text("2.5", metadata=expected_elem_metadata_3)
def test_partition_pdf_with_metadata_filename(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_metadata_filename():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(
filename=filename,
url=None,
@ -713,9 +692,8 @@ def test_partition_pdf_with_metadata_filename(
assert element.metadata.filename == "test"
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
with open(filename, "rb") as f:
elements = pdf.partition_pdf(
file=f,
@ -793,9 +771,8 @@ def test_partition_pdf_with_json(strategy: str):
assert_round_trips_through_JSON(elements)
def test_add_chunking_strategy_by_title_on_partition_pdf(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
):
def test_add_chunking_strategy_by_title_on_partition_pdf():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
elements = pdf.partition_pdf(filename=filename)
chunk_elements = pdf.partition_pdf(filename, chunking_strategy="by_title")
chunks = chunk_by_title(elements)
@ -920,9 +897,8 @@ def test_partition_pdf_uses_hi_res_model_name():
assert mockpartition.call_args.kwargs["hi_res_model_name"]
def test_partition_pdf_word_bbox_not_char(
filename=example_doc_path("pdf/interface-config-guide-p93.pdf"),
):
def test_partition_pdf_word_bbox_not_char():
filename = example_doc_path("pdf/interface-config-guide-p93.pdf")
try:
elements = pdf.partition_pdf(filename=filename, strategy="fast")
except Exception as e:
@ -930,9 +906,8 @@ def test_partition_pdf_word_bbox_not_char(
assert len(elements) == 17
def test_partition_pdf_fast_no_mapping_errors(
filename=example_doc_path("pdf/a1977-backus-p21.pdf"),
):
def test_partition_pdf_fast_no_mapping_errors():
filename = example_doc_path("pdf/a1977-backus-p21.pdf")
"""Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940,
failing to map old parent_id's to new"""
pdf.partition_pdf(filename=filename, strategy="fast")
@ -1190,9 +1165,8 @@ def test_partition_pdf_with_bad_color_profile():
assert pdf.partition_pdf(filename, strategy="fast")
def test_partition_pdf_with_fast_finds_headers_footers(
filename=example_doc_path("pdf/header-test-doc.pdf"),
):
def test_partition_pdf_with_fast_finds_headers_footers():
filename = example_doc_path("pdf/header-test-doc.pdf")
elements = pdf.partition_pdf(filename, strategy="fast")
assert isinstance(elements[0], Header)
assert isinstance(elements[-1], Footer)
@ -1266,11 +1240,8 @@ def assert_element_extraction(
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_partition_pdf_element_extraction(
file_mode,
extract_image_block_to_payload,
filename=example_doc_path("pdf/embedded-images-tables.pdf"),
):
def test_partition_pdf_element_extraction(file_mode, extract_image_block_to_payload):
filename = example_doc_path("pdf/embedded-images-tables.pdf")
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
@ -1299,9 +1270,8 @@ def test_partition_pdf_element_extraction(
)
def test_partition_pdf_always_keep_all_image_elements(
filename=example_doc_path("pdf/embedded-images.pdf"),
):
def test_partition_pdf_always_keep_all_image_elements():
filename = example_doc_path("pdf/embedded-images.pdf")
elements = pdf.partition_pdf(
filename=filename,
strategy="hi_res",
@ -1559,11 +1529,9 @@ def test_document_to_element_list_sets_category_depth_titles():
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_pdf_with_password(
file_mode,
strategy,
filename=example_doc_path("pdf/password.pdf"),
):
def test_partition_pdf_with_password(file_mode, strategy):
filename = example_doc_path("pdf/password.pdf")
# Test that the partition_pdf function can handle filename
def _test(result):
# validate that the result is a non-empty list of dicts

View File

@ -35,9 +35,8 @@ def test_write_image(image_type):
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf")
):
def test_convert_pdf_to_image(file_mode, path_only):
filename = example_doc_path("pdf/embedded-images.pdf")
with tempfile.TemporaryDirectory() as tmpdir:
if file_mode == "filename":
images = pdf_image_utils.convert_pdf_to_image(
@ -61,7 +60,8 @@ def test_convert_pdf_to_image(
assert isinstance(images[0], PILImg.Image)
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
def test_convert_pdf_to_image_raises_error():
filename = example_doc_path("embedded-images.pdf")
with pytest.raises(ValueError) as exc_info:
pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

View File

@ -1 +1 @@
__version__ = "0.18.5-dev0" # pragma: no cover
__version__ = "0.18.5-dev1" # pragma: no cover