bump unstructured-inference (#3711)

This PR bumps `unstructured-inference` to `0.8.0`, which introduces
vectorized data structures for layout elements and text regions.
This PR also cleans up a few places in CI that had repeated definitions
of env variables or were missing installation of testing dependencies in
the cache.
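
As context for the review, here is a minimal sketch of the new vectorized
interface, using only the `0.8.0` surface exercised in this diff
(`TextRegions.from_list`, the `x1`/`y1`/`x2`/`y2` array attributes, and
`texts`); the coordinates are made up for illustration:

```python
from unstructured_inference.inference.elements import TextRegion, TextRegions

# Build individual regions, then move them into the vectorized container
# (hypothetical coordinates, for illustration only).
regions = TextRegions.from_list(
    [
        TextRegion.from_coords(10.0, 10.0, 50.0, 20.0, "LayoutParser:"),
        TextRegion.from_coords(55.0, 10.0, 120.0, 20.0, "A Unified Toolkit"),
    ]
)

# Coordinates are exposed as arrays, so a bounding box over the whole group
# is a few vectorized min/max calls instead of a Python loop.
group_bbox = (
    regions.x1.min().astype(float),
    regions.y1.min().astype(float),
    regions.x2.max().astype(float),
    regions.y2.max().astype(float),
)
group_text = " ".join(text for text in regions.texts if text)
```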

A few document ingest results changed:
- two places for `biomed-api` (actually processed locally on the runner)
changed due to very small differences in the numerical results of the
bounding box areas: one results in a duplicated page number/header, and
the other in deduplication of a word in a sentence that starts on a new
line (yes, the two cases go in opposite directions)
- the layout parser paper now outputs the code lines with page numbers
inside the code box as list items

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>


@@ -30,7 +30,9 @@ runs:
shell: bash
run: |
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
if [ ! -d ".venv" ]; then
python${{ inputs.python-version }} -m venv .venv
fi
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
@@ -38,6 +40,7 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
- name: Save Cache
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
id: virtualenv-cache-save


@@ -18,7 +18,7 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
lookup-only: ${{ inputs.check-only }}
- name: Set up Python ${{ inputs.python-version }}
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -39,6 +39,8 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
make install-all-docs
make install-ingest
- name: Save Ingest Cache
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -48,5 +50,5 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}


@@ -12,14 +12,15 @@ permissions:
id-token: write
contents: read
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
jobs:
setup:
strategy:
matrix:
python-version: ["3.9","3.10","3.11", "3.12"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/base-cache
@@ -78,8 +79,6 @@ jobs:
strategy:
matrix:
python-version: ["3.9","3.10","3.11"]
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
runs-on: ubuntu-latest
needs: [setup, changelog]
steps:
@@ -185,8 +184,6 @@ jobs:
python-version: ["3.10"]
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint, test_unit_no_extras]
steps:
- uses: actions/checkout@v4
@@ -220,6 +217,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make install-${{ matrix.extra }}
make test-extra-${{ matrix.extra }} CI=true
setup_ingest:
@@ -227,8 +225,6 @@ jobs:
matrix:
python-version: [ "3.9","3.10" ]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v4
@@ -307,7 +303,6 @@ jobs:
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true"
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
@@ -320,6 +315,8 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
make install-all-docs
make install-ingest
./test_unstructured_ingest/test-ingest-src.sh
@@ -329,8 +326,6 @@ jobs:
# NOTE(yuming): Unstructured API only uses Python 3.10
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4


@@ -1,7 +1,8 @@
## 0.16.1-dev5
## 0.16.1-dev6
### Enhancements
* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce nondeterministic behavior from machine-precision noise that affects which bounding boxes are combined.
### Features
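
The changelog entry above only names the technique; the idea, sketched with a
hypothetical helper (the real logic lives in `pdfminer_processing.py`;
`rounded_overlap_area` and `decimals=15`, roughly float64 precision, are
assumptions for illustration):

```python
import numpy as np

def rounded_overlap_area(
    boxes_a: np.ndarray, boxes_b: np.ndarray, decimals: int = 15
) -> np.ndarray:
    """boxes_*: (N, 4) / (M, 4) arrays of [x1, y1, x2, y2]; returns (N, M) areas."""
    # Snap coordinates to a fixed number of decimals before computing pairwise
    # overlaps, so boxes whose coordinates differ only by float noise produce
    # identical intersection areas (and the same combine decisions) run to run.
    a = np.round(boxes_a, decimals)[:, None, :]  # (N, 1, 4)
    b = np.round(boxes_b, decimals)[None, :, :]  # (1, M, 4)
    inter_w = np.clip(np.minimum(a[..., 2], b[..., 2]) - np.maximum(a[..., 0], b[..., 0]), 0, None)
    inter_h = np.clip(np.minimum(a[..., 3], b[..., 3]) - np.maximum(a[..., 1], b[..., 1]), 0, None)
    return inter_w * inter_h
```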


@@ -4,7 +4,7 @@
#
# pip-compile ./base.in
#
anyio==4.6.0
anyio==4.6.2.post1
# via httpx
backoff==2.2.1
# via -r ./base.in
@@ -20,7 +20,7 @@ cffi==1.17.1
# via cryptography
chardet==5.2.0
# via -r ./base.in
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# requests
# unstructured-client
@@ -28,7 +28,7 @@ click==8.1.7
# via
# nltk
# python-oxmsg
cryptography==43.0.1
cryptography==43.0.3
# via unstructured-client
dataclasses-json==0.6.7
# via
@@ -62,7 +62,7 @@ langdetect==1.0.9
# via -r ./base.in
lxml==5.3.0
# via -r ./base.in
marshmallow==3.22.0
marshmallow==3.23.0
# via
# dataclasses-json
# unstructured-client
@@ -84,7 +84,7 @@ packaging==24.1
# via
# marshmallow
# unstructured-client
psutil==6.0.0
psutil==6.1.0
# via -r ./base.in
pycparser==2.22
# via cffi


@@ -4,7 +4,7 @@
#
# pip-compile ./dev.in
#
build==1.2.2
build==1.2.2.post1
# via pip-tools
cfgv==3.4.0
# via pre-commit
@@ -13,7 +13,7 @@ click==8.1.7
# -c ./base.txt
# -c ./test.txt
# pip-tools
distlib==0.3.8
distlib==0.3.9
# via virtualenv
filelock==3.16.1
# via virtualenv
@@ -36,7 +36,7 @@ platformdirs==4.3.6
# via
# -c ./test.txt
# virtualenv
pre-commit==3.8.0
pre-commit==4.0.1
# via -r ./dev.in
pyproject-hooks==1.2.0
# via
@@ -51,7 +51,7 @@ tomli==2.0.2
# -c ./test.txt
# build
# pip-tools
virtualenv==20.26.6
virtualenv==20.27.0
# via pre-commit
wheel==0.44.0
# via pip-tools


@@ -4,5 +4,5 @@
#
# pip-compile ./extra-epub.in
#
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-epub.in


@@ -8,7 +8,7 @@ lxml==5.3.0
# via
# -c ./base.txt
# python-docx
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-odt.in
python-docx==1.1.2
# via -r ./extra-odt.in


@@ -4,7 +4,7 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==4.6.0
anyio==4.6.2.post1
# via
# -c ./base.txt
# httpx
@@ -16,7 +16,7 @@ certifi==2024.8.30
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -52,7 +52,7 @@ idna==3.10
# anyio
# httpx
# requests
imageio==2.35.1
imageio==2.36.0
# via
# imgaug
# scikit-image
@@ -104,7 +104,7 @@ paddlepaddle==3.0.0b1
# via -r ./extra-paddleocr.in
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==10.4.0
pillow==11.0.0
# via
# imageio
# imgaug
@@ -117,9 +117,9 @@ protobuf==4.25.5
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post5
pyclipper==1.3.0.post6
# via unstructured-paddleocr
pyparsing==3.1.4
pyparsing==3.2.0
# via matplotlib
python-dateutil==2.9.0.post0
# via


@@ -4,5 +4,5 @@
#
# pip-compile ./extra-pandoc.in
#
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-pandoc.in


@@ -11,5 +11,5 @@ google-cloud-vision
effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.36
unstructured-inference==0.8.0
unstructured.pytesseract>=0.3.12


@@ -16,7 +16,7 @@ cffi==1.17.1
# via
# -c ./base.txt
# cryptography
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# pdfminer-six
@@ -25,7 +25,7 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.3.0
# via matplotlib
cryptography==43.0.1
cryptography==43.0.3
# via
# -c ./base.txt
# pdfminer-six
@@ -48,7 +48,7 @@ fsspec==2024.9.0
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.20.0
google-api-core[grpc]==2.21.0
# via google-cloud-vision
google-auth==2.35.0
# via
@@ -60,14 +60,14 @@ googleapis-common-protos==1.65.0
# via
# google-api-core
# grpcio-status
grpcio==1.66.2
grpcio==1.67.0
# via
# -c ././deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.62.3
# via google-api-core
huggingface-hub==0.25.1
huggingface-hub==0.26.0
# via
# timm
# tokenizers
@@ -93,7 +93,7 @@ lxml==5.3.0
# via
# -c ./base.txt
# pikepdf
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
# via
@@ -117,6 +117,7 @@ numpy==1.26.4
# scipy
# torchvision
# transformers
# unstructured-inference
omegaconf==2.3.0
# via effdet
onnx==1.17.0
@@ -150,11 +151,11 @@ pdfminer-six==20231228
# pdfplumber
pdfplumber==0.11.4
# via layoutparser
pi-heif==0.18.0
pi-heif==0.20.0
# via -r ./extra-pdf-image.in
pikepdf==9.3.0
# via -r ./extra-pdf-image.in
pillow==10.4.0
pillow==11.0.0
# via
# layoutparser
# matplotlib
@@ -192,7 +193,7 @@ pycparser==2.22
# via
# -c ./base.txt
# cffi
pyparsing==3.1.4
pyparsing==3.2.0
# via matplotlib
pypdf==5.0.1
# via
@@ -242,11 +243,11 @@ six==1.16.0
# via
# -c ./base.txt
# python-dateutil
sympy==1.13.3
sympy==1.13.1
# via
# onnxruntime
# torch
timm==1.0.9
timm==1.0.11
# via
# effdet
# unstructured-inference
@@ -254,13 +255,13 @@ tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.4.1
torch==2.5.0
# via
# effdet
# timm
# torchvision
# unstructured-inference
torchvision==0.19.1
torchvision==0.20.0
# via
# effdet
# timm
@@ -281,7 +282,7 @@ typing-extensions==4.12.2
# torch
tzdata==2024.2
# via pandas
unstructured-inference==0.7.36
unstructured-inference==0.8.0
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in


@@ -6,7 +6,7 @@
#
lxml==5.3.0
# via python-pptx
pillow==10.4.0
pillow==11.0.0
# via python-pptx
python-pptx==1.0.2
# via -r ./extra-pptx.in


@@ -8,7 +8,7 @@ certifi==2024.8.30
# via
# -c ./base.txt
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -25,7 +25,7 @@ fsspec==2024.9.0
# via
# huggingface-hub
# torch
huggingface-hub==0.25.1
huggingface-hub==0.26.0
# via
# tokenizers
# transformers
@@ -43,7 +43,7 @@ langdetect==1.0.9
# via
# -c ./base.txt
# -r ./huggingface.in
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
@@ -82,13 +82,13 @@ six==1.16.0
# via
# -c ./base.txt
# langdetect
sympy==1.13.3
sympy==1.13.1
# via torch
tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.4.1
torch==2.5.0
# via -r ./huggingface.in
tqdm==4.66.5
# via


@@ -6,7 +6,7 @@
#
annotated-types==0.7.0
# via pydantic
anyio==4.6.0
anyio==4.6.2.post1
# via
# -c ./base.txt
# httpx
@@ -16,7 +16,7 @@ attrs==24.2.0
# via jsonschema
autoflake==2.3.1
# via -r ./test.in
black==24.8.0
black==24.10.0
# via -r ./test.in
certifi==2024.8.30
# via
@@ -24,7 +24,7 @@ certifi==2024.8.30
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -33,7 +33,7 @@ click==8.1.7
# -c ./base.txt
# black
# nltk
coverage[toml]==7.6.1
coverage[toml]==7.6.4
# via
# -r ./test.in
# pytest-cov
@@ -50,7 +50,7 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.1
# via -r ./test.in
grpcio==1.66.2
grpcio==1.67.0
# via
# -c ././deps/constraints.txt
# -r ./test.in
@@ -95,7 +95,7 @@ mccabe==0.7.0
# via flake8
multidict==6.1.0
# via yarl
mypy==1.11.2
mypy==1.12.1
# via -r ./test.in
mypy-extensions==1.0.0
# via
@@ -119,12 +119,14 @@ pandas==2.2.3
# via label-studio-sdk
pathspec==0.12.1
# via black
pillow==10.4.0
pillow==11.0.0
# via label-studio-sdk
platformdirs==4.3.6
# via black
pluggy==1.5.0
# via pytest
propcache==0.2.0
# via yarl
pycodestyle==2.12.1
# via
# flake8
@@ -226,7 +228,7 @@ urllib3==1.26.20
# -c ./base.txt
# requests
# vcrpy
vcrpy==6.0.1
vcrpy==6.0.2
# via -r ./test.in
wrapt==1.16.0
# via
@@ -234,7 +236,7 @@ wrapt==1.16.0
# vcrpy
xmljson==0.2.1
# via label-studio-sdk
yarl==1.13.1
yarl==1.15.5
# via vcrpy
# The following packages are considered to be unsafe in a requirements file:


@@ -1,4 +1,4 @@
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.elements import ElementType
@@ -17,7 +17,7 @@ def test_merge_text_regions(mock_embedded_text_regions):
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
)
merged_text_region = merge_text_regions(mock_embedded_text_regions)
merged_text_region = merge_text_regions(TextRegions.from_list(mock_embedded_text_regions))
assert merged_text_region == expected


@@ -179,6 +179,12 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
print(
[
(element.metadata.detection_origin, element.category, element.text)
for element in result
]
)
assert {element.metadata.detection_origin for element in result} == origin
if file_mode == "filename":


@@ -19,8 +19,8 @@ from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# NOTE(crag): point to freemium API for now
API_URL = "https://api.unstructured.io/general/v0/general"
# NOTE(yao): point to paid API for now
API_URL = "https://api.unstructuredapp.io/general/v0/general"
is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"}
skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main"


@@ -338,7 +338,20 @@
"type": "ListItem"
},
{
"element_id": "6277cd91869e10d6256f362b08d3e789",
"element_id": "f0f0586caeb3af4284c1b367a5269d27",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "452",
"type": "Header"
},
{
"element_id": "ac79570be092923eb29899f64281c3b3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -351,7 +364,7 @@
"type": "Table"
},
{
"element_id": "22b8448fe36b3ccd06d1d8e4ea2dc1ea",
"element_id": "13fd694e1ff862d163b840a246964e58",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -364,7 +377,7 @@
"type": "Title"
},
{
"element_id": "f2b57562924402b85f6eb07925ea1654",
"element_id": "5f1c4074c1b5d641b724b99be6f5ddfd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -377,7 +390,7 @@
"type": "NarrativeText"
},
{
"element_id": "d9f6efffd49ef59e671206bfb5f094de",
"element_id": "afed004de4c50d761640b6c18729a988",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -390,7 +403,7 @@
"type": "ListItem"
},
{
"element_id": "2a1e46bc589c5eca777b657e141e824b",
"element_id": "f93d89ccb971e2b60f44afbf710673c6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -403,7 +416,7 @@
"type": "NarrativeText"
},
{
"element_id": "2c42182c07ecdb96362b534a8fad4d59",
"element_id": "cb6e8acb9c24820b59f8973cc236ef35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -416,7 +429,7 @@
"type": "ListItem"
},
{
"element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f",
"element_id": "5964ede27be8850de7a13e0dd32c1b21",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -429,7 +442,7 @@
"type": "NarrativeText"
},
{
"element_id": "07cdb1623f501ea23a343039300178cc",
"element_id": "e1f7e635d8739a97d8d0000ba8004f61",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -442,7 +455,7 @@
"type": "ListItem"
},
{
"element_id": "4bf8165bcb21c5296b741ba0f9e38f93",
"element_id": "deb8964830ba1f9dd1eec7b08bd3ea19",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -455,7 +468,7 @@
"type": "Title"
},
{
"element_id": "85918ce2a03e9f236137a0fe72985af0",
"element_id": "be270e13c935334fa3b17b13066d639b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -468,7 +481,7 @@
"type": "NarrativeText"
},
{
"element_id": "93537983496efa695cfc65ad895d9412",
"element_id": "5c97405ec921495b23d2b400516cbd06",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -481,7 +494,7 @@
"type": "Image"
},
{
"element_id": "76b94e78b638b79374e266284c1a0d83",
"element_id": "7956ee39ac5e080a362967e2f6a5753e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",


@@ -598,20 +598,7 @@
"type": "NarrativeText"
},
{
"element_id": "448de3300a8c7e2cfdd2028dd0bb4171",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "and",
"type": "NarrativeText"
},
{
"element_id": "b13807f59ac7c6647ee0aee74f9b0dd3",
"element_id": "db6ff60cbdb77adc14a6b9491af8d161",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -624,7 +611,7 @@
"type": "ListItem"
},
{
"element_id": "db480e847a5703b19be6b79223e1ee03",
"element_id": "9f6ef223a141a5381951eff39b3af039",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -637,7 +624,7 @@
"type": "NarrativeText"
},
{
"element_id": "326c44638a881f86474b82cc244896f9",
"element_id": "5c67842128e14fc16344beaa2aa0111e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",


@@ -1276,9 +1276,75 @@
}
}
},
{
"type": "ListItem",
"element_id": "53b448c75f1556b1f60b4e3324bd0724",
"text": "1 import layoutparser as lp",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "ListItem",
"element_id": "a002e13c7ea2613b2eabb9ea3501856d",
"text": "3 model = lp . De t e c tro n2 Lay outM odel (",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "ListItem",
"element_id": "366c05fd7babc86bf01d690b9df755da",
"text": "5 layout = model . detect ( image )",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "NarrativeText",
"element_id": "59171bb0b4a32c9ec1b0e1d327ddb88f",
"element_id": "f888c5e8f5b1339f2af75612ea13c719",
"text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp://<dataset-name>/<model-architecture-name>.",
"metadata": {
"filetype": "application/pdf",


@@ -1 +1 @@
__version__ = "0.16.1-dev5" # pragma: no cover
__version__ = "0.16.1-dev6" # pragma: no cover


@@ -3,7 +3,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Optional
from unstructured_inference.constants import Source
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import (
LayoutElement,
partition_groups_from_regions,
@@ -66,9 +66,9 @@ def build_layout_elements_from_ocr_regions(
for r in regions:
ocr_regions.remove(r)
grouped_regions.append(regions)
grouped_regions.append(TextRegions.from_list(regions))
else:
grouped_regions = partition_groups_from_regions(ocr_regions)
grouped_regions = partition_groups_from_regions(TextRegions.from_list(ocr_regions))
merged_regions = [merge_text_regions(group) for group in grouped_regions]
return [
@@ -79,12 +79,12 @@
]
def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
def merge_text_regions(regions: TextRegions) -> TextRegion:
"""
Merge a list of TextRegion objects into a single TextRegion.
Parameters:
- group (list[TextRegion]): A list of TextRegion objects to be merged.
- group (TextRegions): A group of TextRegion objects to be merged.
Returns:
- TextRegion: A single merged TextRegion object.
@@ -93,13 +93,12 @@ def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
if not regions:
raise ValueError("The text regions to be merged must be provided.")
min_x1 = min([tr.bbox.x1 for tr in regions])
min_y1 = min([tr.bbox.y1 for tr in regions])
max_x2 = max([tr.bbox.x2 for tr in regions])
max_y2 = max([tr.bbox.y2 for tr in regions])
min_x1 = regions.x1.min().astype(float)
min_y1 = regions.y1.min().astype(float)
max_x2 = regions.x2.max().astype(float)
max_y2 = regions.y2.max().astype(float)
merged_text = " ".join([tr.text for tr in regions if tr.text])
sources = [tr.source for tr in regions]
source = sources[0] if all(s == sources[0] for s in sources) else None
merged_text = " ".join([text for text in regions.texts if text])
source = regions.source
return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, merged_text, source)
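
To make the refactor concrete, a minimal usage sketch of the reworked helper
(coordinates are invented; the `unstructured.partition.pdf_image.inference_utils`
import path is an assumption inferred from the hunks above):

```python
from unstructured_inference.inference.elements import TextRegion, TextRegions

# Assumed module path for the helper changed in this PR.
from unstructured.partition.pdf_image.inference_utils import merge_text_regions

# Two word-level regions on the same line (hypothetical coordinates).
words = TextRegions.from_list(
    [
        TextRegion.from_coords(0.0, 0.0, 40.0, 10.0, "Deep"),
        TextRegion.from_coords(45.0, 0.0, 95.0, 10.0, "Learning"),
    ]
)

# The helper now takes the vectorized container and returns a single TextRegion
# whose bbox is the elementwise min/max over the group.
merged = merge_text_regions(words)
assert merged.text == "Deep Learning"
assert (merged.bbox.x1, merged.bbox.y2) == (0.0, 10.0)
```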