bump unstructured-inference (#3711)

This PR bumps `unstructured-inference` to `0.8.0`, which introduces
vectorized data structures for layout elements and text regions.
This PR also cleans up a few places in CI that had repeated definitions
of env variables or were missing installation of testing dependencies in
the cache.
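
As context for the review, here is a minimal sketch of the new vectorized
interface, using only the `0.8.0` surface exercised in this diff
(`TextRegions.from_list`, the `x1`/`y1`/`x2`/`y2` array attributes, and
`texts`); the coordinates are made up for illustration:

```python
from unstructured_inference.inference.elements import TextRegion, TextRegions

# Build individual regions, then move them into the vectorized container
# (hypothetical coordinates, for illustration only).
regions = TextRegions.from_list(
    [
        TextRegion.from_coords(10.0, 10.0, 50.0, 20.0, "LayoutParser:"),
        TextRegion.from_coords(55.0, 10.0, 120.0, 20.0, "A Unified Toolkit"),
    ]
)

# Coordinates are exposed as arrays, so a bounding box over the whole group
# is a few vectorized min/max calls instead of a Python loop.
group_bbox = (
    regions.x1.min().astype(float),
    regions.y1.min().astype(float),
    regions.x2.max().astype(float),
    regions.y2.max().astype(float),
)
group_text = " ".join(text for text in regions.texts if text)
```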

A few document ingest results changed:
- two places for `biomed-api` (actually processed locally on the runner)
changed due to very small differences in the numerical results of the
bounding box areas: one results in a duplicated page number/header, and
the other in deduplication of a word in a sentence that starts on a new
line (yes, the two cases go in opposite directions)
- the layout parser paper now outputs the code lines with page numbers
inside the code box as list items

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>


@@ -30,7 +30,9 @@ runs:
shell: bash
run: |
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
if [ ! -d ".venv" ]; then
python${{ inputs.python-version }} -m venv .venv
fi
source .venv/bin/activate
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
@@ -38,6 +40,7 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
- name: Save Cache
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
id: virtualenv-cache-save


@@ -18,7 +18,7 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
lookup-only: ${{ inputs.check-only }}
- name: Set up Python ${{ inputs.python-version }}
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -39,6 +39,8 @@ runs:
python -m pip install --upgrade setuptools
fi
make install-ci
make install-nltk-models
make install-all-docs
make install-ingest
- name: Save Ingest Cache
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
@@ -48,5 +50,5 @@ runs:
path: |
.venv
nltk_data
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}


@@ -12,14 +12,15 @@ permissions:
id-token: write
contents: read
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
jobs:
setup:
strategy:
matrix:
python-version: ["3.9","3.10","3.11", "3.12"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/base-cache
@@ -78,8 +79,6 @@ jobs:
strategy:
matrix:
python-version: ["3.9","3.10","3.11"]
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
runs-on: ubuntu-latest
needs: [setup, changelog]
steps:
@@ -185,8 +184,6 @@ jobs:
python-version: ["3.10"]
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint, test_unit_no_extras]
steps:
- uses: actions/checkout@v4
@@ -220,6 +217,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make install-${{ matrix.extra }}
make test-extra-${{ matrix.extra }} CI=true
setup_ingest:
@@ -227,8 +225,6 @@ jobs:
matrix:
python-version: [ "3.9","3.10" ]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup]
steps:
- uses: actions/checkout@v4
@@ -307,7 +303,6 @@ jobs:
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
CI: "true"
NLTK_DATA: ${{ github.workspace }}/nltk_data
PYTHON: python${{ matrix.python-version }}
run: |
source .venv/bin/activate
@@ -320,6 +315,8 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
make install-all-docs
make install-ingest
./test_unstructured_ingest/test-ingest-src.sh
@@ -329,8 +326,6 @@ jobs:
# NOTE(yuming): Unstructured API only uses Python 3.10
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4


@@ -1,7 +1,8 @@
## 0.16.1-dev5
## 0.16.1-dev6
### Enhancements
* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce nondeterministic behavior from machine-precision noise that affects which bounding boxes are combined.
### Features
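
The changelog entry above only names the technique; the idea, sketched with a
hypothetical helper (the real logic lives in `pdfminer_processing.py`;
`rounded_overlap_area` and `decimals=15`, roughly float64 precision, are
assumptions for illustration):

```python
import numpy as np

def rounded_overlap_area(
    boxes_a: np.ndarray, boxes_b: np.ndarray, decimals: int = 15
) -> np.ndarray:
    """boxes_*: (N, 4) / (M, 4) arrays of [x1, y1, x2, y2]; returns (N, M) areas."""
    # Snap coordinates to a fixed number of decimals before computing pairwise
    # overlaps, so boxes whose coordinates differ only by float noise produce
    # identical intersection areas (and the same combine decisions) run to run.
    a = np.round(boxes_a, decimals)[:, None, :]  # (N, 1, 4)
    b = np.round(boxes_b, decimals)[None, :, :]  # (1, M, 4)
    inter_w = np.clip(np.minimum(a[..., 2], b[..., 2]) - np.maximum(a[..., 0], b[..., 0]), 0, None)
    inter_h = np.clip(np.minimum(a[..., 3], b[..., 3]) - np.maximum(a[..., 1], b[..., 1]), 0, None)
    return inter_w * inter_h
```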


@@ -4,7 +4,7 @@
#
# pip-compile ./base.in
#
anyio==4.6.0
anyio==4.6.2.post1
# via httpx
backoff==2.2.1
# via -r ./base.in
@@ -20,7 +20,7 @@ cffi==1.17.1
# via cryptography
chardet==5.2.0
# via -r ./base.in
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# requests
# unstructured-client
@@ -28,7 +28,7 @@ click==8.1.7
# via
# nltk
# python-oxmsg
cryptography==43.0.1
cryptography==43.0.3
# via unstructured-client
dataclasses-json==0.6.7
# via
@@ -62,7 +62,7 @@ langdetect==1.0.9
# via -r ./base.in
lxml==5.3.0
# via -r ./base.in
marshmallow==3.22.0
marshmallow==3.23.0
# via
# dataclasses-json
# unstructured-client
@@ -84,7 +84,7 @@ packaging==24.1
# via
# marshmallow
# unstructured-client
psutil==6.0.0
psutil==6.1.0
# via -r ./base.in
pycparser==2.22
# via cffi


@@ -4,7 +4,7 @@
#
# pip-compile ./dev.in
#
build==1.2.2
build==1.2.2.post1
# via pip-tools
cfgv==3.4.0
# via pre-commit
@@ -13,7 +13,7 @@ click==8.1.7
# -c ./base.txt
# -c ./test.txt
# pip-tools
distlib==0.3.8
distlib==0.3.9
# via virtualenv
filelock==3.16.1
# via virtualenv
@@ -36,7 +36,7 @@ platformdirs==4.3.6
# via
# -c ./test.txt
# virtualenv
pre-commit==3.8.0
pre-commit==4.0.1
# via -r ./dev.in
pyproject-hooks==1.2.0
# via
@@ -51,7 +51,7 @@ tomli==2.0.2
# -c ./test.txt
# build
# pip-tools
virtualenv==20.26.6
virtualenv==20.27.0
# via pre-commit
wheel==0.44.0
# via pip-tools


@@ -4,5 +4,5 @@
#
# pip-compile ./extra-epub.in
#
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-epub.in


@@ -8,7 +8,7 @@ lxml==5.3.0
# via
# -c ./base.txt
# python-docx
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-odt.in
python-docx==1.1.2
# via -r ./extra-odt.in


@@ -4,7 +4,7 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==4.6.0
anyio==4.6.2.post1
# via
# -c ./base.txt
# httpx
@@ -16,7 +16,7 @@ certifi==2024.8.30
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -52,7 +52,7 @@ idna==3.10
# anyio
# httpx
# requests
imageio==2.35.1
imageio==2.36.0
# via
# imgaug
# scikit-image
@@ -104,7 +104,7 @@ paddlepaddle==3.0.0b1
# via -r ./extra-paddleocr.in
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==10.4.0
pillow==11.0.0
# via
# imageio
# imgaug
@@ -117,9 +117,9 @@ protobuf==4.25.5
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post5
pyclipper==1.3.0.post6
# via unstructured-paddleocr
pyparsing==3.1.4
pyparsing==3.2.0
# via matplotlib
python-dateutil==2.9.0.post0
# via


@@ -4,5 +4,5 @@
#
# pip-compile ./extra-pandoc.in
#
pypandoc==1.13
pypandoc==1.14
# via -r ./extra-pandoc.in


@@ -11,5 +11,5 @@ google-cloud-vision
effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.36
unstructured-inference==0.8.0
unstructured.pytesseract>=0.3.12


@@ -16,7 +16,7 @@ cffi==1.17.1
# via
# -c ./base.txt
# cryptography
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# pdfminer-six
@@ -25,7 +25,7 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.3.0
# via matplotlib
cryptography==43.0.1
cryptography==43.0.3
# via
# -c ./base.txt
# pdfminer-six
@@ -48,7 +48,7 @@ fsspec==2024.9.0
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.20.0
google-api-core[grpc]==2.21.0
# via google-cloud-vision
google-auth==2.35.0
# via
@@ -60,14 +60,14 @@ googleapis-common-protos==1.65.0
# via
# google-api-core
# grpcio-status
grpcio==1.66.2
grpcio==1.67.0
# via
# -c ././deps/constraints.txt
# google-api-core
# grpcio-status
grpcio-status==1.62.3
# via google-api-core
huggingface-hub==0.25.1
huggingface-hub==0.26.0
# via
# timm
# tokenizers
@@ -93,7 +93,7 @@ lxml==5.3.0
# via
# -c ./base.txt
# pikepdf
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
matplotlib==3.9.2
# via
@@ -117,6 +117,7 @@ numpy==1.26.4
# scipy
# torchvision
# transformers
# unstructured-inference
omegaconf==2.3.0
# via effdet
onnx==1.17.0
@@ -150,11 +151,11 @@ pdfminer-six==20231228
# pdfplumber
pdfplumber==0.11.4
# via layoutparser
pi-heif==0.18.0
pi-heif==0.20.0
# via -r ./extra-pdf-image.in
pikepdf==9.3.0
# via -r ./extra-pdf-image.in
pillow==10.4.0
pillow==11.0.0
# via
# layoutparser
# matplotlib
@@ -192,7 +193,7 @@ pycparser==2.22
# via
# -c ./base.txt
# cffi
pyparsing==3.1.4
pyparsing==3.2.0
# via matplotlib
pypdf==5.0.1
# via
@@ -242,11 +243,11 @@ six==1.16.0
# via
# -c ./base.txt
# python-dateutil
sympy==1.13.3
sympy==1.13.1
# via
# onnxruntime
# torch
timm==1.0.9
timm==1.0.11
# via
# effdet
# unstructured-inference
@@ -254,13 +255,13 @@ tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.4.1
torch==2.5.0
# via
# effdet
# timm
# torchvision
# unstructured-inference
torchvision==0.19.1
torchvision==0.20.0
# via
# effdet
# timm
@@ -281,7 +282,7 @@ typing-extensions==4.12.2
# torch
tzdata==2024.2
# via pandas
unstructured-inference==0.7.36
unstructured-inference==0.8.0
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in


@@ -6,7 +6,7 @@
#
lxml==5.3.0
# via python-pptx
pillow==10.4.0
pillow==11.0.0
# via python-pptx
python-pptx==1.0.2
# via -r ./extra-pptx.in


@@ -8,7 +8,7 @@ certifi==2024.8.30
# via
# -c ./base.txt
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -25,7 +25,7 @@ fsspec==2024.9.0
# via
# huggingface-hub
# torch
huggingface-hub==0.25.1
huggingface-hub==0.26.0
# via
# tokenizers
# transformers
@@ -43,7 +43,7 @@ langdetect==1.0.9
# via
# -c ./base.txt
# -r ./huggingface.in
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
@@ -82,13 +82,13 @@ six==1.16.0
# via
# -c ./base.txt
# langdetect
sympy==1.13.3
sympy==1.13.1
# via torch
tokenizers==0.19.1
# via
# -c ././deps/constraints.txt
# transformers
torch==2.4.1
torch==2.5.0
# via -r ./huggingface.in
tqdm==4.66.5
# via


@@ -6,7 +6,7 @@
#
annotated-types==0.7.0
# via pydantic
anyio==4.6.0
anyio==4.6.2.post1
# via
# -c ./base.txt
# httpx
@@ -16,7 +16,7 @@ attrs==24.2.0
# via jsonschema
autoflake==2.3.1
# via -r ./test.in
black==24.8.0
black==24.10.0
# via -r ./test.in
certifi==2024.8.30
# via
@@ -24,7 +24,7 @@ certifi==2024.8.30
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via
# -c ./base.txt
# requests
@@ -33,7 +33,7 @@ click==8.1.7
# -c ./base.txt
# black
# nltk
coverage[toml]==7.6.1
coverage[toml]==7.6.4
# via
# -r ./test.in
# pytest-cov
@@ -50,7 +50,7 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.1
# via -r ./test.in
grpcio==1.66.2
grpcio==1.67.0
# via
# -c ././deps/constraints.txt
# -r ./test.in
@@ -95,7 +95,7 @@ mccabe==0.7.0
# via flake8
multidict==6.1.0
# via yarl
mypy==1.11.2
mypy==1.12.1
# via -r ./test.in
mypy-extensions==1.0.0
# via
@@ -119,12 +119,14 @@ pandas==2.2.3
# via label-studio-sdk
pathspec==0.12.1
# via black
pillow==10.4.0
pillow==11.0.0
# via label-studio-sdk
platformdirs==4.3.6
# via black
pluggy==1.5.0
# via pytest
propcache==0.2.0
# via yarl
pycodestyle==2.12.1
# via
# flake8
@@ -226,7 +228,7 @@ urllib3==1.26.20
# -c ./base.txt
# requests
# vcrpy
vcrpy==6.0.1
vcrpy==6.0.2
# via -r ./test.in
wrapt==1.16.0
# via
@@ -234,7 +236,7 @@ wrapt==1.16.0
# vcrpy
xmljson==0.2.1
# via label-studio-sdk
yarl==1.13.1
yarl==1.15.5
# via vcrpy
# The following packages are considered to be unsafe in a requirements file:


@@ -1,4 +1,4 @@
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.elements import ElementType
@@ -17,7 +17,7 @@ def test_merge_text_regions(mock_embedded_text_regions):
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
)
merged_text_region = merge_text_regions(mock_embedded_text_regions)
merged_text_region = merge_text_regions(TextRegions.from_list(mock_embedded_text_regions))
assert merged_text_region == expected


@@ -179,6 +179,12 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
print(
[
(element.metadata.detection_origin, element.category, element.text)
for element in result
]
)
assert {element.metadata.detection_origin for element in result} == origin
if file_mode == "filename":


@@ -19,8 +19,8 @@ from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# NOTE(crag): point to freemium API for now
API_URL = "https://api.unstructured.io/general/v0/general"
# NOTE(yao): point to paid API for now
API_URL = "https://api.unstructuredapp.io/general/v0/general"
is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"}
skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main"


@@ -338,7 +338,20 @@
"type": "ListItem"
},
{
"element_id": "6277cd91869e10d6256f362b08d3e789",
"element_id": "f0f0586caeb3af4284c1b367a5269d27",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "452",
"type": "Header"
},
{
"element_id": "ac79570be092923eb29899f64281c3b3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -351,7 +364,7 @@
"type": "Table"
},
{
"element_id": "22b8448fe36b3ccd06d1d8e4ea2dc1ea",
"element_id": "13fd694e1ff862d163b840a246964e58",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -364,7 +377,7 @@
"type": "Title"
},
{
"element_id": "f2b57562924402b85f6eb07925ea1654",
"element_id": "5f1c4074c1b5d641b724b99be6f5ddfd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -377,7 +390,7 @@
"type": "NarrativeText"
},
{
"element_id": "d9f6efffd49ef59e671206bfb5f094de",
"element_id": "afed004de4c50d761640b6c18729a988",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -390,7 +403,7 @@
"type": "ListItem"
},
{
"element_id": "2a1e46bc589c5eca777b657e141e824b",
"element_id": "f93d89ccb971e2b60f44afbf710673c6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -403,7 +416,7 @@
"type": "NarrativeText"
},
{
"element_id": "2c42182c07ecdb96362b534a8fad4d59",
"element_id": "cb6e8acb9c24820b59f8973cc236ef35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -416,7 +429,7 @@
"type": "ListItem"
},
{
"element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f",
"element_id": "5964ede27be8850de7a13e0dd32c1b21",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -429,7 +442,7 @@
"type": "NarrativeText"
},
{
"element_id": "07cdb1623f501ea23a343039300178cc",
"element_id": "e1f7e635d8739a97d8d0000ba8004f61",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -442,7 +455,7 @@
"type": "ListItem"
},
{
"element_id": "4bf8165bcb21c5296b741ba0f9e38f93",
"element_id": "deb8964830ba1f9dd1eec7b08bd3ea19",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -455,7 +468,7 @@
"type": "Title"
},
{
"element_id": "85918ce2a03e9f236137a0fe72985af0",
"element_id": "be270e13c935334fa3b17b13066d639b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -468,7 +481,7 @@
"type": "NarrativeText"
},
{
"element_id": "93537983496efa695cfc65ad895d9412",
"element_id": "5c97405ec921495b23d2b400516cbd06",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -481,7 +494,7 @@
"type": "Image"
},
{
"element_id": "76b94e78b638b79374e266284c1a0d83",
"element_id": "7956ee39ac5e080a362967e2f6a5753e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",


@@ -598,20 +598,7 @@
"type": "NarrativeText"
},
{
"element_id": "448de3300a8c7e2cfdd2028dd0bb4171",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 2
},
"text": "and",
"type": "NarrativeText"
},
{
"element_id": "b13807f59ac7c6647ee0aee74f9b0dd3",
"element_id": "db6ff60cbdb77adc14a6b9491af8d161",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -624,7 +611,7 @@
"type": "ListItem"
},
{
"element_id": "db480e847a5703b19be6b79223e1ee03",
"element_id": "9f6ef223a141a5381951eff39b3af039",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -637,7 +624,7 @@
"type": "NarrativeText"
},
{
"element_id": "326c44638a881f86474b82cc244896f9",
"element_id": "5c67842128e14fc16344beaa2aa0111e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",


@@ -1276,9 +1276,75 @@
}
}
},
{
"type": "ListItem",
"element_id": "53b448c75f1556b1f60b4e3324bd0724",
"text": "1 import layoutparser as lp",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "ListItem",
"element_id": "a002e13c7ea2613b2eabb9ea3501856d",
"text": "3 model = lp . De t e c tro n2 Lay outM odel (",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "ListItem",
"element_id": "366c05fd7babc86bf01d690b9df755da",
"text": "5 layout = model . detect ( image )",
"metadata": {
"filetype": "application/pdf",
"languages": [
"eng"
],
"page_number": 5,
"data_source": {
"record_locator": {
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
},
"permissions_data": [
{
"mode": 33188
}
]
}
}
},
{
"type": "NarrativeText",
"element_id": "59171bb0b4a32c9ec1b0e1d327ddb88f",
"element_id": "f888c5e8f5b1339f2af75612ea13c719",
"text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp://<dataset-name>/<model-architecture-name>.",
"metadata": {
"filetype": "application/pdf",


@@ -1 +1 @@
__version__ = "0.16.1-dev5" # pragma: no cover
__version__ = "0.16.1-dev6" # pragma: no cover


@@ -3,7 +3,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Optional
from unstructured_inference.constants import Source
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import (
LayoutElement,
partition_groups_from_regions,
@@ -66,9 +66,9 @@ def build_layout_elements_from_ocr_regions(
for r in regions:
ocr_regions.remove(r)
grouped_regions.append(regions)
grouped_regions.append(TextRegions.from_list(regions))
else:
grouped_regions = partition_groups_from_regions(ocr_regions)
grouped_regions = partition_groups_from_regions(TextRegions.from_list(ocr_regions))
merged_regions = [merge_text_regions(group) for group in grouped_regions]
return [
@@ -79,12 +79,12 @@
]
def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
def merge_text_regions(regions: TextRegions) -> TextRegion:
"""
Merge a list of TextRegion objects into a single TextRegion.
Parameters:
- group (list[TextRegion]): A list of TextRegion objects to be merged.
- group (TextRegions): A group of TextRegion objects to be merged.
Returns:
- TextRegion: A single merged TextRegion object.
@@ -93,13 +93,12 @@ def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
if not regions:
raise ValueError("The text regions to be merged must be provided.")
min_x1 = min([tr.bbox.x1 for tr in regions])
min_y1 = min([tr.bbox.y1 for tr in regions])
max_x2 = max([tr.bbox.x2 for tr in regions])
max_y2 = max([tr.bbox.y2 for tr in regions])
min_x1 = regions.x1.min().astype(float)
min_y1 = regions.y1.min().astype(float)
max_x2 = regions.x2.max().astype(float)
max_y2 = regions.y2.max().astype(float)
merged_text = " ".join([tr.text for tr in regions if tr.text])
sources = [tr.source for tr in regions]
source = sources[0] if all(s == sources[0] for s in sources) else None
merged_text = " ".join([text for text in regions.texts if text])
source = regions.source
return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, merged_text, source)
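
To make the refactor concrete, a minimal usage sketch of the reworked helper
(coordinates are invented; the `unstructured.partition.pdf_image.inference_utils`
import path is an assumption inferred from the hunks above):

```python
from unstructured_inference.inference.elements import TextRegion, TextRegions

# Assumed module path for the helper changed in this PR.
from unstructured.partition.pdf_image.inference_utils import merge_text_regions

# Two word-level regions on the same line (hypothetical coordinates).
words = TextRegions.from_list(
    [
        TextRegion.from_coords(0.0, 0.0, 40.0, 10.0, "Deep"),
        TextRegion.from_coords(45.0, 0.0, 95.0, 10.0, "Learning"),
    ]
)

# The helper now takes the vectorized container and returns a single TextRegion
# whose bbox is the elementwise min/max over the group.
merged = merge_text_regions(words)
assert merged.text == "Deep Learning"
assert (merged.bbox.x1, merged.bbox.y2) == (0.0, 10.0)
```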