mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 13:16:47 +00:00
bump unstructured-inference
(#3711)
This PR bumps `unstructured-inference` to `0.8.0`, which introduces vectorized data structures for layout elements and text regions. This PR also cleans up a few places in CI that have repeated definitions of env variables or are missing installation of testing dependencies in the cache. A few document ingest results are changed: - two places for `biomed-api` (actually processed locally on the runner) are due to very small changes in the numerical results of the bounding box areas: one results in a duplicated page number/header and another results in the deduplication of a word of a sentence that starts on a new line. (Yes, the two cases go in opposite directions.) - the layout parser paper now outputs the code lines with the page number inside the code box as list items --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com>
This commit is contained in:
parent
e764bc503c
commit
a11ad22609
3
.github/actions/base-cache/action.yml
vendored
3
.github/actions/base-cache/action.yml
vendored
@ -30,7 +30,9 @@ runs:
|
||||
shell: bash
|
||||
run: |
|
||||
python${{ inputs.python-version }} -m pip install --upgrade virtualenv
|
||||
if [ ! -d ".venv" ]; then
|
||||
python${{ inputs.python-version }} -m venv .venv
|
||||
fi
|
||||
source .venv/bin/activate
|
||||
[ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
|
||||
if [ "${{ inputs.python-version == '3.12' }}" == "true" ]; then
|
||||
@ -38,6 +40,7 @@ runs:
|
||||
python -m pip install --upgrade setuptools
|
||||
fi
|
||||
make install-ci
|
||||
make install-nltk-models
|
||||
- name: Save Cache
|
||||
if: steps.virtualenv-cache-restore.outputs.cache-hit != 'true'
|
||||
id: virtualenv-cache-save
|
||||
|
6
.github/actions/base-ingest-cache/action.yml
vendored
6
.github/actions/base-ingest-cache/action.yml
vendored
@ -18,7 +18,7 @@ runs:
|
||||
path: |
|
||||
.venv
|
||||
nltk_data
|
||||
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
|
||||
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
|
||||
lookup-only: ${{ inputs.check-only }}
|
||||
- name: Set up Python ${{ inputs.python-version }}
|
||||
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
|
||||
@ -39,6 +39,8 @@ runs:
|
||||
python -m pip install --upgrade setuptools
|
||||
fi
|
||||
make install-ci
|
||||
make install-nltk-models
|
||||
make install-all-docs
|
||||
make install-ingest
|
||||
- name: Save Ingest Cache
|
||||
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
|
||||
@ -48,5 +50,5 @@ runs:
|
||||
path: |
|
||||
.venv
|
||||
nltk_data
|
||||
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt') }}-${{ hashFiles('requirements/*.txt') }}
|
||||
key: unstructured-ingest-${{ runner.os }}-${{ inputs.python-version }}-${{ hashFiles('requirements/ingest/*.txt', 'requirements/*.txt') }}
|
||||
|
||||
|
17
.github/workflows/ci.yml
vendored
17
.github/workflows/ci.yml
vendored
@ -12,14 +12,15 @@ permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9","3.10","3.11", "3.12"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: ./.github/actions/base-cache
|
||||
@ -78,8 +79,6 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.9","3.10","3.11"]
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup, changelog]
|
||||
steps:
|
||||
@ -185,8 +184,6 @@ jobs:
|
||||
python-version: ["3.10"]
|
||||
extra: ["csv", "docx", "odt", "markdown", "pypandoc", "pdf-image", "pptx", "xlsx"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
needs: [setup, lint, test_unit_no_extras]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -220,6 +217,7 @@ jobs:
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make install-${{ matrix.extra }}
|
||||
make test-extra-${{ matrix.extra }} CI=true
|
||||
|
||||
setup_ingest:
|
||||
@ -227,8 +225,6 @@ jobs:
|
||||
matrix:
|
||||
python-version: [ "3.9","3.10" ]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
needs: [setup]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@ -307,7 +303,6 @@ jobs:
|
||||
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
|
||||
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
|
||||
CI: "true"
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
PYTHON: python${{ matrix.python-version }}
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
@ -320,6 +315,8 @@ jobs:
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
sudo apt-get install diffstat
|
||||
tesseract --version
|
||||
make install-all-docs
|
||||
make install-ingest
|
||||
./test_unstructured_ingest/test-ingest-src.sh
|
||||
|
||||
|
||||
@ -329,8 +326,6 @@ jobs:
|
||||
# NOTE(yuming): Unstructured API only use Python 3.10
|
||||
python-version: ["3.10"]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
NLTK_DATA: ${{ github.workspace }}/nltk_data
|
||||
needs: [setup, lint]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
@ -1,7 +1,8 @@
|
||||
## 0.16.1-dev5
|
||||
## 0.16.1-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
|
||||
* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to the nearest machine precision. This can help reduce nondeterministic behavior from machine precision that affects which bounding boxes to combine.
|
||||
|
||||
### Features
|
||||
|
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./base.in
|
||||
#
|
||||
anyio==4.6.0
|
||||
anyio==4.6.2.post1
|
||||
# via httpx
|
||||
backoff==2.2.1
|
||||
# via -r ./base.in
|
||||
@ -20,7 +20,7 @@ cffi==1.17.1
|
||||
# via cryptography
|
||||
chardet==5.2.0
|
||||
# via -r ./base.in
|
||||
charset-normalizer==3.3.2
|
||||
charset-normalizer==3.4.0
|
||||
# via
|
||||
# requests
|
||||
# unstructured-client
|
||||
@ -28,7 +28,7 @@ click==8.1.7
|
||||
# via
|
||||
# nltk
|
||||
# python-oxmsg
|
||||
cryptography==43.0.1
|
||||
cryptography==43.0.3
|
||||
# via unstructured-client
|
||||
dataclasses-json==0.6.7
|
||||
# via
|
||||
@ -62,7 +62,7 @@ langdetect==1.0.9
|
||||
# via -r ./base.in
|
||||
lxml==5.3.0
|
||||
# via -r ./base.in
|
||||
marshmallow==3.22.0
|
||||
marshmallow==3.23.0
|
||||
# via
|
||||
# dataclasses-json
|
||||
# unstructured-client
|
||||
@ -84,7 +84,7 @@ packaging==24.1
|
||||
# via
|
||||
# marshmallow
|
||||
# unstructured-client
|
||||
psutil==6.0.0
|
||||
psutil==6.1.0
|
||||
# via -r ./base.in
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
|
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./dev.in
|
||||
#
|
||||
build==1.2.2
|
||||
build==1.2.2.post1
|
||||
# via pip-tools
|
||||
cfgv==3.4.0
|
||||
# via pre-commit
|
||||
@ -13,7 +13,7 @@ click==8.1.7
|
||||
# -c ./base.txt
|
||||
# -c ./test.txt
|
||||
# pip-tools
|
||||
distlib==0.3.8
|
||||
distlib==0.3.9
|
||||
# via virtualenv
|
||||
filelock==3.16.1
|
||||
# via virtualenv
|
||||
@ -36,7 +36,7 @@ platformdirs==4.3.6
|
||||
# via
|
||||
# -c ./test.txt
|
||||
# virtualenv
|
||||
pre-commit==3.8.0
|
||||
pre-commit==4.0.1
|
||||
# via -r ./dev.in
|
||||
pyproject-hooks==1.2.0
|
||||
# via
|
||||
@ -51,7 +51,7 @@ tomli==2.0.2
|
||||
# -c ./test.txt
|
||||
# build
|
||||
# pip-tools
|
||||
virtualenv==20.26.6
|
||||
virtualenv==20.27.0
|
||||
# via pre-commit
|
||||
wheel==0.44.0
|
||||
# via pip-tools
|
||||
|
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./extra-epub.in
|
||||
#
|
||||
pypandoc==1.13
|
||||
pypandoc==1.14
|
||||
# via -r ./extra-epub.in
|
||||
|
@ -8,7 +8,7 @@ lxml==5.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-docx
|
||||
pypandoc==1.13
|
||||
pypandoc==1.14
|
||||
# via -r ./extra-odt.in
|
||||
python-docx==1.1.2
|
||||
# via -r ./extra-odt.in
|
||||
|
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./extra-paddleocr.in
|
||||
#
|
||||
anyio==4.6.0
|
||||
anyio==4.6.2.post1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
@ -16,7 +16,7 @@ certifi==2024.8.30
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
charset-normalizer==3.4.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -52,7 +52,7 @@ idna==3.10
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
imageio==2.35.1
|
||||
imageio==2.36.0
|
||||
# via
|
||||
# imgaug
|
||||
# scikit-image
|
||||
@ -104,7 +104,7 @@ paddlepaddle==3.0.0b1
|
||||
# via -r ./extra-paddleocr.in
|
||||
pdf2image==1.17.0
|
||||
# via unstructured-paddleocr
|
||||
pillow==10.4.0
|
||||
pillow==11.0.0
|
||||
# via
|
||||
# imageio
|
||||
# imgaug
|
||||
@ -117,9 +117,9 @@ protobuf==4.25.5
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# paddlepaddle
|
||||
pyclipper==1.3.0.post5
|
||||
pyclipper==1.3.0.post6
|
||||
# via unstructured-paddleocr
|
||||
pyparsing==3.1.4
|
||||
pyparsing==3.2.0
|
||||
# via matplotlib
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
|
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./extra-pandoc.in
|
||||
#
|
||||
pypandoc==1.13
|
||||
pypandoc==1.14
|
||||
# via -r ./extra-pandoc.in
|
||||
|
@ -11,5 +11,5 @@ google-cloud-vision
|
||||
effdet
|
||||
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
||||
# when unstructured library is.
|
||||
unstructured-inference==0.7.36
|
||||
unstructured-inference==0.8.0
|
||||
unstructured.pytesseract>=0.3.12
|
||||
|
@ -16,7 +16,7 @@ cffi==1.17.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# cryptography
|
||||
charset-normalizer==3.3.2
|
||||
charset-normalizer==3.4.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
@ -25,7 +25,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.3.0
|
||||
# via matplotlib
|
||||
cryptography==43.0.1
|
||||
cryptography==43.0.3
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pdfminer-six
|
||||
@ -48,7 +48,7 @@ fsspec==2024.9.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
google-api-core[grpc]==2.20.0
|
||||
google-api-core[grpc]==2.21.0
|
||||
# via google-cloud-vision
|
||||
google-auth==2.35.0
|
||||
# via
|
||||
@ -60,14 +60,14 @@ googleapis-common-protos==1.65.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.66.2
|
||||
grpcio==1.67.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio-status==1.62.3
|
||||
# via google-api-core
|
||||
huggingface-hub==0.25.1
|
||||
huggingface-hub==0.26.0
|
||||
# via
|
||||
# timm
|
||||
# tokenizers
|
||||
@ -93,7 +93,7 @@ lxml==5.3.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# pikepdf
|
||||
markupsafe==2.1.5
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
matplotlib==3.9.2
|
||||
# via
|
||||
@ -117,6 +117,7 @@ numpy==1.26.4
|
||||
# scipy
|
||||
# torchvision
|
||||
# transformers
|
||||
# unstructured-inference
|
||||
omegaconf==2.3.0
|
||||
# via effdet
|
||||
onnx==1.17.0
|
||||
@ -150,11 +151,11 @@ pdfminer-six==20231228
|
||||
# pdfplumber
|
||||
pdfplumber==0.11.4
|
||||
# via layoutparser
|
||||
pi-heif==0.18.0
|
||||
pi-heif==0.20.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pikepdf==9.3.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
pillow==10.4.0
|
||||
pillow==11.0.0
|
||||
# via
|
||||
# layoutparser
|
||||
# matplotlib
|
||||
@ -192,7 +193,7 @@ pycparser==2.22
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# cffi
|
||||
pyparsing==3.1.4
|
||||
pyparsing==3.2.0
|
||||
# via matplotlib
|
||||
pypdf==5.0.1
|
||||
# via
|
||||
@ -242,11 +243,11 @@ six==1.16.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
sympy==1.13.3
|
||||
sympy==1.13.1
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==1.0.9
|
||||
timm==1.0.11
|
||||
# via
|
||||
# effdet
|
||||
# unstructured-inference
|
||||
@ -254,13 +255,13 @@ tokenizers==0.19.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.4.1
|
||||
torch==2.5.0
|
||||
# via
|
||||
# effdet
|
||||
# timm
|
||||
# torchvision
|
||||
# unstructured-inference
|
||||
torchvision==0.19.1
|
||||
torchvision==0.20.0
|
||||
# via
|
||||
# effdet
|
||||
# timm
|
||||
@ -281,7 +282,7 @@ typing-extensions==4.12.2
|
||||
# torch
|
||||
tzdata==2024.2
|
||||
# via pandas
|
||||
unstructured-inference==0.7.36
|
||||
unstructured-inference==0.8.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.13
|
||||
# via -r ./extra-pdf-image.in
|
||||
|
@ -6,7 +6,7 @@
|
||||
#
|
||||
lxml==5.3.0
|
||||
# via python-pptx
|
||||
pillow==10.4.0
|
||||
pillow==11.0.0
|
||||
# via python-pptx
|
||||
python-pptx==1.0.2
|
||||
# via -r ./extra-pptx.in
|
||||
|
@ -8,7 +8,7 @@ certifi==2024.8.30
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
charset-normalizer==3.4.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -25,7 +25,7 @@ fsspec==2024.9.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
huggingface-hub==0.25.1
|
||||
huggingface-hub==0.26.0
|
||||
# via
|
||||
# tokenizers
|
||||
# transformers
|
||||
@ -43,7 +43,7 @@ langdetect==1.0.9
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# -r ./huggingface.in
|
||||
markupsafe==2.1.5
|
||||
markupsafe==3.0.2
|
||||
# via jinja2
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
@ -82,13 +82,13 @@ six==1.16.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# langdetect
|
||||
sympy==1.13.3
|
||||
sympy==1.13.1
|
||||
# via torch
|
||||
tokenizers==0.19.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# transformers
|
||||
torch==2.4.1
|
||||
torch==2.5.0
|
||||
# via -r ./huggingface.in
|
||||
tqdm==4.66.5
|
||||
# via
|
||||
|
@ -6,7 +6,7 @@
|
||||
#
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
anyio==4.6.0
|
||||
anyio==4.6.2.post1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
@ -16,7 +16,7 @@ attrs==24.2.0
|
||||
# via jsonschema
|
||||
autoflake==2.3.1
|
||||
# via -r ./test.in
|
||||
black==24.8.0
|
||||
black==24.10.0
|
||||
# via -r ./test.in
|
||||
certifi==2024.8.30
|
||||
# via
|
||||
@ -24,7 +24,7 @@ certifi==2024.8.30
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
charset-normalizer==3.4.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
@ -33,7 +33,7 @@ click==8.1.7
|
||||
# -c ./base.txt
|
||||
# black
|
||||
# nltk
|
||||
coverage[toml]==7.6.1
|
||||
coverage[toml]==7.6.4
|
||||
# via
|
||||
# -r ./test.in
|
||||
# pytest-cov
|
||||
@ -50,7 +50,7 @@ flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.1
|
||||
# via -r ./test.in
|
||||
grpcio==1.66.2
|
||||
grpcio==1.67.0
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./test.in
|
||||
@ -95,7 +95,7 @@ mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.1.0
|
||||
# via yarl
|
||||
mypy==1.11.2
|
||||
mypy==1.12.1
|
||||
# via -r ./test.in
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
@ -119,12 +119,14 @@ pandas==2.2.3
|
||||
# via label-studio-sdk
|
||||
pathspec==0.12.1
|
||||
# via black
|
||||
pillow==10.4.0
|
||||
pillow==11.0.0
|
||||
# via label-studio-sdk
|
||||
platformdirs==4.3.6
|
||||
# via black
|
||||
pluggy==1.5.0
|
||||
# via pytest
|
||||
propcache==0.2.0
|
||||
# via yarl
|
||||
pycodestyle==2.12.1
|
||||
# via
|
||||
# flake8
|
||||
@ -226,7 +228,7 @@ urllib3==1.26.20
|
||||
# -c ./base.txt
|
||||
# requests
|
||||
# vcrpy
|
||||
vcrpy==6.0.1
|
||||
vcrpy==6.0.2
|
||||
# via -r ./test.in
|
||||
wrapt==1.16.0
|
||||
# via
|
||||
@ -234,7 +236,7 @@ wrapt==1.16.0
|
||||
# vcrpy
|
||||
xmljson==0.2.1
|
||||
# via label-studio-sdk
|
||||
yarl==1.13.1
|
||||
yarl==1.15.5
|
||||
# via vcrpy
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
|
@ -1,4 +1,4 @@
|
||||
from unstructured_inference.inference.elements import TextRegion
|
||||
from unstructured_inference.inference.elements import TextRegion, TextRegions
|
||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||
|
||||
from unstructured.documents.elements import ElementType
|
||||
@ -17,7 +17,7 @@ def test_merge_text_regions(mock_embedded_text_regions):
|
||||
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
|
||||
)
|
||||
|
||||
merged_text_region = merge_text_regions(mock_embedded_text_regions)
|
||||
merged_text_region = merge_text_regions(TextRegions.from_list(mock_embedded_text_regions))
|
||||
assert merged_text_region == expected
|
||||
|
||||
|
||||
|
@ -179,6 +179,12 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
||||
# check that the pdf has multiple different page numbers
|
||||
assert {element.metadata.page_number for element in result} == expected_page_numbers
|
||||
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
||||
print(
|
||||
[
|
||||
(element.metadata.detection_origin, element.category, element.text)
|
||||
for element in result
|
||||
]
|
||||
)
|
||||
assert {element.metadata.detection_origin for element in result} == origin
|
||||
|
||||
if file_mode == "filename":
|
||||
|
@ -19,8 +19,8 @@ from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
# NOTE(crag): point to freemium API for now
|
||||
API_URL = "https://api.unstructured.io/general/v0/general"
|
||||
# NOTE(yao): point to paid API for now
|
||||
API_URL = "https://api.unstructuredapp.io/general/v0/general"
|
||||
|
||||
is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"}
|
||||
skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main"
|
||||
|
@ -338,7 +338,20 @@
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "6277cd91869e10d6256f362b08d3e789",
|
||||
"element_id": "f0f0586caeb3af4284c1b367a5269d27",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "452",
|
||||
"type": "Header"
|
||||
},
|
||||
{
|
||||
"element_id": "ac79570be092923eb29899f64281c3b3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -351,7 +364,7 @@
|
||||
"type": "Table"
|
||||
},
|
||||
{
|
||||
"element_id": "22b8448fe36b3ccd06d1d8e4ea2dc1ea",
|
||||
"element_id": "13fd694e1ff862d163b840a246964e58",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -364,7 +377,7 @@
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "f2b57562924402b85f6eb07925ea1654",
|
||||
"element_id": "5f1c4074c1b5d641b724b99be6f5ddfd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -377,7 +390,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "d9f6efffd49ef59e671206bfb5f094de",
|
||||
"element_id": "afed004de4c50d761640b6c18729a988",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -390,7 +403,7 @@
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "2a1e46bc589c5eca777b657e141e824b",
|
||||
"element_id": "f93d89ccb971e2b60f44afbf710673c6",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -403,7 +416,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "2c42182c07ecdb96362b534a8fad4d59",
|
||||
"element_id": "cb6e8acb9c24820b59f8973cc236ef35",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -416,7 +429,7 @@
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "c6fd85f9219a2c75bb1f8c1889bb2b5f",
|
||||
"element_id": "5964ede27be8850de7a13e0dd32c1b21",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -429,7 +442,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "07cdb1623f501ea23a343039300178cc",
|
||||
"element_id": "e1f7e635d8739a97d8d0000ba8004f61",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -442,7 +455,7 @@
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "4bf8165bcb21c5296b741ba0f9e38f93",
|
||||
"element_id": "deb8964830ba1f9dd1eec7b08bd3ea19",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -455,7 +468,7 @@
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "85918ce2a03e9f236137a0fe72985af0",
|
||||
"element_id": "be270e13c935334fa3b17b13066d639b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -468,7 +481,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "93537983496efa695cfc65ad895d9412",
|
||||
"element_id": "5c97405ec921495b23d2b400516cbd06",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -481,7 +494,7 @@
|
||||
"type": "Image"
|
||||
},
|
||||
{
|
||||
"element_id": "76b94e78b638b79374e266284c1a0d83",
|
||||
"element_id": "7956ee39ac5e080a362967e2f6a5753e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
|
@ -598,20 +598,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "448de3300a8c7e2cfdd2028dd0bb4171",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "and",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "b13807f59ac7c6647ee0aee74f9b0dd3",
|
||||
"element_id": "db6ff60cbdb77adc14a6b9491af8d161",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -624,7 +611,7 @@
|
||||
"type": "ListItem"
|
||||
},
|
||||
{
|
||||
"element_id": "db480e847a5703b19be6b79223e1ee03",
|
||||
"element_id": "9f6ef223a141a5381951eff39b3af039",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -637,7 +624,7 @@
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "326c44638a881f86474b82cc244896f9",
|
||||
"element_id": "5c67842128e14fc16344beaa2aa0111e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
|
@ -1276,9 +1276,75 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "53b448c75f1556b1f60b4e3324bd0724",
|
||||
"text": "1 import layoutparser as lp",
|
||||
"metadata": {
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 5,
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
|
||||
},
|
||||
"permissions_data": [
|
||||
{
|
||||
"mode": 33188
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "a002e13c7ea2613b2eabb9ea3501856d",
|
||||
"text": "3 model = lp . De t e c tro n2 Lay outM odel (",
|
||||
"metadata": {
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 5,
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
|
||||
},
|
||||
"permissions_data": [
|
||||
{
|
||||
"mode": 33188
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "366c05fd7babc86bf01d690b9df755da",
|
||||
"text": "5 layout = model . detect ( image )",
|
||||
"metadata": {
|
||||
"filetype": "application/pdf",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"page_number": 5,
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/example-docs/layout-parser-paper.pdf"
|
||||
},
|
||||
"permissions_data": [
|
||||
{
|
||||
"mode": 33188
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "59171bb0b4a32c9ec1b0e1d327ddb88f",
|
||||
"element_id": "f888c5e8f5b1339f2af75612ea13c719",
|
||||
"text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp://<dataset-name>/<model-architecture-name>.",
|
||||
"metadata": {
|
||||
"filetype": "application/pdf",
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.16.1-dev5" # pragma: no cover
|
||||
__version__ = "0.16.1-dev6" # pragma: no cover
|
||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
from unstructured_inference.constants import Source
|
||||
from unstructured_inference.inference.elements import TextRegion
|
||||
from unstructured_inference.inference.elements import TextRegion, TextRegions
|
||||
from unstructured_inference.inference.layoutelement import (
|
||||
LayoutElement,
|
||||
partition_groups_from_regions,
|
||||
@ -66,9 +66,9 @@ def build_layout_elements_from_ocr_regions(
|
||||
for r in regions:
|
||||
ocr_regions.remove(r)
|
||||
|
||||
grouped_regions.append(regions)
|
||||
grouped_regions.append(TextRegions.from_list(regions))
|
||||
else:
|
||||
grouped_regions = partition_groups_from_regions(ocr_regions)
|
||||
grouped_regions = partition_groups_from_regions(TextRegions.from_list(ocr_regions))
|
||||
|
||||
merged_regions = [merge_text_regions(group) for group in grouped_regions]
|
||||
return [
|
||||
@ -79,12 +79,12 @@ def build_layout_elements_from_ocr_regions(
|
||||
]
|
||||
|
||||
|
||||
def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
|
||||
def merge_text_regions(regions: TextRegions) -> TextRegion:
|
||||
"""
|
||||
Merge a list of TextRegion objects into a single TextRegion.
|
||||
|
||||
Parameters:
|
||||
- group (list[TextRegion]): A list of TextRegion objects to be merged.
|
||||
- group (TextRegions): A group of TextRegion objects to be merged.
|
||||
|
||||
Returns:
|
||||
- TextRegion: A single merged TextRegion object.
|
||||
@ -93,13 +93,12 @@ def merge_text_regions(regions: list[TextRegion]) -> TextRegion:
|
||||
if not regions:
|
||||
raise ValueError("The text regions to be merged must be provided.")
|
||||
|
||||
min_x1 = min([tr.bbox.x1 for tr in regions])
|
||||
min_y1 = min([tr.bbox.y1 for tr in regions])
|
||||
max_x2 = max([tr.bbox.x2 for tr in regions])
|
||||
max_y2 = max([tr.bbox.y2 for tr in regions])
|
||||
min_x1 = regions.x1.min().astype(float)
|
||||
min_y1 = regions.y1.min().astype(float)
|
||||
max_x2 = regions.x2.max().astype(float)
|
||||
max_y2 = regions.y2.max().astype(float)
|
||||
|
||||
merged_text = " ".join([tr.text for tr in regions if tr.text])
|
||||
sources = [tr.source for tr in regions]
|
||||
source = sources[0] if all(s == sources[0] for s in sources) else None
|
||||
merged_text = " ".join([text for text in regions.texts if text])
|
||||
source = regions.source
|
||||
|
||||
return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, merged_text, source)
|
||||
|
Loading…
x
Reference in New Issue
Block a user