build(deps): Remove unstructured.paddlepaddle fork (#3506)

This PR removes the `unstructured.paddlepaddle` fork. Previously, we
used the `unstructured.paddlepaddle` fork to support
`unstructured.paddleocr` on the `arm64` architecture. However,
`unstructured.paddleocr` with `unstructured.paddlepaddle` currently fails
to work on `arm64`, whereas `unstructured.paddleocr` with the latest
version of the original `paddlepaddle` works on both the `amd64` and
`arm64` architectures.

### Testing
```
os.environ["OCR_AGENT"] = "unstructured.partition.utils.ocr_models.paddle_ocr.OCRAgentPaddle"

elements = partition_pdf(
    filename=<file_path>,
    strategy="hi_res",
    infer_table_structure=True,
)
```
This commit is contained in:
Christine Straub 2024-08-09 15:04:22 -07:00 committed by GitHub
parent a2ae2ed646
commit d99b39923d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 53 additions and 16 deletions

View File

@ -72,7 +72,6 @@ jobs:
- name: Install all doc and test dependencies
run: |
make install-ci
make install-paddleocr
make install-all-ingest
make check-licenses

View File

@ -17,7 +17,6 @@ RUN chown -R notebook-user:notebook-user /app && \
USER notebook-user
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
pip3.11 install unstructured.paddlepaddle && \
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

View File

@ -277,10 +277,6 @@ install-local-inference: install install-all-docs
install-pandoc:
ARCH=${ARCH} ./scripts/install-pandoc.sh
.PHONY: install-paddleocr
install-paddleocr:
ARCH=${ARCH} ./scripts/install-paddleocr.sh
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
pip-compile:

View File

@ -1,4 +1,5 @@
-c ./deps/constraints.txt
-c base.txt
paddlepaddle==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
unstructured.paddleocr==2.8.0.1

View File

@ -4,6 +4,13 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==3.7.1
# via
# -c ././deps/constraints.txt
# -c ./base.txt
# httpx
astor==0.8.1
# via paddlepaddle
attrdict==2.0.1
# via unstructured-paddleocr
cachetools==5.4.0
@ -12,6 +19,8 @@ certifi==2024.7.4
# via
# -c ././deps/constraints.txt
# -c ./base.txt
# httpcore
# httpx
# requests
charset-normalizer==3.3.2
# via
@ -27,13 +36,33 @@ cycler==0.12.1
# via matplotlib
cython==3.0.11
# via unstructured-paddleocr
decorator==5.1.1
# via paddlepaddle
et-xmlfile==1.1.0
# via openpyxl
exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
fonttools==4.53.1
# via matplotlib
h11==0.14.0
# via
# -c ./base.txt
# httpcore
httpcore==1.0.5
# via
# -c ./base.txt
# httpx
httpx==0.27.0
# via
# -c ./base.txt
# paddlepaddle
idna==3.7
# via
# -c ./base.txt
# anyio
# httpx
# requests
imageio==2.34.2
# via
@ -59,7 +88,9 @@ matplotlib==3.9.1.post1
more-itertools==10.4.0
# via cssutils
networkx==3.2.1
# via scikit-image
# via
# paddlepaddle
# scikit-image
numpy==1.26.4
# via
# -c ./base.txt
@ -69,6 +100,8 @@ numpy==1.26.4
# matplotlib
# opencv-contrib-python
# opencv-python
# opt-einsum
# paddlepaddle
# scikit-image
# scipy
# shapely
@ -85,6 +118,8 @@ opencv-python==4.8.0.76
# unstructured-paddleocr
openpyxl==3.1.5
# via unstructured-paddleocr
opt-einsum==3.3.0
# via paddlepaddle
packaging==23.2
# via
# -c ././deps/constraints.txt
@ -92,6 +127,8 @@ packaging==23.2
# lazy-loader
# matplotlib
# scikit-image
paddlepaddle==3.0.0b1
# via -r ./extra-paddleocr.in
pdf2image==1.17.0
# via unstructured-paddleocr
pillow==10.4.0
@ -99,11 +136,16 @@ pillow==10.4.0
# imageio
# imgaug
# matplotlib
# paddlepaddle
# pdf2image
# scikit-image
# unstructured-paddleocr
premailer==3.10.0
# via unstructured-paddleocr
protobuf==4.23.4
# via
# -c ././deps/constraints.txt
# paddlepaddle
pyclipper==1.3.0.post5
# via unstructured-paddleocr
pyparsing==3.0.9
@ -144,12 +186,21 @@ six==1.16.0
# attrdict
# imgaug
# python-dateutil
sniffio==1.3.1
# via
# -c ./base.txt
# anyio
# httpx
tifffile==2024.7.24
# via scikit-image
tqdm==4.66.5
# via
# -c ./base.txt
# unstructured-paddleocr
typing-extensions==4.12.2
# via
# -c ./base.txt
# paddlepaddle
unstructured-paddleocr==2.8.0.1
# via -r ./extra-paddleocr.in
urllib3==1.26.19

View File

@ -1,9 +0,0 @@
#!/usr/bin/env bash
# aarch64 requires a custom build of paddlepaddle
if [ "${ARCH}" = "aarch64" ]; then
python3 -m pip install unstructured.paddlepaddle
else
python3 -m pip install paddlepaddle
fi
python3 -m pip install unstructured.paddleocr