mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 15:42:19 +00:00
build(deps): Remove unstructured.paddlepaddle fork (#3506)
This PR removes the `unstructured.paddlepaddle` fork. Previously, we
used the `unstructured.paddlepaddle` fork to support
`unstructured.paddleocr` on the `arm64` architecture. Currently, however,
`unstructured.paddleocr` with `unstructured.paddlepaddle` fails to work
on `arm64`, whereas `unstructured.paddleocr` with the latest
version of the original `paddlepaddle` works on both `amd64` and `arm64`
architectures.
### Testing
```
os.environ["OCR_AGENT"] = "unstructured.partition.utils.ocr_models.paddle_ocr.OCRAgentPaddle"
elements = partition_pdf(
filename=<file_path>,
strategy="hi_res",
infer_table_structure=True,
)
```
This commit is contained in:
parent
a2ae2ed646
commit
d99b39923d
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -72,7 +72,6 @@ jobs:
|
||||
- name: Install all doc and test dependencies
|
||||
run: |
|
||||
make install-ci
|
||||
make install-paddleocr
|
||||
make install-all-ingest
|
||||
make check-licenses
|
||||
|
||||
|
||||
@ -17,7 +17,6 @@ RUN chown -R notebook-user:notebook-user /app && \
|
||||
USER notebook-user
|
||||
|
||||
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
|
||||
pip3.11 install unstructured.paddlepaddle && \
|
||||
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
|
||||
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
|
||||
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
||||
|
||||
4
Makefile
4
Makefile
@ -277,10 +277,6 @@ install-local-inference: install install-all-docs
|
||||
install-pandoc:
|
||||
ARCH=${ARCH} ./scripts/install-pandoc.sh
|
||||
|
||||
.PHONY: install-paddleocr
|
||||
install-paddleocr:
|
||||
ARCH=${ARCH} ./scripts/install-paddleocr.sh
|
||||
|
||||
## pip-compile: compiles all base/dev/test requirements
|
||||
.PHONY: pip-compile
|
||||
pip-compile:
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
-c ./deps/constraints.txt
|
||||
-c base.txt
|
||||
|
||||
paddlepaddle==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
unstructured.paddleocr==2.8.0.1
|
||||
|
||||
@ -4,6 +4,13 @@
|
||||
#
|
||||
# pip-compile ./extra-paddleocr.in
|
||||
#
|
||||
anyio==3.7.1
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
astor==0.8.1
|
||||
# via paddlepaddle
|
||||
attrdict==2.0.1
|
||||
# via unstructured-paddleocr
|
||||
cachetools==5.4.0
|
||||
@ -12,6 +19,8 @@ certifi==2024.7.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
# httpx
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
@ -27,13 +36,33 @@ cycler==0.12.1
|
||||
# via matplotlib
|
||||
cython==3.0.11
|
||||
# via unstructured-paddleocr
|
||||
decorator==5.1.1
|
||||
# via paddlepaddle
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
exceptiongroup==1.2.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
fonttools==4.53.1
|
||||
# via matplotlib
|
||||
h11==0.14.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpcore
|
||||
httpcore==1.0.5
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# httpx
|
||||
httpx==0.27.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# paddlepaddle
|
||||
idna==3.7
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
imageio==2.34.2
|
||||
# via
|
||||
@ -59,7 +88,9 @@ matplotlib==3.9.1.post1
|
||||
more-itertools==10.4.0
|
||||
# via cssutils
|
||||
networkx==3.2.1
|
||||
# via scikit-image
|
||||
# via
|
||||
# paddlepaddle
|
||||
# scikit-image
|
||||
numpy==1.26.4
|
||||
# via
|
||||
# -c ./base.txt
|
||||
@ -69,6 +100,8 @@ numpy==1.26.4
|
||||
# matplotlib
|
||||
# opencv-contrib-python
|
||||
# opencv-python
|
||||
# opt-einsum
|
||||
# paddlepaddle
|
||||
# scikit-image
|
||||
# scipy
|
||||
# shapely
|
||||
@ -85,6 +118,8 @@ opencv-python==4.8.0.76
|
||||
# unstructured-paddleocr
|
||||
openpyxl==3.1.5
|
||||
# via unstructured-paddleocr
|
||||
opt-einsum==3.3.0
|
||||
# via paddlepaddle
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
@ -92,6 +127,8 @@ packaging==23.2
|
||||
# lazy-loader
|
||||
# matplotlib
|
||||
# scikit-image
|
||||
paddlepaddle==3.0.0b1
|
||||
# via -r ./extra-paddleocr.in
|
||||
pdf2image==1.17.0
|
||||
# via unstructured-paddleocr
|
||||
pillow==10.4.0
|
||||
@ -99,11 +136,16 @@ pillow==10.4.0
|
||||
# imageio
|
||||
# imgaug
|
||||
# matplotlib
|
||||
# paddlepaddle
|
||||
# pdf2image
|
||||
# scikit-image
|
||||
# unstructured-paddleocr
|
||||
premailer==3.10.0
|
||||
# via unstructured-paddleocr
|
||||
protobuf==4.23.4
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# paddlepaddle
|
||||
pyclipper==1.3.0.post5
|
||||
# via unstructured-paddleocr
|
||||
pyparsing==3.0.9
|
||||
@ -144,12 +186,21 @@ six==1.16.0
|
||||
# attrdict
|
||||
# imgaug
|
||||
# python-dateutil
|
||||
sniffio==1.3.1
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# anyio
|
||||
# httpx
|
||||
tifffile==2024.7.24
|
||||
# via scikit-image
|
||||
tqdm==4.66.5
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# unstructured-paddleocr
|
||||
typing-extensions==4.12.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# paddlepaddle
|
||||
unstructured-paddleocr==2.8.0.1
|
||||
# via -r ./extra-paddleocr.in
|
||||
urllib3==1.26.19
|
||||
|
||||
@ -1,9 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# aarch64 requires a custom build of paddlepaddle
|
||||
if [ "${ARCH}" = "aarch64" ]; then
|
||||
python3 -m pip install unstructured.paddlepaddle
|
||||
else
|
||||
python3 -m pip install paddlepaddle
|
||||
fi
|
||||
python3 -m pip install unstructured.paddleocr
|
||||
Loading…
x
Reference in New Issue
Block a user