mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-13 08:01:37 +00:00
build(deps): Remove unstructured.paddlepaddle fork (#3506)
This PR removes the `unstructured.paddlepaddle` fork. Previously, we
used the `unstructured.paddlepaddle` fork to support
`unstructured.paddleocr` on the `arm64` architecture. Currently, however,
`unstructured.paddleocr` with `unstructured.paddlepaddle` fails to work
on the `arm64` architecture, while `unstructured.paddleocr` with the latest
version of the original `paddlepaddle` works on both the `amd64` and `arm64`
architectures.
### Testing
```
os.environ["OCR_AGENT"] = "unstructured.partition.utils.ocr_models.paddle_ocr.OCRAgentPaddle"
elements = partition_pdf(
filename=<file_path>,
strategy="hi_res",
infer_table_structure=True,
)
```
This commit is contained in:
parent
a2ae2ed646
commit
d99b39923d
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -72,7 +72,6 @@ jobs:
|
|||||||
- name: Install all doc and test dependencies
|
- name: Install all doc and test dependencies
|
||||||
run: |
|
run: |
|
||||||
make install-ci
|
make install-ci
|
||||||
make install-paddleocr
|
|
||||||
make install-all-ingest
|
make install-all-ingest
|
||||||
make check-licenses
|
make check-licenses
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,6 @@ RUN chown -R notebook-user:notebook-user /app && \
|
|||||||
USER notebook-user
|
USER notebook-user
|
||||||
|
|
||||||
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
|
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
|
||||||
pip3.11 install unstructured.paddlepaddle && \
|
|
||||||
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
|
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
|
||||||
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
|
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
|
||||||
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
|
||||||
|
|||||||
4
Makefile
4
Makefile
@ -277,10 +277,6 @@ install-local-inference: install install-all-docs
|
|||||||
install-pandoc:
|
install-pandoc:
|
||||||
ARCH=${ARCH} ./scripts/install-pandoc.sh
|
ARCH=${ARCH} ./scripts/install-pandoc.sh
|
||||||
|
|
||||||
.PHONY: install-paddleocr
|
|
||||||
install-paddleocr:
|
|
||||||
ARCH=${ARCH} ./scripts/install-paddleocr.sh
|
|
||||||
|
|
||||||
## pip-compile: compiles all base/dev/test requirements
|
## pip-compile: compiles all base/dev/test requirements
|
||||||
.PHONY: pip-compile
|
.PHONY: pip-compile
|
||||||
pip-compile:
|
pip-compile:
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
-c ./deps/constraints.txt
|
-c ./deps/constraints.txt
|
||||||
-c base.txt
|
-c base.txt
|
||||||
|
|
||||||
|
paddlepaddle==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||||
unstructured.paddleocr==2.8.0.1
|
unstructured.paddleocr==2.8.0.1
|
||||||
|
|||||||
@ -4,6 +4,13 @@
|
|||||||
#
|
#
|
||||||
# pip-compile ./extra-paddleocr.in
|
# pip-compile ./extra-paddleocr.in
|
||||||
#
|
#
|
||||||
|
anyio==3.7.1
|
||||||
|
# via
|
||||||
|
# -c ././deps/constraints.txt
|
||||||
|
# -c ./base.txt
|
||||||
|
# httpx
|
||||||
|
astor==0.8.1
|
||||||
|
# via paddlepaddle
|
||||||
attrdict==2.0.1
|
attrdict==2.0.1
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
cachetools==5.4.0
|
cachetools==5.4.0
|
||||||
@ -12,6 +19,8 @@ certifi==2024.7.4
|
|||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c ././deps/constraints.txt
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
|
# httpcore
|
||||||
|
# httpx
|
||||||
# requests
|
# requests
|
||||||
charset-normalizer==3.3.2
|
charset-normalizer==3.3.2
|
||||||
# via
|
# via
|
||||||
@ -27,13 +36,33 @@ cycler==0.12.1
|
|||||||
# via matplotlib
|
# via matplotlib
|
||||||
cython==3.0.11
|
cython==3.0.11
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
|
decorator==5.1.1
|
||||||
|
# via paddlepaddle
|
||||||
et-xmlfile==1.1.0
|
et-xmlfile==1.1.0
|
||||||
# via openpyxl
|
# via openpyxl
|
||||||
|
exceptiongroup==1.2.2
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# anyio
|
||||||
fonttools==4.53.1
|
fonttools==4.53.1
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
|
h11==0.14.0
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# httpcore
|
||||||
|
httpcore==1.0.5
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# httpx
|
||||||
|
httpx==0.27.0
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# paddlepaddle
|
||||||
idna==3.7
|
idna==3.7
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
|
# anyio
|
||||||
|
# httpx
|
||||||
# requests
|
# requests
|
||||||
imageio==2.34.2
|
imageio==2.34.2
|
||||||
# via
|
# via
|
||||||
@ -59,7 +88,9 @@ matplotlib==3.9.1.post1
|
|||||||
more-itertools==10.4.0
|
more-itertools==10.4.0
|
||||||
# via cssutils
|
# via cssutils
|
||||||
networkx==3.2.1
|
networkx==3.2.1
|
||||||
# via scikit-image
|
# via
|
||||||
|
# paddlepaddle
|
||||||
|
# scikit-image
|
||||||
numpy==1.26.4
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
@ -69,6 +100,8 @@ numpy==1.26.4
|
|||||||
# matplotlib
|
# matplotlib
|
||||||
# opencv-contrib-python
|
# opencv-contrib-python
|
||||||
# opencv-python
|
# opencv-python
|
||||||
|
# opt-einsum
|
||||||
|
# paddlepaddle
|
||||||
# scikit-image
|
# scikit-image
|
||||||
# scipy
|
# scipy
|
||||||
# shapely
|
# shapely
|
||||||
@ -85,6 +118,8 @@ opencv-python==4.8.0.76
|
|||||||
# unstructured-paddleocr
|
# unstructured-paddleocr
|
||||||
openpyxl==3.1.5
|
openpyxl==3.1.5
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
|
opt-einsum==3.3.0
|
||||||
|
# via paddlepaddle
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c ././deps/constraints.txt
|
||||||
@ -92,6 +127,8 @@ packaging==23.2
|
|||||||
# lazy-loader
|
# lazy-loader
|
||||||
# matplotlib
|
# matplotlib
|
||||||
# scikit-image
|
# scikit-image
|
||||||
|
paddlepaddle==3.0.0b1
|
||||||
|
# via -r ./extra-paddleocr.in
|
||||||
pdf2image==1.17.0
|
pdf2image==1.17.0
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
pillow==10.4.0
|
pillow==10.4.0
|
||||||
@ -99,11 +136,16 @@ pillow==10.4.0
|
|||||||
# imageio
|
# imageio
|
||||||
# imgaug
|
# imgaug
|
||||||
# matplotlib
|
# matplotlib
|
||||||
|
# paddlepaddle
|
||||||
# pdf2image
|
# pdf2image
|
||||||
# scikit-image
|
# scikit-image
|
||||||
# unstructured-paddleocr
|
# unstructured-paddleocr
|
||||||
premailer==3.10.0
|
premailer==3.10.0
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
|
protobuf==4.23.4
|
||||||
|
# via
|
||||||
|
# -c ././deps/constraints.txt
|
||||||
|
# paddlepaddle
|
||||||
pyclipper==1.3.0.post5
|
pyclipper==1.3.0.post5
|
||||||
# via unstructured-paddleocr
|
# via unstructured-paddleocr
|
||||||
pyparsing==3.0.9
|
pyparsing==3.0.9
|
||||||
@ -144,12 +186,21 @@ six==1.16.0
|
|||||||
# attrdict
|
# attrdict
|
||||||
# imgaug
|
# imgaug
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
|
sniffio==1.3.1
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# anyio
|
||||||
|
# httpx
|
||||||
tifffile==2024.7.24
|
tifffile==2024.7.24
|
||||||
# via scikit-image
|
# via scikit-image
|
||||||
tqdm==4.66.5
|
tqdm==4.66.5
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
# unstructured-paddleocr
|
# unstructured-paddleocr
|
||||||
|
typing-extensions==4.12.2
|
||||||
|
# via
|
||||||
|
# -c ./base.txt
|
||||||
|
# paddlepaddle
|
||||||
unstructured-paddleocr==2.8.0.1
|
unstructured-paddleocr==2.8.0.1
|
||||||
# via -r ./extra-paddleocr.in
|
# via -r ./extra-paddleocr.in
|
||||||
urllib3==1.26.19
|
urllib3==1.26.19
|
||||||
|
|||||||
@ -1,9 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# aarch64 requires a custom build of paddlepaddle
|
|
||||||
if [ "${ARCH}" = "aarch64" ]; then
|
|
||||||
python3 -m pip install unstructured.paddlepaddle
|
|
||||||
else
|
|
||||||
python3 -m pip install paddlepaddle
|
|
||||||
fi
|
|
||||||
python3 -m pip install unstructured.paddleocr
|
|
||||||
Loading…
x
Reference in New Issue
Block a user