feat: modify test-ingest-src and evaluation-metrics to allow EXPORT_DIR (#2551)

The current `test-ingest-src.sh` and `evaluation-metrics.sh` do not allow
passing an `EXPORT_DIR` (`OUTPUT_ROOT` in `evaluation-metrics.sh`). Outputs
are currently saved under the current working directory
(`unstructured/test_unstructured_ingest`). When running the eval from
`core-product`, all outputs are therefore saved at
`core-product/upstream-unstructured/test_unstructured_ingest`, which is
undesirable.

This PR modifies two scripts to make the output directory configurable:
1. `test-ingest-src.sh` - sets `EVAL_OUTPUT_ROOT` to the value from the
environment if it exists, or to the script directory (`$SCRIPT_DIR`) if
not, and then passes it to `evaluation-metrics.sh`.
2. `evaluation-metrics.sh` - sets `OUTPUT_ROOT` to the second positional
parameter (as passed by `test-ingest-src.sh`) if given, otherwise to the
value set in the environment if it exists, otherwise to the script directory.

(Note: I also added the parameter to `evaluation-metrics.sh` because it makes
sense to allow a standalone run to specify an export directory.)

This PR should work in sync with another PR under `core-product`; I will
add the link to it here later.

**To test:**

Run the script below, changing `$SCRIPT_DIR` as needed, to see the result.

```
export OVERWRITE_FIXTURES=true

./upstream-unstructured/test_unstructured_ingest/src/s3.sh

SCRIPT_DIR=$(dirname "$(realpath "$0")")
bash -x ./upstream-unstructured/test_unstructured_ingest/evaluation-metrics.sh text-extraction "$SCRIPT_DIR"
```

----

This PR also updates the requirements by running `make pip-compile`, since
the `click` module was not found.
This commit is contained in:
Klaijan 2024-02-17 12:21:15 +07:00 committed by GitHub
parent ad561b7939
commit d06936d35a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 96 additions and 94 deletions

View File

@ -52,7 +52,7 @@ mypy-extensions==1.0.0
# unstructured-client
nltk==3.8.1
# via -r base.in
numpy==1.26.3
numpy==1.26.4
# via -r base.in
packaging==23.2
# via
@ -60,7 +60,7 @@ packaging==23.2
# unstructured-client
python-dateutil==2.8.2
# via unstructured-client
python-iso639==2024.1.2
python-iso639==2024.2.7
# via -r base.in
python-magic==0.4.27
# via -r base.in
@ -81,7 +81,7 @@ soupsieve==2.5
# via beautifulsoup4
tabulate==0.9.0
# via -r base.in
tqdm==4.66.1
tqdm==4.66.2
# via nltk
typing-extensions==4.9.0
# via
@ -93,7 +93,7 @@ typing-inspect==0.9.0
# dataclasses-json
# dataclasses-json-speakeasy
# unstructured-client
unstructured-client==0.17.0
unstructured-client==0.18.0
# via -r base.in
urllib3==1.26.18
# via

View File

@ -9,7 +9,7 @@ anyio==3.7.1
# -c constraints.in
# httpx
# jupyter-server
appnope==0.1.3
appnope==0.1.4
# via
# ipykernel
# ipython
@ -65,7 +65,7 @@ comm==0.2.1
# via
# ipykernel
# ipywidgets
debugpy==1.8.0
debugpy==1.8.1
# via ipykernel
decorator==5.1.1
# via ipython
@ -87,11 +87,11 @@ fqdn==1.5.1
# via jsonschema
h11==0.14.0
# via httpcore
httpcore==1.0.2
httpcore==1.0.3
# via httpx
httpx==0.26.0
# via jupyterlab
identify==2.5.33
identify==2.5.34
# via pre-commit
idna==3.6
# via
@ -109,7 +109,7 @@ importlib-metadata==7.0.1
# jupyterlab
# jupyterlab-server
# nbconvert
ipykernel==6.29.0
ipykernel==6.29.2
# via
# jupyter
# jupyter-console
@ -122,7 +122,7 @@ ipython==8.12.3
# ipykernel
# ipywidgets
# jupyter-console
ipywidgets==8.1.1
ipywidgets==8.1.2
# via jupyter
isoduration==20.11.0
# via jsonschema
@ -181,15 +181,15 @@ jupyter-server==2.12.5
# notebook-shim
jupyter-server-terminals==0.5.2
# via jupyter-server
jupyterlab==4.1.0
jupyterlab==4.1.1
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
jupyterlab-server==2.25.2
jupyterlab-server==2.25.3
# via
# jupyterlab
# notebook
jupyterlab-widgets==3.0.9
jupyterlab-widgets==3.0.10
# via ipywidgets
markupsafe==2.1.5
# via
@ -203,7 +203,7 @@ mistune==3.0.2
# via nbconvert
nbclient==0.9.0
# via nbconvert
nbconvert==7.14.2
nbconvert==7.16.0
# via
# jupyter
# jupyter-server
@ -216,9 +216,9 @@ nest-asyncio==1.6.0
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.0.7
notebook==7.1.0
# via jupyter
notebook-shim==0.2.3
notebook-shim==0.2.4
# via
# jupyterlab
# notebook
@ -252,9 +252,9 @@ platformdirs==3.10.0
# -c test.txt
# jupyter-core
# virtualenv
pre-commit==3.6.0
pre-commit==3.6.1
# via -r dev.in
prometheus-client==0.19.0
prometheus-client==0.20.0
# via jupyter-server
prompt-toolkit==3.0.43
# via
@ -320,7 +320,7 @@ rfc3986-validator==0.1.1
# via
# jsonschema
# jupyter-events
rpds-py==0.17.1
rpds-py==0.18.0
# via
# jsonschema
# referencing
@ -414,7 +414,7 @@ wheel==0.42.0
# via
# -c constraints.in
# pip-tools
widgetsnbextension==4.0.9
widgetsnbextension==4.0.10
# via ipywidgets
zipp==3.17.0
# via importlib-metadata

View File

@ -4,7 +4,7 @@
#
# pip-compile --output-file=extra-csv.txt extra-csv.in
#
numpy==1.26.3
numpy==1.26.4
# via
# -c base.txt
# pandas
@ -20,5 +20,5 @@ six==1.16.0
# via
# -c base.txt
# python-dateutil
tzdata==2023.4
tzdata==2024.1
# via pandas

View File

@ -45,7 +45,7 @@ flask==3.0.2
# visualdl
flask-babel==4.0.0
# via visualdl
fonttools==4.47.2
fonttools==4.49.0
# via matplotlib
future==0.18.3
# via bce-python-sdk
@ -53,7 +53,7 @@ idna==3.6
# via
# -c base.txt
# requests
imageio==2.33.1
imageio==2.34.0
# via
# imgaug
# scikit-image
@ -93,7 +93,7 @@ matplotlib==3.7.2
# visualdl
networkx==3.2.1
# via scikit-image
numpy==1.26.3
numpy==1.26.4
# via
# -c base.txt
# contourpy
@ -197,13 +197,13 @@ six==1.16.0
# imgaug
# python-dateutil
# visualdl
tifffile==2024.1.30
tifffile==2024.2.12
# via scikit-image
tqdm==4.66.1
tqdm==4.66.2
# via
# -c base.txt
# unstructured-paddleocr
tzdata==2023.4
tzdata==2024.1
# via pandas
unstructured-paddleocr==2.6.1.3
# via -r extra-paddleocr.in

View File

@ -37,7 +37,7 @@ filelock==3.13.1
# transformers
flatbuffers==23.5.26
# via onnxruntime
fonttools==4.47.2
fonttools==4.49.0
# via matplotlib
fsspec==2024.2.0
# via
@ -79,7 +79,7 @@ mpmath==1.3.0
# via sympy
networkx==3.2.1
# via torch
numpy==1.26.3
numpy==1.26.4
# via
# -c base.txt
# contourpy
@ -128,7 +128,7 @@ pdfminer-six==20221105
# via
# -r extra-pdf-image.in
# pdfplumber
pdfplumber==0.10.3
pdfplumber==0.10.4
# via layoutparser
pikepdf==8.11.0
# via -r extra-pdf-image.in
@ -164,7 +164,7 @@ pyparsing==3.0.9
# matplotlib
pypdf==4.0.1
# via -r extra-pdf-image.in
pypdfium2==4.26.0
pypdfium2==4.27.0
# via pdfplumber
pytesseract==0.3.10
# via layoutparser
@ -173,7 +173,7 @@ python-dateutil==2.8.2
# -c base.txt
# matplotlib
# pandas
python-multipart==0.0.7
python-multipart==0.0.9
# via unstructured-inference
pytz==2024.1
# via pandas
@ -217,7 +217,7 @@ sympy==1.12
# torch
timm==0.9.12
# via effdet
tokenizers==0.15.1
tokenizers==0.15.2
# via transformers
torch==2.2.0
# via
@ -231,7 +231,7 @@ torchvision==0.17.0
# effdet
# layoutparser
# timm
tqdm==4.66.1
tqdm==4.66.2
# via
# -c base.txt
# huggingface-hub
@ -246,7 +246,7 @@ typing-extensions==4.9.0
# iopath
# pypdf
# torch
tzdata==2023.4
tzdata==2024.1
# via pandas
unstructured-inference==0.7.23
# via -r extra-pdf-image.in

View File

@ -8,7 +8,7 @@ et-xmlfile==1.1.0
# via openpyxl
networkx==3.2.1
# via -r extra-xlsx.in
numpy==1.26.3
numpy==1.26.4
# via
# -c base.txt
# pandas
@ -26,7 +26,7 @@ six==1.16.0
# via
# -c base.txt
# python-dateutil
tzdata==2023.4
tzdata==2024.1
# via pandas
xlrd==2.0.1
# via -r extra-xlsx.in

View File

@ -50,7 +50,7 @@ mpmath==1.3.0
# via sympy
networkx==3.2.1
# via torch
numpy==1.26.3
numpy==1.26.4
# via
# -c base.txt
# transformers
@ -87,13 +87,13 @@ six==1.16.0
# langdetect
sympy==1.12
# via torch
tokenizers==0.15.1
tokenizers==0.15.2
# via transformers
torch==2.2.0
# via
# -c constraints.in
# -r huggingface.in
tqdm==4.66.1
tqdm==4.66.2
# via
# -c base.txt
# huggingface-hub

View File

@ -59,7 +59,7 @@ flatbuffers==23.5.26
# via onnxruntime
fsspec==2024.2.0
# via huggingface-hub
google-auth==2.27.0
google-auth==2.28.0
# via kubernetes
googleapis-common-protos==1.62.0
# via opentelemetry-exporter-otlp-proto-grpc
@ -94,7 +94,7 @@ monotonic==1.6
# via posthog
mpmath==1.3.0
# via sympy
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# chroma-hnswlib
@ -153,7 +153,7 @@ packaging==23.2
# build
# huggingface-hub
# onnxruntime
posthog==3.4.0
posthog==3.4.1
# via chromadb
protobuf==4.23.4
# via
@ -217,13 +217,13 @@ sympy==1.12
# via onnxruntime
tenacity==8.2.3
# via chromadb
tokenizers==0.15.1
tokenizers==0.15.2
# via chromadb
tomli==2.0.1
# via
# build
# pyproject-hooks
tqdm==4.66.1
tqdm==4.66.2
# via
# -c ingest/../base.txt
# chromadb
@ -248,7 +248,7 @@ urllib3==1.26.18
# -c ingest/../constraints.in
# kubernetes
# requests
uvicorn[standard]==0.27.0.post1
uvicorn[standard]==0.27.1
# via
# chromadb
# uvicorn

View File

@ -4,7 +4,7 @@
#
# pip-compile --output-file=ingest/confluence.txt ingest/confluence.in
#
atlassian-python-api==3.41.9
atlassian-python-api==3.41.10
# via -r ingest/confluence.in
beautifulsoup4==4.12.3
# via

View File

@ -15,9 +15,9 @@ charset-normalizer==3.3.2
# via
# -c ingest/../base.txt
# requests
databricks-sdk==0.18.0
databricks-sdk==0.19.1
# via -r ingest/databricks-volumes.in
google-auth==2.27.0
google-auth==2.28.0
# via databricks-sdk
idna==3.6
# via

View File

@ -4,11 +4,11 @@
#
# pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
#
deltalake==0.15.2
deltalake==0.15.3
# via -r ingest/delta-table.in
fsspec==2024.2.0
# via -r ingest/delta-table.in
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# pyarrow

View File

@ -58,11 +58,11 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain-community==0.0.17
langchain-community==0.0.20
# via -r ingest/embed-aws-bedrock.in
langchain-core==0.1.18
langchain-core==0.1.23
# via langchain-community
langsmith==0.0.86
langsmith==0.0.87
# via
# langchain-community
# langchain-core
@ -78,7 +78,7 @@ mypy-extensions==1.0.0
# via
# -c ingest/../base.txt
# typing-inspect
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# langchain-community
@ -114,7 +114,7 @@ six==1.16.0
# python-dateutil
sniffio==1.3.0
# via anyio
sqlalchemy==2.0.25
sqlalchemy==2.0.27
# via langchain-community
tenacity==8.2.3
# via

View File

@ -72,11 +72,11 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain-community==0.0.17
langchain-community==0.0.20
# via -r ingest/embed-huggingface.in
langchain-core==0.1.18
langchain-core==0.1.23
# via langchain-community
langsmith==0.0.86
langsmith==0.0.87
# via
# langchain-community
# langchain-core
@ -102,7 +102,7 @@ nltk==3.8.1
# via
# -c ingest/../base.txt
# sentence-transformers
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# langchain-community
@ -160,7 +160,7 @@ sentencepiece==0.1.99
# via sentence-transformers
sniffio==1.3.0
# via anyio
sqlalchemy==2.0.25
sqlalchemy==2.0.27
# via langchain-community
sympy==1.12
# via torch
@ -168,15 +168,15 @@ tenacity==8.2.3
# via
# langchain-community
# langchain-core
threadpoolctl==3.2.0
threadpoolctl==3.3.0
# via scikit-learn
tokenizers==0.15.1
tokenizers==0.15.2
# via transformers
torch==2.2.0
# via
# -c ingest/../constraints.in
# sentence-transformers
tqdm==4.66.1
tqdm==4.66.2
# via
# -c ingest/../base.txt
# huggingface-hub

View File

@ -43,7 +43,7 @@ frozenlist==1.4.1
# aiosignal
h11==0.14.0
# via httpcore
httpcore==1.0.2
httpcore==1.0.3
# via httpx
httpx==0.26.0
# via openai
@ -58,11 +58,11 @@ jsonpatch==1.33
# via langchain-core
jsonpointer==2.4
# via jsonpatch
langchain-community==0.0.17
langchain-community==0.0.20
# via -r ingest/embed-openai.in
langchain-core==0.1.18
langchain-core==0.1.23
# via langchain-community
langsmith==0.0.86
langsmith==0.0.87
# via
# langchain-community
# langchain-core
@ -78,11 +78,11 @@ mypy-extensions==1.0.0
# via
# -c ingest/../base.txt
# typing-inspect
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# langchain-community
openai==1.11.1
openai==1.12.0
# via -r ingest/embed-openai.in
packaging==23.2
# via
@ -115,15 +115,15 @@ sniffio==1.3.0
# anyio
# httpx
# openai
sqlalchemy==2.0.25
sqlalchemy==2.0.27
# via langchain-community
tenacity==8.2.3
# via
# langchain-community
# langchain-core
tiktoken==0.5.2
tiktoken==0.6.0
# via -r ingest/embed-openai.in
tqdm==4.66.1
tqdm==4.66.2
# via
# -c ingest/../base.txt
# openai

View File

@ -41,11 +41,11 @@ fsspec==2024.2.0
# gcsfs
gcsfs==2024.2.0
# via -r ingest/gcs.in
google-api-core==2.16.2
google-api-core==2.17.1
# via
# google-cloud-core
# google-cloud-storage
google-auth==2.27.0
google-auth==2.28.0
# via
# gcsfs
# google-api-core

View File

@ -15,11 +15,11 @@ charset-normalizer==3.3.2
# via
# -c ingest/../base.txt
# requests
google-api-core==2.16.2
google-api-core==2.17.1
# via google-api-python-client
google-api-python-client==2.116.0
google-api-python-client==2.118.0
# via -r ingest/google-drive.in
google-auth==2.27.0
google-auth==2.28.0
# via
# google-api-core
# google-api-python-client

View File

@ -4,7 +4,7 @@
#
# pip-compile --output-file=ingest/jira.txt ingest/jira.in
#
atlassian-python-api==3.41.9
atlassian-python-api==3.41.10
# via -r ingest/jira.in
beautifulsoup4==4.12.3
# via

View File

@ -20,7 +20,7 @@ h11==0.14.0
# via httpcore
htmlbuilder==1.0.0
# via -r ingest/notion.in
httpcore==1.0.2
httpcore==1.0.3
# via httpx
httpx==0.26.0
# via notion-client

View File

@ -21,7 +21,7 @@ idna==3.6
# requests
loguru==0.7.2
# via pinecone-client
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# pinecone-client
@ -41,7 +41,7 @@ six==1.16.0
# via
# -c ingest/../base.txt
# python-dateutil
tqdm==4.66.1
tqdm==4.66.2
# via
# -c ingest/../base.txt
# pinecone-client

View File

@ -28,7 +28,7 @@ h2==4.1.0
# via httpx
hpack==4.0.0
# via h2
httpcore==1.0.2
httpcore==1.0.3
# via httpx
httpx[http2]==0.26.0
# via
@ -41,7 +41,7 @@ idna==3.6
# -c ingest/../base.txt
# anyio
# httpx
numpy==1.26.3
numpy==1.26.4
# via
# -c ingest/../base.txt
# qdrant-client
@ -55,7 +55,7 @@ pydantic==1.10.14
# via
# -c ingest/../constraints.in
# qdrant-client
qdrant-client==1.7.2
qdrant-client==1.7.3
# via -r ingest/qdrant.in
sniffio==1.3.0
# via

View File

@ -68,7 +68,7 @@ six==1.16.0
# python-dateutil
time-machine==2.13.0
# via pendulum
tzdata==2023.4
tzdata==2024.1
# via pendulum
urllib3==1.26.18
# via

View File

@ -4,5 +4,5 @@
#
# pip-compile --output-file=ingest/slack.txt ingest/slack.in
#
slack-sdk==3.26.2
slack-sdk==3.27.0
# via -r ingest/slack.in

View File

@ -8,7 +8,7 @@ appdirs==1.4.4
# via label-studio-tools
autoflake==2.2.1
# via -r test.in
black==24.1.1
black==24.2.0
# via -r test.in
certifi==2024.2.2
# via
@ -111,7 +111,7 @@ requests==2.31.0
# via
# -c base.txt
# label-studio-sdk
ruff==0.2.0
ruff==0.2.1
# via -r test.in
six==1.16.0
# via

View File

@ -5,13 +5,13 @@ set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
EVAL_NAME="$1"
# List all structured outputs to use in this evaluation
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_ROOT=${2:-${OUTPUT_ROOT:-$SCRIPT_DIR}}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval
mkdir -p "$OUTPUT_DIR"
EVAL_NAME="$1"
if [ "$EVAL_NAME" == "text-extraction" ]; then
METRIC_STRATEGY="measure-text-extraction-accuracy-command"
elif [ "$EVAL_NAME" == "element-type" ]; then

View File

@ -11,6 +11,8 @@ fi
touch "$SKIPPED_FILES_LOG"
cd "$SCRIPT_DIR"/.. || exit 1
EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR}
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1
@ -121,6 +123,6 @@ all_eval=(
for eval in "${all_eval[@]}"; do
CURRENT_TEST="evaluation-metrics.sh $eval"
echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
./test_unstructured_ingest/evaluation-metrics.sh "$eval"
./test_unstructured_ingest/evaluation-metrics.sh "$eval" "$EVAL_OUTPUT_ROOT"
echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
done