mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 15:37:58 +00:00
feat: modify test-ingest-src and evaluation-metrics to allow EXPORT_DIR (#2551)
The current `test-ingest-src.sh` and `evaluation-metrics.sh` scripts do not allow passing an `EXPORT_DIR` (`OUTPUT_ROOT` in `evaluation-metrics.sh`). Output is currently always saved under the scripts' own directory (`unstructured/test_unstructured_ingest`). When the eval is run from `core-product`, all output is therefore saved at `core-product/upstream-unstructured/test_unstructured_ingest`, which is undesirable.

This PR modifies the two scripts to make the output location configurable:

1. `test-ingest-src.sh` - assigns `EVAL_OUTPUT_ROOT` the value set in the environment if it exists, or the script directory (`$SCRIPT_DIR`) if not, and then passes it to `evaluation-metrics.sh`.
2. `evaluation-metrics.sh` - sets `OUTPUT_ROOT` from the second positional parameter (as passed by `test-ingest-src.sh`) if it is given, otherwise from the value set in the environment if it exists, otherwise to the script directory (`$SCRIPT_DIR`).

(Note: I also added the parameter to `evaluation-metrics.sh` because it makes sense for a standalone run to be able to specify an export directory.)

This PR should work in sync with another PR under `core-product`; I will add the link to it here later.

**To test:** Run the script below, changing `$SCRIPT_DIR` as needed to see the result.

```
export OVERWRITE_FIXTURES=true
./upstream-unstructured/test_unstructured_ingest/src/s3.sh
SCRIPT_DIR=$(dirname "$(realpath "$0")")
bash -x ./upstream-unstructured/test_unstructured_ingest/evaluation-metrics.sh text-extraction "$SCRIPT_DIR"
```

----

This PR also updates the requirements via `make pip-compile`, since the `click` module was not found.
This commit is contained in:
parent
ad561b7939
commit
d06936d35a
@ -52,7 +52,7 @@ mypy-extensions==1.0.0
|
|||||||
# unstructured-client
|
# unstructured-client
|
||||||
nltk==3.8.1
|
nltk==3.8.1
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -60,7 +60,7 @@ packaging==23.2
|
|||||||
# unstructured-client
|
# unstructured-client
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
# via unstructured-client
|
# via unstructured-client
|
||||||
python-iso639==2024.1.2
|
python-iso639==2024.2.7
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
python-magic==0.4.27
|
python-magic==0.4.27
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
@ -81,7 +81,7 @@ soupsieve==2.5
|
|||||||
# via beautifulsoup4
|
# via beautifulsoup4
|
||||||
tabulate==0.9.0
|
tabulate==0.9.0
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via nltk
|
# via nltk
|
||||||
typing-extensions==4.9.0
|
typing-extensions==4.9.0
|
||||||
# via
|
# via
|
||||||
@ -93,7 +93,7 @@ typing-inspect==0.9.0
|
|||||||
# dataclasses-json
|
# dataclasses-json
|
||||||
# dataclasses-json-speakeasy
|
# dataclasses-json-speakeasy
|
||||||
# unstructured-client
|
# unstructured-client
|
||||||
unstructured-client==0.17.0
|
unstructured-client==0.18.0
|
||||||
# via -r base.in
|
# via -r base.in
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -9,7 +9,7 @@ anyio==3.7.1
|
|||||||
# -c constraints.in
|
# -c constraints.in
|
||||||
# httpx
|
# httpx
|
||||||
# jupyter-server
|
# jupyter-server
|
||||||
appnope==0.1.3
|
appnope==0.1.4
|
||||||
# via
|
# via
|
||||||
# ipykernel
|
# ipykernel
|
||||||
# ipython
|
# ipython
|
||||||
@ -65,7 +65,7 @@ comm==0.2.1
|
|||||||
# via
|
# via
|
||||||
# ipykernel
|
# ipykernel
|
||||||
# ipywidgets
|
# ipywidgets
|
||||||
debugpy==1.8.0
|
debugpy==1.8.1
|
||||||
# via ipykernel
|
# via ipykernel
|
||||||
decorator==5.1.1
|
decorator==5.1.1
|
||||||
# via ipython
|
# via ipython
|
||||||
@ -87,11 +87,11 @@ fqdn==1.5.1
|
|||||||
# via jsonschema
|
# via jsonschema
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
# via httpcore
|
# via httpcore
|
||||||
httpcore==1.0.2
|
httpcore==1.0.3
|
||||||
# via httpx
|
# via httpx
|
||||||
httpx==0.26.0
|
httpx==0.26.0
|
||||||
# via jupyterlab
|
# via jupyterlab
|
||||||
identify==2.5.33
|
identify==2.5.34
|
||||||
# via pre-commit
|
# via pre-commit
|
||||||
idna==3.6
|
idna==3.6
|
||||||
# via
|
# via
|
||||||
@ -109,7 +109,7 @@ importlib-metadata==7.0.1
|
|||||||
# jupyterlab
|
# jupyterlab
|
||||||
# jupyterlab-server
|
# jupyterlab-server
|
||||||
# nbconvert
|
# nbconvert
|
||||||
ipykernel==6.29.0
|
ipykernel==6.29.2
|
||||||
# via
|
# via
|
||||||
# jupyter
|
# jupyter
|
||||||
# jupyter-console
|
# jupyter-console
|
||||||
@ -122,7 +122,7 @@ ipython==8.12.3
|
|||||||
# ipykernel
|
# ipykernel
|
||||||
# ipywidgets
|
# ipywidgets
|
||||||
# jupyter-console
|
# jupyter-console
|
||||||
ipywidgets==8.1.1
|
ipywidgets==8.1.2
|
||||||
# via jupyter
|
# via jupyter
|
||||||
isoduration==20.11.0
|
isoduration==20.11.0
|
||||||
# via jsonschema
|
# via jsonschema
|
||||||
@ -181,15 +181,15 @@ jupyter-server==2.12.5
|
|||||||
# notebook-shim
|
# notebook-shim
|
||||||
jupyter-server-terminals==0.5.2
|
jupyter-server-terminals==0.5.2
|
||||||
# via jupyter-server
|
# via jupyter-server
|
||||||
jupyterlab==4.1.0
|
jupyterlab==4.1.1
|
||||||
# via notebook
|
# via notebook
|
||||||
jupyterlab-pygments==0.3.0
|
jupyterlab-pygments==0.3.0
|
||||||
# via nbconvert
|
# via nbconvert
|
||||||
jupyterlab-server==2.25.2
|
jupyterlab-server==2.25.3
|
||||||
# via
|
# via
|
||||||
# jupyterlab
|
# jupyterlab
|
||||||
# notebook
|
# notebook
|
||||||
jupyterlab-widgets==3.0.9
|
jupyterlab-widgets==3.0.10
|
||||||
# via ipywidgets
|
# via ipywidgets
|
||||||
markupsafe==2.1.5
|
markupsafe==2.1.5
|
||||||
# via
|
# via
|
||||||
@ -203,7 +203,7 @@ mistune==3.0.2
|
|||||||
# via nbconvert
|
# via nbconvert
|
||||||
nbclient==0.9.0
|
nbclient==0.9.0
|
||||||
# via nbconvert
|
# via nbconvert
|
||||||
nbconvert==7.14.2
|
nbconvert==7.16.0
|
||||||
# via
|
# via
|
||||||
# jupyter
|
# jupyter
|
||||||
# jupyter-server
|
# jupyter-server
|
||||||
@ -216,9 +216,9 @@ nest-asyncio==1.6.0
|
|||||||
# via ipykernel
|
# via ipykernel
|
||||||
nodeenv==1.8.0
|
nodeenv==1.8.0
|
||||||
# via pre-commit
|
# via pre-commit
|
||||||
notebook==7.0.7
|
notebook==7.1.0
|
||||||
# via jupyter
|
# via jupyter
|
||||||
notebook-shim==0.2.3
|
notebook-shim==0.2.4
|
||||||
# via
|
# via
|
||||||
# jupyterlab
|
# jupyterlab
|
||||||
# notebook
|
# notebook
|
||||||
@ -252,9 +252,9 @@ platformdirs==3.10.0
|
|||||||
# -c test.txt
|
# -c test.txt
|
||||||
# jupyter-core
|
# jupyter-core
|
||||||
# virtualenv
|
# virtualenv
|
||||||
pre-commit==3.6.0
|
pre-commit==3.6.1
|
||||||
# via -r dev.in
|
# via -r dev.in
|
||||||
prometheus-client==0.19.0
|
prometheus-client==0.20.0
|
||||||
# via jupyter-server
|
# via jupyter-server
|
||||||
prompt-toolkit==3.0.43
|
prompt-toolkit==3.0.43
|
||||||
# via
|
# via
|
||||||
@ -320,7 +320,7 @@ rfc3986-validator==0.1.1
|
|||||||
# via
|
# via
|
||||||
# jsonschema
|
# jsonschema
|
||||||
# jupyter-events
|
# jupyter-events
|
||||||
rpds-py==0.17.1
|
rpds-py==0.18.0
|
||||||
# via
|
# via
|
||||||
# jsonschema
|
# jsonschema
|
||||||
# referencing
|
# referencing
|
||||||
@ -414,7 +414,7 @@ wheel==0.42.0
|
|||||||
# via
|
# via
|
||||||
# -c constraints.in
|
# -c constraints.in
|
||||||
# pip-tools
|
# pip-tools
|
||||||
widgetsnbextension==4.0.9
|
widgetsnbextension==4.0.10
|
||||||
# via ipywidgets
|
# via ipywidgets
|
||||||
zipp==3.17.0
|
zipp==3.17.0
|
||||||
# via importlib-metadata
|
# via importlib-metadata
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=extra-csv.txt extra-csv.in
|
# pip-compile --output-file=extra-csv.txt extra-csv.in
|
||||||
#
|
#
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# pandas
|
# pandas
|
||||||
@ -20,5 +20,5 @@ six==1.16.0
|
|||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
tzdata==2023.4
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
|
|||||||
@ -45,7 +45,7 @@ flask==3.0.2
|
|||||||
# visualdl
|
# visualdl
|
||||||
flask-babel==4.0.0
|
flask-babel==4.0.0
|
||||||
# via visualdl
|
# via visualdl
|
||||||
fonttools==4.47.2
|
fonttools==4.49.0
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
future==0.18.3
|
future==0.18.3
|
||||||
# via bce-python-sdk
|
# via bce-python-sdk
|
||||||
@ -53,7 +53,7 @@ idna==3.6
|
|||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# requests
|
# requests
|
||||||
imageio==2.33.1
|
imageio==2.34.0
|
||||||
# via
|
# via
|
||||||
# imgaug
|
# imgaug
|
||||||
# scikit-image
|
# scikit-image
|
||||||
@ -93,7 +93,7 @@ matplotlib==3.7.2
|
|||||||
# visualdl
|
# visualdl
|
||||||
networkx==3.2.1
|
networkx==3.2.1
|
||||||
# via scikit-image
|
# via scikit-image
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# contourpy
|
# contourpy
|
||||||
@ -197,13 +197,13 @@ six==1.16.0
|
|||||||
# imgaug
|
# imgaug
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
# visualdl
|
# visualdl
|
||||||
tifffile==2024.1.30
|
tifffile==2024.2.12
|
||||||
# via scikit-image
|
# via scikit-image
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# unstructured-paddleocr
|
# unstructured-paddleocr
|
||||||
tzdata==2023.4
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
unstructured-paddleocr==2.6.1.3
|
unstructured-paddleocr==2.6.1.3
|
||||||
# via -r extra-paddleocr.in
|
# via -r extra-paddleocr.in
|
||||||
|
|||||||
@ -37,7 +37,7 @@ filelock==3.13.1
|
|||||||
# transformers
|
# transformers
|
||||||
flatbuffers==23.5.26
|
flatbuffers==23.5.26
|
||||||
# via onnxruntime
|
# via onnxruntime
|
||||||
fonttools==4.47.2
|
fonttools==4.49.0
|
||||||
# via matplotlib
|
# via matplotlib
|
||||||
fsspec==2024.2.0
|
fsspec==2024.2.0
|
||||||
# via
|
# via
|
||||||
@ -79,7 +79,7 @@ mpmath==1.3.0
|
|||||||
# via sympy
|
# via sympy
|
||||||
networkx==3.2.1
|
networkx==3.2.1
|
||||||
# via torch
|
# via torch
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# contourpy
|
# contourpy
|
||||||
@ -128,7 +128,7 @@ pdfminer-six==20221105
|
|||||||
# via
|
# via
|
||||||
# -r extra-pdf-image.in
|
# -r extra-pdf-image.in
|
||||||
# pdfplumber
|
# pdfplumber
|
||||||
pdfplumber==0.10.3
|
pdfplumber==0.10.4
|
||||||
# via layoutparser
|
# via layoutparser
|
||||||
pikepdf==8.11.0
|
pikepdf==8.11.0
|
||||||
# via -r extra-pdf-image.in
|
# via -r extra-pdf-image.in
|
||||||
@ -164,7 +164,7 @@ pyparsing==3.0.9
|
|||||||
# matplotlib
|
# matplotlib
|
||||||
pypdf==4.0.1
|
pypdf==4.0.1
|
||||||
# via -r extra-pdf-image.in
|
# via -r extra-pdf-image.in
|
||||||
pypdfium2==4.26.0
|
pypdfium2==4.27.0
|
||||||
# via pdfplumber
|
# via pdfplumber
|
||||||
pytesseract==0.3.10
|
pytesseract==0.3.10
|
||||||
# via layoutparser
|
# via layoutparser
|
||||||
@ -173,7 +173,7 @@ python-dateutil==2.8.2
|
|||||||
# -c base.txt
|
# -c base.txt
|
||||||
# matplotlib
|
# matplotlib
|
||||||
# pandas
|
# pandas
|
||||||
python-multipart==0.0.7
|
python-multipart==0.0.9
|
||||||
# via unstructured-inference
|
# via unstructured-inference
|
||||||
pytz==2024.1
|
pytz==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
@ -217,7 +217,7 @@ sympy==1.12
|
|||||||
# torch
|
# torch
|
||||||
timm==0.9.12
|
timm==0.9.12
|
||||||
# via effdet
|
# via effdet
|
||||||
tokenizers==0.15.1
|
tokenizers==0.15.2
|
||||||
# via transformers
|
# via transformers
|
||||||
torch==2.2.0
|
torch==2.2.0
|
||||||
# via
|
# via
|
||||||
@ -231,7 +231,7 @@ torchvision==0.17.0
|
|||||||
# effdet
|
# effdet
|
||||||
# layoutparser
|
# layoutparser
|
||||||
# timm
|
# timm
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
@ -246,7 +246,7 @@ typing-extensions==4.9.0
|
|||||||
# iopath
|
# iopath
|
||||||
# pypdf
|
# pypdf
|
||||||
# torch
|
# torch
|
||||||
tzdata==2023.4
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
unstructured-inference==0.7.23
|
unstructured-inference==0.7.23
|
||||||
# via -r extra-pdf-image.in
|
# via -r extra-pdf-image.in
|
||||||
|
|||||||
@ -8,7 +8,7 @@ et-xmlfile==1.1.0
|
|||||||
# via openpyxl
|
# via openpyxl
|
||||||
networkx==3.2.1
|
networkx==3.2.1
|
||||||
# via -r extra-xlsx.in
|
# via -r extra-xlsx.in
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# pandas
|
# pandas
|
||||||
@ -26,7 +26,7 @@ six==1.16.0
|
|||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
tzdata==2023.4
|
tzdata==2024.1
|
||||||
# via pandas
|
# via pandas
|
||||||
xlrd==2.0.1
|
xlrd==2.0.1
|
||||||
# via -r extra-xlsx.in
|
# via -r extra-xlsx.in
|
||||||
|
|||||||
@ -50,7 +50,7 @@ mpmath==1.3.0
|
|||||||
# via sympy
|
# via sympy
|
||||||
networkx==3.2.1
|
networkx==3.2.1
|
||||||
# via torch
|
# via torch
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# transformers
|
# transformers
|
||||||
@ -87,13 +87,13 @@ six==1.16.0
|
|||||||
# langdetect
|
# langdetect
|
||||||
sympy==1.12
|
sympy==1.12
|
||||||
# via torch
|
# via torch
|
||||||
tokenizers==0.15.1
|
tokenizers==0.15.2
|
||||||
# via transformers
|
# via transformers
|
||||||
torch==2.2.0
|
torch==2.2.0
|
||||||
# via
|
# via
|
||||||
# -c constraints.in
|
# -c constraints.in
|
||||||
# -r huggingface.in
|
# -r huggingface.in
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
|
|||||||
@ -59,7 +59,7 @@ flatbuffers==23.5.26
|
|||||||
# via onnxruntime
|
# via onnxruntime
|
||||||
fsspec==2024.2.0
|
fsspec==2024.2.0
|
||||||
# via huggingface-hub
|
# via huggingface-hub
|
||||||
google-auth==2.27.0
|
google-auth==2.28.0
|
||||||
# via kubernetes
|
# via kubernetes
|
||||||
googleapis-common-protos==1.62.0
|
googleapis-common-protos==1.62.0
|
||||||
# via opentelemetry-exporter-otlp-proto-grpc
|
# via opentelemetry-exporter-otlp-proto-grpc
|
||||||
@ -94,7 +94,7 @@ monotonic==1.6
|
|||||||
# via posthog
|
# via posthog
|
||||||
mpmath==1.3.0
|
mpmath==1.3.0
|
||||||
# via sympy
|
# via sympy
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# chroma-hnswlib
|
# chroma-hnswlib
|
||||||
@ -153,7 +153,7 @@ packaging==23.2
|
|||||||
# build
|
# build
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
# onnxruntime
|
# onnxruntime
|
||||||
posthog==3.4.0
|
posthog==3.4.1
|
||||||
# via chromadb
|
# via chromadb
|
||||||
protobuf==4.23.4
|
protobuf==4.23.4
|
||||||
# via
|
# via
|
||||||
@ -217,13 +217,13 @@ sympy==1.12
|
|||||||
# via onnxruntime
|
# via onnxruntime
|
||||||
tenacity==8.2.3
|
tenacity==8.2.3
|
||||||
# via chromadb
|
# via chromadb
|
||||||
tokenizers==0.15.1
|
tokenizers==0.15.2
|
||||||
# via chromadb
|
# via chromadb
|
||||||
tomli==2.0.1
|
tomli==2.0.1
|
||||||
# via
|
# via
|
||||||
# build
|
# build
|
||||||
# pyproject-hooks
|
# pyproject-hooks
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# chromadb
|
# chromadb
|
||||||
@ -248,7 +248,7 @@ urllib3==1.26.18
|
|||||||
# -c ingest/../constraints.in
|
# -c ingest/../constraints.in
|
||||||
# kubernetes
|
# kubernetes
|
||||||
# requests
|
# requests
|
||||||
uvicorn[standard]==0.27.0.post1
|
uvicorn[standard]==0.27.1
|
||||||
# via
|
# via
|
||||||
# chromadb
|
# chromadb
|
||||||
# uvicorn
|
# uvicorn
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=ingest/confluence.txt ingest/confluence.in
|
# pip-compile --output-file=ingest/confluence.txt ingest/confluence.in
|
||||||
#
|
#
|
||||||
atlassian-python-api==3.41.9
|
atlassian-python-api==3.41.10
|
||||||
# via -r ingest/confluence.in
|
# via -r ingest/confluence.in
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -15,9 +15,9 @@ charset-normalizer==3.3.2
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
databricks-sdk==0.18.0
|
databricks-sdk==0.19.1
|
||||||
# via -r ingest/databricks-volumes.in
|
# via -r ingest/databricks-volumes.in
|
||||||
google-auth==2.27.0
|
google-auth==2.28.0
|
||||||
# via databricks-sdk
|
# via databricks-sdk
|
||||||
idna==3.6
|
idna==3.6
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -4,11 +4,11 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
|
# pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
|
||||||
#
|
#
|
||||||
deltalake==0.15.2
|
deltalake==0.15.3
|
||||||
# via -r ingest/delta-table.in
|
# via -r ingest/delta-table.in
|
||||||
fsspec==2024.2.0
|
fsspec==2024.2.0
|
||||||
# via -r ingest/delta-table.in
|
# via -r ingest/delta-table.in
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# pyarrow
|
# pyarrow
|
||||||
|
|||||||
@ -58,11 +58,11 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain-community==0.0.17
|
langchain-community==0.0.20
|
||||||
# via -r ingest/embed-aws-bedrock.in
|
# via -r ingest/embed-aws-bedrock.in
|
||||||
langchain-core==0.1.18
|
langchain-core==0.1.23
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langsmith==0.0.86
|
langsmith==0.0.87
|
||||||
# via
|
# via
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -78,7 +78,7 @@ mypy-extensions==1.0.0
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# typing-inspect
|
# typing-inspect
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# langchain-community
|
# langchain-community
|
||||||
@ -114,7 +114,7 @@ six==1.16.0
|
|||||||
# python-dateutil
|
# python-dateutil
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
# via anyio
|
# via anyio
|
||||||
sqlalchemy==2.0.25
|
sqlalchemy==2.0.27
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
tenacity==8.2.3
|
tenacity==8.2.3
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -72,11 +72,11 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain-community==0.0.17
|
langchain-community==0.0.20
|
||||||
# via -r ingest/embed-huggingface.in
|
# via -r ingest/embed-huggingface.in
|
||||||
langchain-core==0.1.18
|
langchain-core==0.1.23
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langsmith==0.0.86
|
langsmith==0.0.87
|
||||||
# via
|
# via
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -102,7 +102,7 @@ nltk==3.8.1
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# sentence-transformers
|
# sentence-transformers
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# langchain-community
|
# langchain-community
|
||||||
@ -160,7 +160,7 @@ sentencepiece==0.1.99
|
|||||||
# via sentence-transformers
|
# via sentence-transformers
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
# via anyio
|
# via anyio
|
||||||
sqlalchemy==2.0.25
|
sqlalchemy==2.0.27
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
sympy==1.12
|
sympy==1.12
|
||||||
# via torch
|
# via torch
|
||||||
@ -168,15 +168,15 @@ tenacity==8.2.3
|
|||||||
# via
|
# via
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
threadpoolctl==3.2.0
|
threadpoolctl==3.3.0
|
||||||
# via scikit-learn
|
# via scikit-learn
|
||||||
tokenizers==0.15.1
|
tokenizers==0.15.2
|
||||||
# via transformers
|
# via transformers
|
||||||
torch==2.2.0
|
torch==2.2.0
|
||||||
# via
|
# via
|
||||||
# -c ingest/../constraints.in
|
# -c ingest/../constraints.in
|
||||||
# sentence-transformers
|
# sentence-transformers
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# huggingface-hub
|
# huggingface-hub
|
||||||
|
|||||||
@ -43,7 +43,7 @@ frozenlist==1.4.1
|
|||||||
# aiosignal
|
# aiosignal
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
# via httpcore
|
# via httpcore
|
||||||
httpcore==1.0.2
|
httpcore==1.0.3
|
||||||
# via httpx
|
# via httpx
|
||||||
httpx==0.26.0
|
httpx==0.26.0
|
||||||
# via openai
|
# via openai
|
||||||
@ -58,11 +58,11 @@ jsonpatch==1.33
|
|||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==2.4
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain-community==0.0.17
|
langchain-community==0.0.20
|
||||||
# via -r ingest/embed-openai.in
|
# via -r ingest/embed-openai.in
|
||||||
langchain-core==0.1.18
|
langchain-core==0.1.23
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
langsmith==0.0.86
|
langsmith==0.0.87
|
||||||
# via
|
# via
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
@ -78,11 +78,11 @@ mypy-extensions==1.0.0
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# typing-inspect
|
# typing-inspect
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# langchain-community
|
# langchain-community
|
||||||
openai==1.11.1
|
openai==1.12.0
|
||||||
# via -r ingest/embed-openai.in
|
# via -r ingest/embed-openai.in
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -115,15 +115,15 @@ sniffio==1.3.0
|
|||||||
# anyio
|
# anyio
|
||||||
# httpx
|
# httpx
|
||||||
# openai
|
# openai
|
||||||
sqlalchemy==2.0.25
|
sqlalchemy==2.0.27
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
tenacity==8.2.3
|
tenacity==8.2.3
|
||||||
# via
|
# via
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# langchain-core
|
# langchain-core
|
||||||
tiktoken==0.5.2
|
tiktoken==0.6.0
|
||||||
# via -r ingest/embed-openai.in
|
# via -r ingest/embed-openai.in
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# openai
|
# openai
|
||||||
|
|||||||
@ -41,11 +41,11 @@ fsspec==2024.2.0
|
|||||||
# gcsfs
|
# gcsfs
|
||||||
gcsfs==2024.2.0
|
gcsfs==2024.2.0
|
||||||
# via -r ingest/gcs.in
|
# via -r ingest/gcs.in
|
||||||
google-api-core==2.16.2
|
google-api-core==2.17.1
|
||||||
# via
|
# via
|
||||||
# google-cloud-core
|
# google-cloud-core
|
||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
google-auth==2.27.0
|
google-auth==2.28.0
|
||||||
# via
|
# via
|
||||||
# gcsfs
|
# gcsfs
|
||||||
# google-api-core
|
# google-api-core
|
||||||
|
|||||||
@ -15,11 +15,11 @@ charset-normalizer==3.3.2
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
google-api-core==2.16.2
|
google-api-core==2.17.1
|
||||||
# via google-api-python-client
|
# via google-api-python-client
|
||||||
google-api-python-client==2.116.0
|
google-api-python-client==2.118.0
|
||||||
# via -r ingest/google-drive.in
|
# via -r ingest/google-drive.in
|
||||||
google-auth==2.27.0
|
google-auth==2.28.0
|
||||||
# via
|
# via
|
||||||
# google-api-core
|
# google-api-core
|
||||||
# google-api-python-client
|
# google-api-python-client
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=ingest/jira.txt ingest/jira.in
|
# pip-compile --output-file=ingest/jira.txt ingest/jira.in
|
||||||
#
|
#
|
||||||
atlassian-python-api==3.41.9
|
atlassian-python-api==3.41.10
|
||||||
# via -r ingest/jira.in
|
# via -r ingest/jira.in
|
||||||
beautifulsoup4==4.12.3
|
beautifulsoup4==4.12.3
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -20,7 +20,7 @@ h11==0.14.0
|
|||||||
# via httpcore
|
# via httpcore
|
||||||
htmlbuilder==1.0.0
|
htmlbuilder==1.0.0
|
||||||
# via -r ingest/notion.in
|
# via -r ingest/notion.in
|
||||||
httpcore==1.0.2
|
httpcore==1.0.3
|
||||||
# via httpx
|
# via httpx
|
||||||
httpx==0.26.0
|
httpx==0.26.0
|
||||||
# via notion-client
|
# via notion-client
|
||||||
|
|||||||
@ -21,7 +21,7 @@ idna==3.6
|
|||||||
# requests
|
# requests
|
||||||
loguru==0.7.2
|
loguru==0.7.2
|
||||||
# via pinecone-client
|
# via pinecone-client
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# pinecone-client
|
# pinecone-client
|
||||||
@ -41,7 +41,7 @@ six==1.16.0
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# python-dateutil
|
# python-dateutil
|
||||||
tqdm==4.66.1
|
tqdm==4.66.2
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# pinecone-client
|
# pinecone-client
|
||||||
|
|||||||
@ -28,7 +28,7 @@ h2==4.1.0
|
|||||||
# via httpx
|
# via httpx
|
||||||
hpack==4.0.0
|
hpack==4.0.0
|
||||||
# via h2
|
# via h2
|
||||||
httpcore==1.0.2
|
httpcore==1.0.3
|
||||||
# via httpx
|
# via httpx
|
||||||
httpx[http2]==0.26.0
|
httpx[http2]==0.26.0
|
||||||
# via
|
# via
|
||||||
@ -41,7 +41,7 @@ idna==3.6
|
|||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# anyio
|
# anyio
|
||||||
# httpx
|
# httpx
|
||||||
numpy==1.26.3
|
numpy==1.26.4
|
||||||
# via
|
# via
|
||||||
# -c ingest/../base.txt
|
# -c ingest/../base.txt
|
||||||
# qdrant-client
|
# qdrant-client
|
||||||
@ -55,7 +55,7 @@ pydantic==1.10.14
|
|||||||
# via
|
# via
|
||||||
# -c ingest/../constraints.in
|
# -c ingest/../constraints.in
|
||||||
# qdrant-client
|
# qdrant-client
|
||||||
qdrant-client==1.7.2
|
qdrant-client==1.7.3
|
||||||
# via -r ingest/qdrant.in
|
# via -r ingest/qdrant.in
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -68,7 +68,7 @@ six==1.16.0
|
|||||||
# python-dateutil
|
# python-dateutil
|
||||||
time-machine==2.13.0
|
time-machine==2.13.0
|
||||||
# via pendulum
|
# via pendulum
|
||||||
tzdata==2023.4
|
tzdata==2024.1
|
||||||
# via pendulum
|
# via pendulum
|
||||||
urllib3==1.26.18
|
urllib3==1.26.18
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -4,5 +4,5 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file=ingest/slack.txt ingest/slack.in
|
# pip-compile --output-file=ingest/slack.txt ingest/slack.in
|
||||||
#
|
#
|
||||||
slack-sdk==3.26.2
|
slack-sdk==3.27.0
|
||||||
# via -r ingest/slack.in
|
# via -r ingest/slack.in
|
||||||
|
|||||||
@ -8,7 +8,7 @@ appdirs==1.4.4
|
|||||||
# via label-studio-tools
|
# via label-studio-tools
|
||||||
autoflake==2.2.1
|
autoflake==2.2.1
|
||||||
# via -r test.in
|
# via -r test.in
|
||||||
black==24.1.1
|
black==24.2.0
|
||||||
# via -r test.in
|
# via -r test.in
|
||||||
certifi==2024.2.2
|
certifi==2024.2.2
|
||||||
# via
|
# via
|
||||||
@ -111,7 +111,7 @@ requests==2.31.0
|
|||||||
# via
|
# via
|
||||||
# -c base.txt
|
# -c base.txt
|
||||||
# label-studio-sdk
|
# label-studio-sdk
|
||||||
ruff==0.2.0
|
ruff==0.2.1
|
||||||
# via -r test.in
|
# via -r test.in
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -5,13 +5,13 @@ set -e
|
|||||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||||
cd "$SCRIPT_DIR"/.. || exit 1
|
cd "$SCRIPT_DIR"/.. || exit 1
|
||||||
|
|
||||||
|
EVAL_NAME="$1"
|
||||||
|
|
||||||
# List all structured outputs to use in this evaluation
|
# List all structured outputs to use in this evaluation
|
||||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
OUTPUT_ROOT=${2:-${OUTPUT_ROOT:-$SCRIPT_DIR}}
|
||||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval
|
OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval
|
||||||
mkdir -p "$OUTPUT_DIR"
|
mkdir -p "$OUTPUT_DIR"
|
||||||
|
|
||||||
EVAL_NAME="$1"
|
|
||||||
|
|
||||||
if [ "$EVAL_NAME" == "text-extraction" ]; then
|
if [ "$EVAL_NAME" == "text-extraction" ]; then
|
||||||
METRIC_STRATEGY="measure-text-extraction-accuracy-command"
|
METRIC_STRATEGY="measure-text-extraction-accuracy-command"
|
||||||
elif [ "$EVAL_NAME" == "element-type" ]; then
|
elif [ "$EVAL_NAME" == "element-type" ]; then
|
||||||
|
|||||||
@ -11,6 +11,8 @@ fi
|
|||||||
touch "$SKIPPED_FILES_LOG"
|
touch "$SKIPPED_FILES_LOG"
|
||||||
cd "$SCRIPT_DIR"/.. || exit 1
|
cd "$SCRIPT_DIR"/.. || exit 1
|
||||||
|
|
||||||
|
EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||||
|
|
||||||
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
||||||
export OMP_THREAD_LIMIT=1
|
export OMP_THREAD_LIMIT=1
|
||||||
|
|
||||||
@ -121,6 +123,6 @@ all_eval=(
|
|||||||
for eval in "${all_eval[@]}"; do
|
for eval in "${all_eval[@]}"; do
|
||||||
CURRENT_TEST="evaluation-metrics.sh $eval"
|
CURRENT_TEST="evaluation-metrics.sh $eval"
|
||||||
echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
|
echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
|
||||||
./test_unstructured_ingest/evaluation-metrics.sh "$eval"
|
./test_unstructured_ingest/evaluation-metrics.sh "$eval" "$EVAL_OUTPUT_ROOT"
|
||||||
echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
|
echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
|
||||||
done
|
done
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user