haystack/.github/utils/tutorials.sh
Daniel Bichuetti e1f399284f
refactor: update dependencies and remove pins (#3147)
* refactor: remove azure-core, pydoc and hf-hub pins

* fix: remove extra-comma

* fix: force minimum version of azure forms recognizer

* refactor: allow newer ocr libs

* refactor: update more dependencies and container versions

* refactor: remove extra comment

* docs: pre-commit manual run

* refactor: remove unnecessary dependency

* tests: update weaviate container image version
2022-09-05 14:30:35 +02:00

140 lines
5.2 KiB
Bash
Executable File

#!/bin/bash
#
# Runs the tutorials touched by a PR and reports which ones failed.
#
# Positional arguments:
#   $1 - Python installation prefix (its lib/ and bin/ipython are used)
#   $2 - whitespace-separated list of the files changed in the PR
#   $3 - whitespace-separated list of path fragments to exclude
#   $4 - "EDITABLE" to make the Python path writable before Tutorial 17
#   $5 - containers policy (printed below)

# See tut 10 - GraphDB is already running in CI
LAUNCH_GRAPHDB=0
# Avoid permission denied errors while importing tika
TIKA_LOG_PATH="$PWD"
export LAUNCH_GRAPHDB TIKA_LOG_PATH

python_path="${1}"
files_changed="${2}"
exclusion_list="${3}"
make_python_path_editable="${4}"
containers_policy="${5}"

# Tutorials whose path contains one of these names do not get the reduced GoT dataset.
no_got_tutorials='4_FAQ_style_QA 5_Evaluation 7_RAG_Generator 8_Preprocessing 10_Knowledge_Graph 15_TableQA 16_Document_Classifier_at_Index_Time'

# Echo the inputs so they show up in the CI log.
printf '%s\n' \
  "Files changed in this PR: $files_changed" \
  "Excluding: $exclusion_list" \
  "Python path is editable: $make_python_path_editable" \
  "Containers policy: $containers_policy"
# Collect the tutorials to run
#
# collect_tutorials CHANGED EXCLUDED
#   CHANGED  - whitespace-separated list of changed file paths
#   EXCLUDED - whitespace-separated list of fragments; a path containing any
#              of them is skipped
# Globals:   scripts_to_run (written) - the selected paths, each preceded by
#            a single space
# Outputs:   one "- not a tutorial" / "- excluded" line per rejected path
collect_tutorials() {
  local changed=$1 excluded_patterns=$2
  local candidate pattern
  scripts_to_run=""
  # Word splitting is intentional: both lists are whitespace-separated.
  for candidate in $changed; do
    # The extension is matched as a suffix, so files such as ".pyc" or
    # ".py.bak" are not mistaken for runnable tutorials.
    if [[ "$candidate" != *"tutorials/Tutorial"* ]] ||
      [[ "$candidate" != *.py && "$candidate" != *.ipynb ]]; then
      echo "- not a tutorial: $candidate"
      continue
    fi
    for pattern in $excluded_patterns; do
      if [[ "$candidate" == *"$pattern"* ]]; then
        echo "- excluded: $candidate"
        # Skip straight to the next changed file.
        continue 2
      fi
    done
    scripts_to_run="$scripts_to_run $candidate"
  done
}

collect_tutorials "$files_changed" "$exclusion_list"
# Run the containers
# Elasticsearch as a single-node cluster, published on port 9200.
docker run \
  --detach \
  --publish 9200:9200 \
  --name elasticsearch \
  --env "discovery.type=single-node" \
  --env "ES_JAVA_OPTS=-Xms128m -Xmx256m" \
  elasticsearch:7.9.2
# Tika server, published on port 9998.
# NOTE(review): TIKA_CHILD_JAVA_OPTS is passed twice with different values;
# presumably only one of them takes effect - confirm which and merge them.
docker run \
  --detach \
  --publish 9998:9998 \
  --name tika \
  --env "TIKA_CHILD_JAVA_OPTS=-JXms128m" \
  --env "TIKA_CHILD_JAVA_OPTS=-JXmx128m" \
  apache/tika:1.28.4
# Space-separated list of the tutorials that failed.
failed=""

# Succeeds (returns 0) when the tutorial should run against the reduced GoT
# dataset, i.e. when its path contains none of the names in $no_got_tutorials.
# Arguments: $1 - path of the tutorial
# Globals:   no_got_tutorials (read)
uses_reduced_dataset() {
  local script=$1 name
  for name in $no_got_tutorials; do
    if [[ "$script" == *"$name"* ]]; then
      return 1
    fi
  done
  return 0
}

# Runs one tutorial and saves what it printed to "<script>-output.txt".
# Arguments: $1 - path of the tutorial; a path ending in ".py" is run with
#                 python, anything else through ipython's %run
# Globals:   python_path (read), output (written: the captured stdout)
# Returns:   the interpreter's exit status, or 1 when the interpreter
#            succeeded but the output file could not be written
run_tutorial() {
  local script=$1 status=0
  # The status must be captured right here: any later command (including the
  # write to the output file) would overwrite $?.
  if [[ "$script" == *".py" ]]; then
    # `time` reports on stderr, so the timing is not part of the captured output.
    output=$(time python "$script") || status=$?
  else
    output=$(sudo "$python_path/bin/ipython" -c "%run $script") || status=$?
  fi
  # Quoted so that newlines survive and nothing is glob-expanded.
  if ! printf '%s\n' "$output" > "$script-output.txt" && [[ $status -eq 0 ]]; then
    status=1
  fi
  return "$status"
}

# Word splitting is intentional: scripts_to_run is a space-separated list.
for script in $scripts_to_run; do
  echo ""
  echo "##################################################################################"
  echo "##################################################################################"
  echo "## Running $script ..."
  echo "##################################################################################"
  echo "##################################################################################"

  # Do not cache GoT data
  if uses_reduced_dataset "$script"; then
    # Copy the reduced GoT data into a folder named after the tutorial
    # to trigger the caching mechanism of `fetch_archive_from_http`
    echo "Using reduced GoT dataset"
    # Keep what sits between "tutorials/Tutorial" and the first underscore.
    # The prefix is stripped wherever it occurs, because the selection step
    # also accepts it anywhere in the path.
    tutorial_id=${script#*"tutorials/Tutorial"}
    tutorial_id=${tutorial_id%%_*}
    cp -r data/tutorials "data/tutorial${tutorial_id}"
  else
    echo "NOT using reduced GoT dataset!"
  fi

  # FIXME Make the Python path editable
  # espnet needs to edit files on the PYTHONPATH during execution. However, by default GH runners don't allow
  # workflows to edit files in that directory, so for tutorials using espnet we need to make PYTHONPATH
  # editable first. For now it's only Tutorial 17.
  # It is still unclear why this operation has to be repeated, but if Tutorial 17 is run twice (once for the .py
  # and once for the .ipynb version) the error re-appears.
  if [[ "$make_python_path_editable" == "EDITABLE" ]] && [[ "$script" == *"Tutorial17_"* ]]; then
    sudo find "$python_path/lib" -type f -exec chmod 777 {} +
  fi

  if run_tutorial "$script"; then
    echo "Execution completed successfully."
  else
    echo "===================================================="
    echo "| $script FAILED!"
    echo "===================================================="
    echo "Output of the execution: "
    printf '%s\n' "$output"
    failed="$failed $script"
  fi

  # Restart the necessary containers
  # Note: Tika does not store data and therefore can be left running
  # The decision is driven by the containers policy ($5), not by the
  # "editable" flag ($4).
  if [[ "$containers_policy" == "RESTART" ]]; then
    docker stop elasticsearch
    docker rm elasticsearch
    docker run -d -p 9200:9200 --name elasticsearch -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
  fi

  # Clean up datasets and SQLite DBs to avoid crashing the next tutorial
  git clean -f
done
# These paths cause permission errors on Post Cache, so remove them first.
sudo rm -rf data/
sudo rm -rf /home/runner/work/haystack/haystack/elasticsearch-7.9.2/

# Failure path first: list every tutorial recorded in $failed and exit non-zero.
if [[ -n "$failed" ]]; then
  printf '%s\n' \
    "" \
    "##################################################################################" \
    "## ##" \
    "## Some tutorials have failed! ##" \
    "## ##" \
    "##################################################################################"
  # Word splitting is intentional: $failed is a space-separated list.
  for broken in $failed; do
    printf '%s\n' "## - $broken"
  done
  printf '%s\n' "##################################################################################"
  exit 1
fi

# Nothing failed.
printf '%s\n' \
  "" \
  "" \
  "------------------------------------------" \
  " All tutorials were executed successfully " \
  "------------------------------------------"
exit 0