diff --git a/.gitignore b/.gitignore
index 87f4fc72b..c93815135 100644
--- a/.gitignore
+++ b/.gitignore
@@ -210,3 +210,5 @@ metricsdiff.txt
 # analysis
 annotated/
 .aider*
+pcaps
+python-output
diff --git a/scripts/image/test-all-outbound-connectivity-scenarios.sh b/scripts/image/test-all-outbound-connectivity-scenarios.sh
new file mode 100755
index 000000000..fb5208b5a
--- /dev/null
+++ b/scripts/image/test-all-outbound-connectivity-scenarios.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Note:
+#
+# The scenarios baseline, missing-models, and analytics-online-only
+# are expected to have conversations reported by tshark
+#
+# The scenarios offline and offline-and-missing-models
+# are *NOT* expected to have any conversations (or attempted conversations) reported by tshark
+
+set -euo pipefail
+
+# shellcheck disable=SC2015
+((BASH_VERSINFO[0] >= 5)) || {
+    echo "Requires bash >= 5" >&2
+    exit 1
+}
+
+mkdir -p python-output
+mkdir -p pcaps
+
+start_timestamp_seconds=$(date +%s)
+
+./test-outbound-connectivity.sh --cleanup baseline
+./test-outbound-connectivity.sh --cleanup missing-models
+./test-outbound-connectivity.sh --cleanup analytics-online-only
+./test-outbound-connectivity.sh --cleanup offline
+./test-outbound-connectivity.sh --cleanup offline-and-missing-models
+
+set +e
+found_pcap_files=$(find "pcaps" -maxdepth 1 -name "*.pcap" -type f -newermt "@$start_timestamp_seconds" 2>/dev/null | wc -l | tr -d ' ')
+found_log_files=$(find "python-output" -maxdepth 1 -name "*.log" -type f -newermt "@$start_timestamp_seconds" 2>/dev/null | wc -l | tr -d ' ')
+set -e
+if [ "$found_pcap_files" -ne "5" ]; then
+    echo "Expected to find 5 fresh .pcap files in pcaps/ from this test but found $found_pcap_files instead"
+    exit 1
+fi
+if [ "$found_log_files" -ne "5" ]; then
+    echo "Expected to find 5 fresh .log files in python-output/ from this test but found $found_log_files instead"
+    exit 1
+fi
+
+for scenario in baseline missing-models analytics-online-only offline offline-and-missing-models; do
+    echo
+
+    echo "=================================================================="
+    echo "======================================== Begin Scenario: $scenario"
+    echo
+    echo " -------------------------------------------"
+    echo " tshark output for $scenario"
+    echo " -------------------------------------------"
+    echo
+    tshark -r pcaps/$scenario.pcap -q -z conv,ip | grep -v '===================================='
+
+    echo
+    echo " ------------------------------------------"
+    echo " python log output for $scenario"
+    echo " ------------------------------------------"
+    echo
+    cat python-output/$scenario.log
+done
diff --git a/scripts/image/test-outbound-connectivity.sh b/scripts/image/test-outbound-connectivity.sh
new file mode 100755
index 000000000..82eda5038
--- /dev/null
+++ b/scripts/image/test-outbound-connectivity.sh
@@ -0,0 +1,183 @@
+#!/usr/bin/env bash
+#
+# test-outbound-connectivity.sh
+#
+# Capture every external packet an Unstructured Docker image emits while
+# partition()-ing a test PNG, *inside the same container* (works on macOS).
+#
+# In addition **also capture the Python workload's stdout / stderr** and save it
+# under ./python-output/<scenario>.log while still streaming it to your terminal.
+#
+# Usage examples
+#   ./test-outbound-connectivity.sh baseline
+#   ./test-outbound-connectivity.sh --cleanup missing-models
+#   ./test-outbound-connectivity.sh --cleanup offline
+#   ./test-outbound-connectivity.sh offline-and-missing-models
+#
+# Outputs:
+#   ./pcaps/<scenario>.pcap
+#   ./python-output/<scenario>.log
+# ---------------------------------------------------------------------
+
+set -euo pipefail
+
+######################## user-tunable constants ########################
+IMAGE="downloads.unstructured.io/unstructured-io/unstructured:e42884a"
+NET="unstructured_test_net"
+CAPTURE_IFACE="${CAPTURE_IFACE:-eth0}"
+PCAP_DIR="$(pwd)/pcaps"
+PY_LOG_DIR="$(pwd)/python-output" # where Python logs go
+HF_CACHE="/home/notebook-user/.cache/huggingface"
+########################################################################
+
+# shellcheck disable=SC2015
+((BASH_VERSINFO[0] >= 5)) || {
+    echo "Requires bash >= 5" >&2
+    exit 1
+}
+
+# Create output directories up-front so failures don't leave us empty-handed
+mkdir -p "$PCAP_DIR" "$PY_LOG_DIR"
+
+# ---------- parse flags (optional --cleanup) --------------------------
+CLEANUP=0
+if [[ "${1:-}" == "--cleanup" ]]; then
+    CLEANUP=1
+    shift
+fi
+
+SCENARIO="${1:-}"
+if [[ -z "$SCENARIO" ]]; then
+    echo "Usage: $0 [--cleanup] {baseline|missing-models|analytics-online-only|offline|offline-and-missing-models}" >&2
+    exit 1
+fi
+
+# ---------- optional pre-run cleanup ----------------------------------
+if ((CLEANUP)); then
+    echo ">>> Removing leftover sut_* containers…"
+    # shellcheck disable=SC2015
+    docker rm -f "$(docker ps -aq --filter name='^sut_')" 2>/dev/null || true
+fi
+
+# ---------- scenario-specific settings --------------------------------
+DO_NOT_TRACK=""
+HF_HUB_OFFLINE=""
+REMOVE_CACHE=0
+case "$SCENARIO" in
+baseline) ;;
+missing-models) REMOVE_CACHE=1 ;;
+analytics-online-only) HF_HUB_OFFLINE=1 ;;
+offline)
+    DO_NOT_TRACK=true
+    HF_HUB_OFFLINE=1
+    ;;
+offline-and-missing-models)
+    DO_NOT_TRACK=true
+    HF_HUB_OFFLINE=1
+    REMOVE_CACHE=1
+    ;;
+*)
+    echo "Unknown scenario: $SCENARIO"
+    exit 1
+    ;;
+esac
+
+docker network inspect "$NET" >/dev/null 2>&1 || docker network create "$NET"
+
+# ---------- launch SUT idle -------------------------------------------
+CID=$(docker run -d --rm --name "sut_${SCENARIO}" \
+    --network "$NET" \
+    --cap-add NET_RAW --cap-add NET_ADMIN \
+    -e DO_NOT_TRACK="$DO_NOT_TRACK" \
+    -e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \
+    --entrypoint /bin/sh "$IMAGE" -c "sleep infinity")
+echo "Container: $CID (scenario $SCENARIO)"
+
+# install tcpdump (Wolfi uses apk) as root
+docker exec -u root "$CID" apk add --no-cache tcpdump >/dev/null
+
+# optionally wipe HF cache
+# shellcheck disable=SC2015
+((REMOVE_CACHE)) && docker exec "$CID" rm -rf "$HF_CACHE" || true
+
+# ---------- start tcpdump in background -------------------------------
+FILTER='not (dst net ff02::/16 or src net ff02::/16 or ip6[6] = 58 or ether multicast)'
+
+docker exec -u root -d "$CID" sh -c "tcpdump -U -n -i $CAPTURE_IFACE '$FILTER' -w /tmp/capture.pcap > /tmp/tcpdump.log 2>&1"
+
+# check if tcpdump stayed alive
+sleep 2
+if ! docker exec "$CID" pgrep tcpdump >/dev/null; then
+    echo 'tcpdump exited – showing its log:'
+    docker exec "$CID" cat /tmp/tcpdump.log
+    exit 1
+fi
+
+echo "tcpdump running on interface $CAPTURE_IFACE..."
+# ---------- run the Python workload -----------------------------------
+echo ">>> Running Python workload (capturing stdout/stderr)…"
+#   - The "|&" pipes *both* stdout *and* stderr into tee.
+#   - tee sends it to the terminal *and* writes the log file.
+#   - With `set -o pipefail` we still fail early if the Python process exits non-zero.
+
+if [[ "$HF_HUB_OFFLINE" -eq 1 && "$REMOVE_CACHE" -eq 1 ]]; then
+    echo "HF_HUB_OFFLINE=1 and REMOVE_CACHE=1: allowing the python command to exit non-zero; the script will continue."
+    set +e
+fi
+
+docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - </dev/null
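
Note (outside the diff): the final `docker exec ... python -` line above is cut off in this excerpt, so the heredoc that feeds the Python workload and the tee redirection described in the preceding comments are not shown. As a rough sketch only, reusing the $CID, $SCENARIO, and $PY_LOG_DIR variables defined earlier in the script, the pattern those comments describe would look roughly like the following; the heredoc body here is a placeholder, not the real workload, which partitions a test PNG with unstructured's partition().

docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - <<'PY' |& tee "$PY_LOG_DIR/${SCENARIO}.log"
# placeholder body: the real script runs the partition() workload here
print("workload placeholder")
PY

With `set -euo pipefail` in effect, a non-zero exit from the docker exec still fails the whole pipeline even though tee exits zero, and the same output streams to the terminal and to ./python-output/<scenario>.log.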