Files
DocsGPT/scripts/e2e/up.sh
Alex 81b6ee5daa Pg 4 (#2390)
* feat: postgres tests

* feat: mongo cutoff

* feat: mongo cutoff

* feat: adjust docs and compose files

* fix: mini code mongo removals

* fix: tests and k8s mongo stuff

* feat: test fixes

* fix: ruff

* fix: vale

* Potential fix for pull request finding 'CodeQL / Clear-text logging of sensitive information'

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* fix: mini suggestions

* vale lint fix 2

* fix: codeql columns thing

* fix: test mongo

* fix: tests coverage

* feat: better tests 4

* feat: more tests

* feat: decent coverage

* fix: ruff fixes

* fix: remove mongo mock

* feat: enhance workflow engine and API routes; add document retrieval and source handling

* feat: e2e tests

* fix: mcp, mongo and more

* fix: mini codeql warning

* fix: agent chunk view

* fix: mini issues

* fix: more pg fixes

* feat: postgres prep on start

* feat: qa tests

* fix: mini improvements

* fix: tests

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
Co-authored-by: Siddhant Rai <siddhant.rai.5686@gmail.com>
2026-04-18 13:13:57 +01:00

357 lines
12 KiB
Bash
Executable File

#!/usr/bin/env bash
# scripts/e2e/up.sh
#
# Boot the DocsGPT end-to-end test stack on this machine, natively.
# See e2e-plan.md (Phase 0 / P0-A) for the contract.
#
# Happy path:
# 1. Preflight shared services (Postgres, Redis). Fail loud if down.
# 2. Reset state: Postgres template clone, Redis FLUSHDB 11/12/13, wipe .e2e-tmp.
# 3. Export env.
# 4. Start mock LLM (7899) → Flask (7099) → Celery → Vite (5179), each in
# background, each with its own pidfile + log + readiness probe.
# 5. Exit 0, leaving services running. Playwright (or the user) invokes
# down.sh separately when done.
#
# On error before handoff: tear everything down, non-zero exit.
# We explicitly DO NOT tear down on the happy-path exit — that would defeat
# the purpose of "up".
set -euo pipefail

# -----------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------
# Absolute locations of this script's directory and of the repo two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# DBngin-managed tool directories.
PG_BIN="/Users/Shared/DBngin/postgresql/16.2/bin"
DBNGIN_REDIS_BIN="/Users/Shared/DBngin/redis/7.0.0/bin"

# redis-cli lookup order: whatever PATH offers, else DBngin's bundled binary,
# else the empty string (callers are expected to check for that).
REDIS_CLI="$(command -v redis-cli 2>/dev/null || true)"
if [[ -z "$REDIS_CLI" && -x "$DBNGIN_REDIS_BIN/redis-cli" ]]; then
  REDIS_CLI="$DBNGIN_REDIS_BIN/redis-cli"
fi

# Scratch space inside the repo, and the boot log that lives under it.
E2E_TMP="$REPO_ROOT/.e2e-tmp"
LOGDIR="$E2E_TMP/logs"
BOOT_LOG="$LOGDIR/up.log"

# Pidfiles live outside the repo; per-service logs share that directory.
PIDDIR="/tmp/docsgpt-e2e"
SVC_LOGDIR="$PIDDIR"

# Ports for the services this script starts.
MOCK_LLM_PORT=7899
FLASK_PORT=7099
VITE_PORT=5179

# -----------------------------------------------------------------------------
# Bookkeeping
# -----------------------------------------------------------------------------
# HANDOFF_OK stays 0 until the very end of a successful boot.
# STARTED_SERVICES accumulates the name of each service as it is launched.
HANDOFF_OK=0
STARTED_SERVICES=()
# Print a "[up.sh] "-prefixed status line built from all arguments.
# The line always goes to stderr (stdout stays clean). It is also appended to
# $BOOT_LOG, but only when BOOT_LOG is non-empty and its parent directory
# already exists.
log() {
  local line
  line="[up.sh] $*"
  printf '%s\n' "$line" >&2
  [[ -z "${BOOT_LOG:-}" ]] && return 0
  [[ -d "$(dirname "$BOOT_LOG")" ]] || return 0
  printf '%s\n' "$line" >>"$BOOT_LOG"
}
# Log the arguments as a single "ERROR: ..." line via log(), then terminate
# the script with exit status 1.
die() {
  local reason="$*"
  log "ERROR: ${reason}"
  exit 1
}
# EXIT handler: if the script is leaving before handoff (failure or signal),
# tear down whatever was started by delegating to down.sh. The happy path
# sets HANDOFF_OK=1 just before `exit 0`, which turns this into a no-op.
#
# Globals:  HANDOFF_OK (read), SCRIPT_DIR (read)
# Returns:  0 — teardown is best-effort; a failing down.sh is ignored.
cleanup_on_failure() {
  # Status the script is exiting with; captured first, before anything else
  # overwrites $?.
  local rc=$?
  if [[ "$HANDOFF_OK" -eq 1 ]]; then
    return 0
  fi
  log "aborting — tearing down any services that started (rc=$rc)"
  if [[ -x "$SCRIPT_DIR/down.sh" ]]; then
    "$SCRIPT_DIR/down.sh" || true
  fi
}
# Teardown is attached to EXIT only. A handler that merely returns from
# INT/TERM lets bash resume the interrupted script, so Ctrl-C would tear
# services down and then carry on booting new ones. Converting the signals
# into an explicit exit (128 + signal number) stops the script and lets the
# EXIT trap run the teardown exactly once.
trap cleanup_on_failure EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
# Poll a command until it succeeds.
# Arguments: $1 - label used in the log line
#            $2 - maximum number of attempts (one per second of sleep)
#            $3.. - the command and its arguments; its output is discarded
# Returns:   0 as soon as the command succeeds, 1 once the attempts run out.
#            With a timeout of 0 the command is never run.
wait_for() {
  local label="$1" timeout="$2"
  shift 2
  local waited
  for (( waited = 0; waited < timeout; waited++ )); do
    if "$@" >/dev/null 2>&1; then
      log " -> $label ready after ${waited}s"
      return 0
    fi
    sleep 1
  done
  return 1
}
# Poll a log file until it contains a fixed (non-regex) substring.
# Arguments: $1 - label used in the log line
#            $2 - maximum number of attempts (one per second of sleep)
#            $3 - path of the log file; it may not exist yet
#            $4 - literal text to look for
# Returns:   0 once the text is found, 1 once the attempts run out.
wait_for_log() {
  local label="$1" timeout="$2" logfile="$3" needle="$4"
  local waited=0
  until (( waited >= timeout )); do
    if [[ -f "$logfile" ]] && grep -qF -- "$needle" "$logfile"; then
      log " -> $label saw '$needle' after ${waited}s"
      return 0
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done
  return 1
}
# Boot-failure diagnostics: write the final 50 lines of a log file to stderr,
# framed by header/footer marker lines.
# Arguments: $1 - label shown in the markers
#            $2 - path of the log file; a placeholder line is printed if it
#                 does not exist
dump_tail() {
  local label="$1" logfile="$2"
  {
    echo "---- last 50 lines of $label ($logfile) ----"
    if [[ ! -f "$logfile" ]]; then
      echo "(log file does not exist)"
    else
      # A failing tail must not abort the diagnostics.
      tail -n 50 "$logfile" || true
    fi
    echo "---- end $label ----"
  } >&2
}
# Boot-failure handler: report which service failed and why, dump the tail of
# its log, then exit 1.
# Arguments: $1 - service name
#            $2 - path of that service's log file
#            $3 - human-readable reason
boot_fail() {
  local svc="$1"
  local logfile="$2"
  local reason="$3"
  # Separator between name and reason; without it the two run together
  # (e.g. "mock-llmhealthz did not respond ...").
  log "boot failure: $svc — $reason"
  dump_tail "$svc" "$logfile"
  exit 1
}
# -----------------------------------------------------------------------------
# 1. Preflight
# -----------------------------------------------------------------------------
# Verify the shared Postgres and Redis instances are reachable before touching
# any state. Each check aborts through die() with an actionable message.
log "preflight: checking shared native services"

# Postgres: the probe binary must exist, then the server must accept us.
[[ -x "$PG_BIN/pg_isready" ]] \
  || die "pg_isready not found at $PG_BIN/pg_isready — is DBngin Postgres 16.2 installed?"
"$PG_BIN/pg_isready" -h 127.0.0.1 -p 5432 -U docsgpt -d postgres >/dev/null 2>&1 \
  || die "Postgres not reachable at 127.0.0.1:5432 as user 'docsgpt'. Start DBngin Postgres 16.2. (CLAUDE.md: do not kill/start this process from scripts.)"
log " -> postgres OK"

# Redis: a client must have been resolved, then PING must answer exactly PONG.
[[ -n "$REDIS_CLI" ]] \
  || die "redis-cli not found on PATH nor at $DBNGIN_REDIS_BIN/redis-cli — install redis or adjust DBNGIN_REDIS_BIN"
"$REDIS_CLI" -h 127.0.0.1 -p 6379 PING 2>/dev/null | grep -q '^PONG$' \
  || die "Redis not reachable at 127.0.0.1:6379. Start the native redis-server. (CLAUDE.md: do not kill/start this process from scripts.)"
log " -> redis OK"
# -----------------------------------------------------------------------------
# 2. Reset state
# -----------------------------------------------------------------------------
# Order matters in this section: the scratch directory is wiped and recreated
# first, the boot log is truncated next, and only then are the database and
# Redis resets run (the database reset appends its output to the boot log).
log "resetting state"
# Wipe & recreate .e2e-tmp first so BOOT_LOG has a home.
# (log() only mirrors to BOOT_LOG once its parent directory exists, so the
# "resetting state" line above reaches the boot log only if that directory
# was still present from an earlier run — and the wipe below discards it.)
rm -rf "$E2E_TMP"
mkdir -p "$E2E_TMP/inputs" "$E2E_TMP/indexes" "$LOGDIR"
# Create/truncate the boot log.
: > "$BOOT_LOG"
log " -> .e2e-tmp wiped; logs at $LOGDIR"
mkdir -p "$PIDDIR"
# Leave existing per-service logs alone until we overwrite them at launch time;
# that way a prior failure log isn't immediately erased if someone re-runs up.

# Postgres reset — delegated to reset_db.sh (owned by track P0-B).
RESET_DB_SCRIPT="$SCRIPT_DIR/reset_db.sh"
if [[ ! -x "$RESET_DB_SCRIPT" ]]; then
die "reset_db.sh missing or not executable at $RESET_DB_SCRIPT — has track P0-B landed?"
fi
log " -> invoking reset_db.sh"
# stdout and stderr of the reset script are appended to the boot log.
if ! "$RESET_DB_SCRIPT" >> "$BOOT_LOG" 2>&1; then
die "reset_db.sh failed — see $BOOT_LOG"
fi
# Redis reset — three dedicated DB indices.
# Only logical databases 11, 12 and 13 are flushed; any failure aborts.
for db in 11 12 13; do
if ! "$REDIS_CLI" -h 127.0.0.1 -p 6379 -n "$db" FLUSHDB >/dev/null 2>&1; then
die "redis-cli FLUSHDB failed on db $db"
fi
done
log " -> redis dbs 11/12/13 flushed"
# -----------------------------------------------------------------------------
# 3. Load env
# -----------------------------------------------------------------------------
# env.sh is sourced into this shell, so whatever it sets is visible to the
# service launches that follow.
log "sourcing env.sh"
# shellcheck source=./env.sh
source "$SCRIPT_DIR/env.sh"

# -----------------------------------------------------------------------------
# 4. Start services
# -----------------------------------------------------------------------------
# Tool selection: the repo's .venv wins when it has an executable; otherwise
# fall back to PATH. An empty result means nothing usable was found.
FLASK_BIN="$REPO_ROOT/.venv/bin/flask"
if [[ ! -x "$FLASK_BIN" ]]; then
  FLASK_BIN="$(command -v flask || true)"
fi
[[ -n "$FLASK_BIN" ]] \
  || die "flask binary not found (.venv/bin/flask missing and no 'flask' on PATH)"

# Same policy for the interpreter; on PATH, python3 is preferred over python.
PY_BIN="$REPO_ROOT/.venv/bin/python"
if [[ ! -x "$PY_BIN" ]]; then
  PY_BIN="$(command -v python3 || command -v python || true)"
fi
[[ -n "$PY_BIN" ]] \
  || die "python binary not found (.venv/bin/python missing and no 'python3' on PATH)"

log "using flask=$FLASK_BIN python=$PY_BIN"
# ---- 4a. Mock LLM ------------------------------------------------------------
# Launch scripts/e2e/mock_llm.py in the background from the repo root, record
# its PID, and wait for its /healthz endpoint to answer.
MOCK_LLM_LOG="$SVC_LOGDIR/mock-llm.log"
MOCK_LLM_PID="$PIDDIR/mock-llm.pid"
log "starting mock LLM on 127.0.0.1:$MOCK_LLM_PORT"
(
  # Subshell keeps the cd local to this launch.
  cd "$REPO_ROOT"
  # The port is handed to the process through its environment.
  MOCK_LLM_PORT="$MOCK_LLM_PORT" PYTHONUNBUFFERED=1 nohup "$PY_BIN" scripts/e2e/mock_llm.py \
    >"$MOCK_LLM_LOG" 2>&1 &
  # $! is the PID of the background job just started.
  echo $! > "$MOCK_LLM_PID"
)
STARTED_SERVICES+=("mock-llm")
# Each probe is time-boxed: without --max-time, a socket that accepts the
# connection but never answers would block curl — and this script — forever.
if ! wait_for "mock-llm /healthz" 10 \
  curl -sf --connect-timeout 2 --max-time 5 "http://127.0.0.1:${MOCK_LLM_PORT}/healthz"; then
  boot_fail "mock-llm" "$MOCK_LLM_LOG" "healthz did not respond within 10s"
fi
# ---- 4b. Flask ---------------------------------------------------------------
# Launch the Flask app in the background, record its PID, and wait for
# /api/config to answer.
FLASK_LOG="$SVC_LOGDIR/flask.log"
FLASK_PID="$PIDDIR/flask.pid"
log "starting Flask on 127.0.0.1:$FLASK_PORT"
(
  # The working directory is the scratch dir, so the app path below is given
  # relative to it (.e2e-tmp sits directly under the repo root).
  cd "$E2E_TMP"
  PYTHONUNBUFFERED=1 nohup "$FLASK_BIN" --app ../application/app.py run \
    --host 127.0.0.1 --port "$FLASK_PORT" \
    >"$FLASK_LOG" 2>&1 &
  # $! is the PID of the background job just started.
  echo $! > "$FLASK_PID"
)
STARTED_SERVICES+=("flask")
# Each probe is time-boxed: without --max-time, a server that accepts the
# connection but never answers would block curl — and this script — forever.
if ! wait_for "flask /api/config" 30 \
  curl -sf --connect-timeout 2 --max-time 5 "http://127.0.0.1:${FLASK_PORT}/api/config"; then
  boot_fail "flask" "$FLASK_LOG" "/api/config did not respond within 30s"
fi
# ---- 4c. Celery --------------------------------------------------------------
# Launch a Celery worker in the background, record its PID, then gate on two
# checks: the 'celery@' text appearing in its log, and a successful
# `inspect ping`.
CELERY_LOG="$SVC_LOGDIR/celery.log"
CELERY_PID="$PIDDIR/celery.pid"
log "starting Celery worker (solo pool)"
(
# Runs from the scratch dir; the repo root is prepended to PYTHONPATH so that
# the application.app.celery module path can be imported from there.
cd "$E2E_TMP"
PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
PYTHONUNBUFFERED=1 \
nohup "$PY_BIN" -m celery -A application.app.celery worker \
-l INFO --pool=solo \
--without-gossip --without-mingle --without-heartbeat \
>"$CELERY_LOG" 2>&1 &
# $! is the PID of the background job just started.
echo $! > "$CELERY_PID"
)
STARTED_SERVICES+=("celery")
# First gate: wait for the literal text "celery@" to appear in the worker log.
# Seeing it is not treated as proof of readiness — the ping loop below is the
# actual readiness check.
if ! wait_for_log "celery 'celery@'" 30 "$CELERY_LOG" "celery@"; then
boot_fail "celery" "$CELERY_LOG" "never emitted 'celery@' banner within 30s"
fi
# Ready check via `celery inspect ping`. We can't grep the log for 'ready'
# because application/core/logging_config.py calls dictConfig with the default
# disable_existing_loggers=True, which silences celery.worker's ready banner.
# `inspect ping` queries the worker over the broker — it's the canonical
# responsiveness check and doesn't depend on log output.
#
# NOTE: `elapsed` counts loop iterations (one failed ping plus a 1s sleep
# each), not wall-clock seconds. A single ping attempt can itself take longer
# than a second, so the real wait may exceed CELERY_INSPECT_TIMEOUT seconds.
CELERY_INSPECT_TIMEOUT=45
elapsed=0
ping_ok=0
while (( elapsed < CELERY_INSPECT_TIMEOUT )); do
# The ping runs in a subshell with the same cwd and PYTHONPATH as the worker.
if ( cd "$E2E_TMP" && \
PYTHONPATH="$REPO_ROOT${PYTHONPATH:+:$PYTHONPATH}" \
PYTHONUNBUFFERED=1 \
"$PY_BIN" -m celery -A application.app.celery inspect ping \
--timeout 2 >/dev/null 2>&1 ); then
ping_ok=1
log " -> celery inspect ping OK after ${elapsed}s"
break
fi
sleep 1
elapsed=$(( elapsed + 1 ))
done
if (( ping_ok == 0 )); then
boot_fail "celery" "$CELERY_LOG" "worker did not respond to 'inspect ping' within ${CELERY_INSPECT_TIMEOUT}s"
fi
# ---- 4d. Vite dev server -----------------------------------------------------
# Launch `npm run dev` from frontend/ in the background and record its PID.
VITE_LOG="$SVC_LOGDIR/vite.log"
VITE_PID="$PIDDIR/vite.pid"
log "starting Vite dev server on 127.0.0.1:$VITE_PORT"
(
cd "$REPO_ROOT/frontend"
# VITE_API_HOST carries the Flask URL into the dev server's environment.
# Everything after `--` is forwarded by npm to the dev script.
VITE_API_HOST="http://127.0.0.1:${FLASK_PORT}" nohup npm run dev -- \
--host 127.0.0.1 --port "$VITE_PORT" --strictPort \
>"$VITE_LOG" 2>&1 &
# $! is the PID of the backgrounded `nohup npm ...` job, which is what gets
# recorded in the pidfile.
echo $! > "$VITE_PID"
)
STARTED_SERVICES+=("vite")
# Readiness predicate for the Vite port on 127.0.0.1.
# Tries nc first; if nc is absent or its probe fails, asks lsof for a process
# listening on the port. Returns 0 when either check succeeds, otherwise 1.
# Globals: VITE_PORT (read)
vite_ready() {
  if command -v nc >/dev/null 2>&1 \
    && nc -z 127.0.0.1 "$VITE_PORT" >/dev/null 2>&1; then
    return 0
  fi
  if command -v lsof >/dev/null 2>&1; then
    local listeners
    # -t prints bare PIDs, so any output at all means something is listening.
    listeners="$(lsof -nP -iTCP:"$VITE_PORT" -sTCP:LISTEN -t 2>/dev/null)" || true
    if [[ -n "$listeners" ]]; then
      return 0
    fi
  fi
  return 1
}
# Poll vite_ready up to 20 times (1s apart); on failure dump the Vite log and
# abort the boot.
if ! wait_for "vite LISTEN on $VITE_PORT" 20 vite_ready; then
boot_fail "vite" "$VITE_LOG" "port $VITE_PORT never entered LISTEN within 20s"
fi
# -----------------------------------------------------------------------------
# 5. Handoff
# -----------------------------------------------------------------------------
# Summarize what is running: each service's PID is read back from its pidfile
# and printed next to the path of its log.
log "all services up:"
log " mock-llm pid=$(cat "$MOCK_LLM_PID") log=$MOCK_LLM_LOG"
log " flask pid=$(cat "$FLASK_PID") log=$FLASK_LOG"
log " celery pid=$(cat "$CELERY_PID") log=$CELERY_LOG"
log " vite pid=$(cat "$VITE_PID") log=$VITE_LOG"
log "handoff complete — exiting 0, services remain running. Run scripts/e2e/down.sh to stop."
# HANDOFF_OK must be set before exiting: the EXIT trap checks it and skips the
# teardown, which is what leaves the services running after this script ends.
HANDOFF_OK=1
exit 0