test: add smoke test script and headless integration tests

Smoke test (scripts/smoke_test.sh) starts the server and runs 7 checks — health, version, dashboard, OpenAPI, session create, session list, settings. Run before releases to catch startup failures. Integration tests (32 tests) cover server boot, tool bridge completeness, permission_mode enforcement, and memory tool timeout guards so bugs like the v0.4.9 permission hang get caught by CI. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-13 21:21:53 +00:00 · 2026-03-11 12:46:26 +05:30
parent 10db2979e5
commit 57be456247
2 changed files with 863 additions and 0 deletions
--- a/scripts/smoke_test.sh
+++ b/scripts/smoke_test.sh
@@ -0,0 +1,228 @@
+#!/usr/bin/env bash
+# Smoke test for PocketPaw server.
+# Starts the server, polls until ready, hits key endpoints, and reports results.
+# Exits non-zero if any check fails. Cleans up the server process on exit.
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+BASE_URL="http://localhost:8888"
+STARTUP_TIMEOUT=30      # seconds to wait for the health endpoint to return 200
+CURL_TIMEOUT=5          # per-request timeout in seconds (passed to curl --max-time)
+SERVER_PID=""
+PASS_COUNT=0
+FAIL_COUNT=0
+
+# ---------------------------------------------------------------------------
+# Colors (ANSI escape sequences; expanded by `echo -e` at print time)
+# ---------------------------------------------------------------------------
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color (resets attributes)
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+log_pass() {  # $1: message. Counts one passed check and prints it with a green PASS tag.
+    PASS_COUNT=$((PASS_COUNT + 1))
+    echo -e "  ${GREEN}PASS${NC}  $1"
+}
+
+log_fail() {  # $1: message. Counts one failed check and prints it with a red FAIL tag.
+    FAIL_COUNT=$((FAIL_COUNT + 1))
+    echo -e "  ${RED}FAIL${NC}  $1"
+}
+
+log_info() {  # $1: message. Prints an informational line; the counters are untouched.
+    echo -e "  ${CYAN}INFO${NC}  $1"
+}
+
+cleanup() {  # Stop the background server if it was started and is still alive.
+    if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
+        log_info "Stopping PocketPaw server (PID $SERVER_PID)..."
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup EXIT; trap 'exit 130' INT; trap 'exit 143' TERM  # signals exit, which fires the EXIT trap once
+
+# ---------------------------------------------------------------------------
+# Start server
+# ---------------------------------------------------------------------------
+echo ""
+echo -e "${CYAN}=== PocketPaw Smoke Test ===${NC}"
+echo ""
+
+log_info "Starting PocketPaw server..."
+
+# Run from the project root (parent of this script's dir) so uv can find pyproject.toml
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$PROJECT_ROOT"
+
+uv run pocketpaw > /tmp/pocketpaw_smoke.log 2>&1 &  # background; stdout+stderr go to the log file
+SERVER_PID=$!  # PID of the backgrounded `uv run`; read by cleanup() and the liveness check
+
+log_info "Server PID: $SERVER_PID"
+
+# ---------------------------------------------------------------------------
+# Wait for server to be ready
+# ---------------------------------------------------------------------------
+log_info "Waiting for server to be ready (timeout: ${STARTUP_TIMEOUT}s)..."
+
+ELAPSED=0; SECONDS=0   # SECONDS is bash's wall-clock counter; reset it so it measures this wait
+while [ "$ELAPSED" -lt "$STARTUP_TIMEOUT" ]; do
+    if [ "$(curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$BASE_URL/api/v1/health" 2>/dev/null)" = "200" ]; then
+        break
+    fi
+
+    # Check if server process died
+    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo ""
+        log_fail "Server process died during startup. Last 20 lines of log:"
+        tail -20 /tmp/pocketpaw_smoke.log 2>/dev/null || true
+        exit 1
+    fi
+
+    sleep 1
+    ELAPSED=$SECONDS   # real elapsed time: one iteration can take up to 3s (2s curl + 1s sleep)
+done
+
+if [ "$ELAPSED" -ge "$STARTUP_TIMEOUT" ]; then
+    log_fail "Server did not become ready within ${STARTUP_TIMEOUT}s"
+    echo "  Last 20 lines of server log:"
+    tail -20 /tmp/pocketpaw_smoke.log 2>/dev/null || true
+    exit 1
+fi
+
+log_info "Server ready after ${ELAPSED}s"
+echo ""
+
+# ---------------------------------------------------------------------------
+# Test 1: Health endpoint returns 200
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_health.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/api/v1/health") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    log_pass "Health endpoint returned 200"
+else
+    log_fail "Health endpoint returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 2: Version endpoint returns 200 with version field
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_version.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/api/v1/version") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    # Check that response contains a "version" key
+    if grep -q '"version"' /tmp/pocketpaw_version.json 2>/dev/null; then
+        VERSION=$(python3 -c "import json; print(json.load(open('/tmp/pocketpaw_version.json'))['version'])" 2>/dev/null || echo "unknown")
+        log_pass "Version endpoint returned 200 (v$VERSION)"
+    else
+        log_fail "Version endpoint returned 200 but missing 'version' field"
+    fi
+else
+    log_fail "Version endpoint returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 3: Dashboard serves HTML at /
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_dashboard.html -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    if grep -qi '<html' /tmp/pocketpaw_dashboard.html 2>/dev/null; then
+        log_pass "Dashboard returned 200 with HTML content"
+    else
+        log_fail "Dashboard returned 200 but response is not HTML"
+    fi
+else
+    log_fail "Dashboard returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 4: OpenAPI spec is accessible
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_openapi.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/api/v1/openapi.json") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    if grep -q '"openapi"' /tmp/pocketpaw_openapi.json 2>/dev/null; then
+        log_pass "OpenAPI spec returned 200 with valid schema"
+    else
+        log_fail "OpenAPI spec returned 200 but missing 'openapi' field"
+    fi
+else
+    log_fail "OpenAPI spec returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 5: Sessions endpoint responds (create session)
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_session.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" \
+    -X POST "$BASE_URL/api/v1/sessions") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    if grep -q '"id"' /tmp/pocketpaw_session.json 2>/dev/null; then
+        log_pass "Create session returned 200 with session ID"
+    else
+        log_fail "Create session returned 200 but missing 'id' field"
+    fi
+else
+    log_fail "Create session returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 6: Sessions list endpoint responds
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_sessions_list.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/api/v1/sessions") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    log_pass "List sessions returned 200"
+else
+    log_fail "List sessions returned $HTTP_CODE (expected 200)"
+fi
+
+# ---------------------------------------------------------------------------
+# Test 7: Settings endpoint responds
+# ---------------------------------------------------------------------------
+HTTP_CODE=$(curl -s -o /tmp/pocketpaw_settings.json -w "%{http_code}" \
+    --max-time "$CURL_TIMEOUT" "$BASE_URL/api/v1/settings") || true  # curl error => "000"; report it, don't let set -e abort
+
+if [ "$HTTP_CODE" = "200" ]; then
+    log_pass "Settings endpoint returned 200"
+else
+    # Settings may require auth even on localhost in some configs — warn, don't fail hard
+    if [ "$HTTP_CODE" = "401" ] || [ "$HTTP_CODE" = "403" ]; then
+        log_info "Settings endpoint returned $HTTP_CODE (auth required — skipping)"
+    else
+        log_fail "Settings endpoint returned $HTTP_CODE (expected 200)"
+    fi
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+TOTAL=$((PASS_COUNT + FAIL_COUNT))
+echo -e "${CYAN}=== Results: ${GREEN}${PASS_COUNT} passed${NC}, ${RED}${FAIL_COUNT} failed${NC} out of ${TOTAL} checks ===${NC}"
+echo ""
+
+# A single failed check fails the whole run.
+if (( FAIL_COUNT > 0 )); then
+    echo -e "${RED}Smoke test FAILED${NC}"
+    exit 1
+fi
+echo -e "${GREEN}Smoke test PASSED${NC}"
+exit 0
--- a/tests/test_integration_headless.py
+++ b/tests/test_integration_headless.py
@@ -0,0 +1,635 @@
+# test_integration_headless.py — Integration tests for headless channel correctness.
+# Created: 2026-03-11
+#
+# Catches regressions like the permission_mode hang bug (where headless channels
+# hang because tool permissions require terminal interaction) and related issues.
+#
+# Covers:
+#   1. Server startup — FastAPI app boots and health endpoint is reachable.
+#   2. Tool bridge completeness — memory tools present for ALL backends.
+#   3. Channel adapter tool access — bypassPermissions always set in SDK options.
+#   4. Timeout guard — tool execution completes within 5 seconds (catches hangs).
+#
+# Tests marked @pytest.mark.integration require a running server or real external
+# deps and are skipped in CI by default. Run locally with:
+#   uv run pytest tests/test_integration_headless.py -v
+#   uv run pytest tests/test_integration_headless.py -v -m integration  # integration only
+
+from __future__ import annotations
+
+import asyncio
+import inspect
+from unittest.mock import MagicMock
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_settings(*, tool_profile: str = "full", bypass: bool = False) -> MagicMock:
+    """Build a mock Settings object pre-populated with safe defaults for headless tests."""
+    return MagicMock(
+        bypass_permissions=bypass,
+        agent_backend="claude_agent_sdk",
+        anthropic_api_key="sk-ant-test-key",
+        claude_sdk_model="",
+        claude_sdk_max_turns=0,
+        smart_routing_enabled=False,
+        tool_profile=tool_profile,
+        tools_allow=[],
+        tools_deny=[],
+        mcp_servers={},
+        claude_sdk_provider="anthropic",
+        ollama_base_url="http://localhost:11434",
+        openai_api_key="",
+        openai_base_url="",
+        openrouter_api_key="",
+        gemini_api_key="",
+        openai_agents_model="",
+        file_jail_path="/tmp",
+    )
+
+
+# ---------------------------------------------------------------------------
+# 1. Server startup integration tests
+# ---------------------------------------------------------------------------
+
+
+class TestServerStartup:
+    """Verify the FastAPI app mounts cleanly and the health endpoint responds.
+
+    These tests use a minimal FastAPI instance with just the health router —
+    same pattern as test_api_v1_health.py — to avoid import-time side effects
+    from dashboard.py (Settings.load, CORS origin resolution, etc.).
+    """
+
+    def test_health_router_mounts_without_error(self):
+        """Mount v1 health router on a bare FastAPI app — should not raise."""
+        from fastapi import FastAPI
+
+        from pocketpaw.api.v1.health import router
+
+        app = FastAPI()
+        # Should not raise during mount
+        app.include_router(router, prefix="/api/v1")
+
+        # Verify the expected routes are registered
+        routes = {r.path for r in app.routes}
+        assert "/api/v1/health" in routes
+        assert "/api/v1/version" in routes
+
+    def test_health_endpoint_returns_200(self):
+        """GET /api/v1/health returns HTTP 200 with a health summary."""
+        from unittest.mock import patch
+
+        from fastapi import FastAPI
+        from fastapi.testclient import TestClient
+
+        from pocketpaw.api.v1.health import router
+
+        app = FastAPI()
+        app.include_router(router, prefix="/api/v1")
+        client = TestClient(app)
+
+        mock_engine = MagicMock()
+        mock_engine.summary = {"status": "healthy", "check_count": 0, "issues": []}
+
+        with patch("pocketpaw.health.get_health_engine", return_value=mock_engine):
+            resp = client.get("/api/v1/health")
+
+        assert resp.status_code == 200
+        data = resp.json()
+        # The endpoint returns a HealthSummary — must have a "status" field
+        assert "status" in data
+
+    def test_version_endpoint_returns_package_version(self):
+        """GET /api/v1/version returns the installed pocketpaw version string."""
+        from unittest.mock import patch
+
+        from fastapi import FastAPI
+        from fastapi.testclient import TestClient
+
+        from pocketpaw.api.v1.health import router
+
+        app = FastAPI()
+        app.include_router(router, prefix="/api/v1")
+        client = TestClient(app)
+
+        mock_settings = MagicMock()
+        mock_settings.agent_backend = "claude_agent_sdk"
+
+        with patch("pocketpaw.config.Settings.load", return_value=mock_settings):
+            resp = client.get("/api/v1/version")
+
+        assert resp.status_code == 200
+        data = resp.json()
+        assert "version" in data
+        assert "python" in data
+        assert "agent_backend" in data
+
+    def test_all_critical_v1_routers_mount_without_error(self):
+        """mount_v1_routers() must not raise for Auth, Chat, Health, Sessions.
+
+        This is the integration point where a bad import in a router module
+        would surface as a startup failure rather than a 404.
+        """
+        from fastapi import FastAPI
+
+        from pocketpaw.api.v1 import mount_v1_routers
+
+        app = FastAPI()
+        # Should not raise — critical routers are re-raised by mount_v1_routers
+        mount_v1_routers(app)
+
+        # Spot-check that key health routes are registered
+        routes = {r.path for r in app.routes}
+        assert "/api/v1/health" in routes
+        assert "/api/v1/version" in routes
+
+    @pytest.mark.integration
+    def test_full_dashboard_app_health_endpoint(self):
+        """Import the full dashboard app and hit /api/v1/health via TestClient.
+
+        Marked @pytest.mark.integration — skipped in CI by default; requires all
+        dashboard dependencies. Plain `def`: TestClient is synchronous, nothing is awaited.
+        """
+        from unittest.mock import patch
+
+        from fastapi.testclient import TestClient
+
+        from pocketpaw.dashboard import app
+
+        mock_engine = MagicMock()
+        mock_engine.summary = {"status": "healthy", "check_count": 0, "issues": []}
+
+        with patch("pocketpaw.health.get_health_engine", return_value=mock_engine):
+            client = TestClient(app, raise_server_exceptions=False)
+            resp = client.get("/api/v1/health")
+
+        # Health endpoint should respond — even if status is "unknown" due to
+        # limited env, it should not 500 or hang
+        assert resp.status_code in (200, 401)  # 401 if auth middleware is active
+
+
+# ---------------------------------------------------------------------------
+# 2. Tool bridge completeness tests
+# ---------------------------------------------------------------------------
+
+
+class TestToolBridgeCompleteness:
+    """Verify memory tools (RememberTool, RecallTool, ForgetTool) are available
+    for ALL agent backends, not just some.
+
+    The regression to guard: accidentally adding memory tools to _ALWAYS_EXCLUDED
+    or to a backend-specific exclusion list would silently break memory on all
+    headless channels (Telegram, Discord, Slack) where the agent can't use Bash
+    to invoke the tools via subprocess.
+    """
+
+    # All backends that go through _instantiate_all_tools()
+    _ALL_BACKENDS = [
+        "openai_agents",
+        "google_adk",
+        "opencode",
+        "codex_cli",
+        "copilot_sdk",
+        "claude_agent_sdk",  # Different exclusion rules — shell/fs excluded, not memory
+    ]
+
+    _MEMORY_TOOL_NAMES = {"remember", "recall", "forget"}
+
+    def _get_tool_names(self, backend: str) -> set[str]:
+        """Return the set of tool names that _instantiate_all_tools returns."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        tools = _instantiate_all_tools(backend=backend)
+        return {t.name for t in tools}
+
+    @pytest.mark.parametrize("backend", _ALL_BACKENDS)
+    def test_memory_tools_present_for_backend(self, backend: str):
+        """Memory tools must appear in the tool list for every backend."""
+        tool_names = self._get_tool_names(backend)
+        missing = self._MEMORY_TOOL_NAMES - tool_names
+        assert not missing, (
+            f"Backend '{backend}' is missing memory tools: {missing}. "
+            f"These tools are required for headless channels to save/recall facts."
+        )
+
+    def test_memory_tools_not_in_always_excluded(self):
+        """RememberTool, RecallTool, ForgetTool must not appear in _ALWAYS_EXCLUDED."""
+        from pocketpaw.agents.tool_bridge import _ALWAYS_EXCLUDED
+
+        memory_class_names = {"RememberTool", "RecallTool", "ForgetTool"}
+        accidentally_excluded = memory_class_names & _ALWAYS_EXCLUDED
+        assert not accidentally_excluded, (
+            f"Memory tools accidentally added to _ALWAYS_EXCLUDED: {accidentally_excluded}. "
+            "This would break memory on ALL backends and ALL channels."
+        )
+
+    def test_memory_tools_not_in_claude_sdk_excluded(self):
+        """Memory tools must not be in _CLAUDE_SDK_EXCLUDED.
+
+        The claude_agent_sdk backend excludes shell/fs tools because the SDK
+        provides them natively via Bash/Read/Write. Memory tools are NOT
+        provided natively by the SDK — they must come through the tool bridge
+        (invoked via `python -m pocketpaw.tools.cli`).
+        """
+        from pocketpaw.agents.tool_bridge import _CLAUDE_SDK_EXCLUDED
+
+        memory_class_names = {"RememberTool", "RecallTool", "ForgetTool"}
+        accidentally_excluded = memory_class_names & _CLAUDE_SDK_EXCLUDED
+        assert not accidentally_excluded, (
+            f"Memory tools accidentally added to _CLAUDE_SDK_EXCLUDED: {accidentally_excluded}. "
+            "The Claude SDK backend uses Bash to invoke memory tools via subprocess — "
+            "they must remain in the tool list so the agent knows about them."
+        )
+
+    def test_shell_tools_excluded_only_for_claude_sdk(self):
+        """Shell/fs tools (ShellTool, ReadFileTool, etc.) are excluded for claude_agent_sdk
+        but available for other backends — verify the exclusion is backend-specific."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        # For non-SDK backends, shell tools should be included
+        openai_tools = {t.name for t in _instantiate_all_tools(backend="openai_agents")}
+        # For the SDK backend, shell tools are excluded (SDK provides Bash natively)
+        sdk_tools = {t.name for t in _instantiate_all_tools(backend="claude_agent_sdk")}
+
+        # Shell tool exists under some name in openai_agents but not claude_agent_sdk.
+        # We can't check by exact tool name easily, so only check that the SDK backend
+        # has no MORE tools than openai_agents (shell/fs exclusion can only shrink it).
+        assert len(sdk_tools) <= len(openai_tools), (
+            "Expected claude_agent_sdk to have <= tools compared to openai_agents "
+            "(shell/fs excluded from SDK backend)"
+        )
+
+    def test_remember_tool_has_correct_name(self):
+        """RememberTool.name must be 'remember' — the name used in tool policy lookups."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        tools = {t.name: t for t in _instantiate_all_tools(backend="openai_agents")}
+        assert "remember" in tools, "RememberTool not found by name 'remember'"
+        assert "recall" in tools, "RecallTool not found by name 'recall'"
+        assert "forget" in tools, "ForgetTool not found by name 'forget'"
+
+    def test_tool_bridge_returns_non_empty_list_for_all_backends(self):
+        """_instantiate_all_tools must return at least the memory tools for every backend."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        for backend in self._ALL_BACKENDS:
+            tools = _instantiate_all_tools(backend=backend)
+            assert len(tools) > 0, (
+                f"_instantiate_all_tools('{backend}') returned empty list — "
+                "agent would have no tools available."
+            )
+
+
+# ---------------------------------------------------------------------------
+# 3. Channel adapter tool access / bypassPermissions tests
+# ---------------------------------------------------------------------------
+
+
+class TestHeadlessChannelToolAccess:
+    """Verify that headless channel contexts always get bypassPermissions.
+
+    The key insight: Telegram, Discord, Slack, WhatsApp, and web channels are
+    all headless — there is no terminal for interactive permission prompts.
+    Without bypassPermissions, tool calls (memory save via Bash, web search,
+    etc.) hang indefinitely.
+
+    These tests are complementary to test_headless_permissions.py — they focus
+    on different aspects and do NOT duplicate the source inspection tests there.
+    """
+
+    def test_permission_mode_is_unconditional_in_run_source(self):
+        """bypassPermissions must be set unconditionally — not inside any if-block.
+
+        This test specifically checks that the assignment is NOT gated on any
+        settings attribute (like bypass_permissions, which defaults to False).
+
+        Complements test_headless_permissions.py::test_no_conditional_bypass_in_options_build
+        by also checking that the assignment line is not indented under a settings check.
+        """
+        from pocketpaw.agents.claude_sdk import ClaudeSDKBackend
+
+        source = inspect.getsource(ClaudeSDKBackend.run)
+        lines = source.split("\n")
+
+        # Locate the permission_mode assignment once, keeping its index. One scan
+        # with one predicate guarantees that the existence check and the guard
+        # check below inspect the same source line.
+        permission_line_idx = None
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            if (
+                "permission_mode" in stripped
+                and "=" in stripped
+                and "bypassPermissions" in stripped
+            ):
+                permission_line_idx = i
+                break
+
+        assert permission_line_idx is not None, (
+            "Could not find 'permission_mode = ...' assignment with 'bypassPermissions' in run(). "
+            "The permission bypass must be explicitly set."
+        )
+
+        # Verify the line is a direct dict assignment, not inside a conditional.
+        # A conditional guard would look like: `if ...:` on the previous non-empty line.
+        # Only the few lines directly above the assignment are inspected.
+        # Walk backwards to find the most recent non-comment, non-empty line
+        for j in range(permission_line_idx - 1, max(0, permission_line_idx - 10), -1):
+            prev = lines[j].strip()
+            if prev and not prev.startswith("#"):
+                # If the preceding substantive line is an `if` that checks bypass_permissions,
+                # the fix has regressed
+                assert "bypass_permissions" not in prev or "if" not in prev, (
+                    f"permission_mode assignment appears to be inside a bypass_permissions guard. "
+                    f"Preceding line: {prev!r}"
+                )
+                break
+
+    def test_bypass_permissions_false_does_not_gate_permission_mode(self):
+        """When bypass_permissions=False (the default), the run() source must still
+        contain the unconditional bypassPermissions assignment.
+
+        This tests the exact failure mode from the original bug: the setting defaulted
+        to False, which gated the permission_mode assignment and caused hangs.
+        """
+        from pocketpaw.agents.claude_sdk import ClaudeSDKBackend
+
+        # Construct with bypass=False (the default / the bug scenario)
+        backend = ClaudeSDKBackend(_make_settings(bypass=False))
+        source = inspect.getsource(backend.run)
+
+        # The source must not have the old conditional pattern
+        assert "if self.settings.bypass_permissions:" not in source, (
+            "Found 'if self.settings.bypass_permissions:' in run() — "
+            "this is the regression that causes tool hangs on headless channels."
+        )
+        # The unconditional assignment must be present
+        assert '"bypassPermissions"' in source or "'bypassPermissions'" in source, (
+            "bypassPermissions string not found in run() source — "
+            "permission mode is not being set."
+        )
+
+    @pytest.mark.parametrize(
+        "tool_profile",
+        ["full", "minimal", "coding"],
+        ids=["full-profile", "minimal-profile", "coding-profile"],
+    )
+    def test_memory_tools_allowed_under_all_profiles(self, tool_profile: str):
+        """Memory tools must pass ToolPolicy.is_tool_allowed() for every built-in profile.
+
+        If a profile accidentally excludes memory tools, the agent can't save/recall
+        facts on headless channels — silent data loss with no error message.
+        """
+        from pocketpaw.tools.policy import ToolPolicy
+
+        policy = ToolPolicy(profile=tool_profile, allow=[], deny=[])
+
+        for tool_name in ("remember", "recall", "forget"):
+            allowed = policy.is_tool_allowed(tool_name)
+            assert allowed, (
+                f"Tool '{tool_name}' is blocked by profile '{tool_profile}'. "
+                f"Memory tools must be available on headless channels."
+            )
+
+    def test_tool_policy_deny_list_can_block_memory_tools(self):
+        """Sanity check: explicit deny list DOES block memory tools.
+
+        This verifies the policy system works correctly — if an operator
+        explicitly denies memory tools, the policy should honor that.
+        """
+        from pocketpaw.tools.policy import ToolPolicy
+
+        policy = ToolPolicy(profile="full", allow=[], deny=["remember"])
+        assert not policy.is_tool_allowed("remember"), (
+            "Explicit deny list should block 'remember' tool."
+        )
+        # Other memory tools not denied should still pass
+        assert policy.is_tool_allowed("recall")
+        assert policy.is_tool_allowed("forget")
+
+    def test_group_memory_in_deny_blocks_all_memory_tools(self):
+        """group:memory in deny list blocks all three memory tools."""
+        from pocketpaw.tools.policy import ToolPolicy
+
+        policy = ToolPolicy(profile="full", allow=[], deny=["group:memory"])
+        for tool_name in ("remember", "recall", "forget"):
+            assert not policy.is_tool_allowed(tool_name), (
+                f"group:memory deny should block '{tool_name}'"
+            )
+
+
+# ---------------------------------------------------------------------------
+# 4. Timeout guard tests — catch hangs like the permission bug
+# ---------------------------------------------------------------------------
+
+
+class TestToolExecutionTimeout:
+    """Every memory tool call has to finish inside a 5-second budget.
+
+    The permission hang bug showed up as a call that never returned. Each
+    execution below is wrapped in asyncio.wait_for() so that a hang turns into
+    a deterministic test failure instead of a CI-level timeout.
+
+    Only the in-process execution path is exercised here; the subprocess
+    path is covered by test_headless_permissions.py.
+    """
+
+    @staticmethod
+    def _make_isolated_manager(tmp_path):
+        """Return a file-backed MemoryManager rooted under tmp_path."""
+        from pocketpaw.memory.manager import MemoryManager
+
+        # Keep all memory files under the per-test temp dir.
+        storage_dir = tmp_path / "memory"
+        manager = MemoryManager(backend="file", base_path=storage_dir)
+        return manager
+
+    @pytest.mark.asyncio
+    async def test_remember_tool_completes_within_timeout(self, tmp_path):
+        """RememberTool.execute() has to return within 5 seconds."""
+        from unittest.mock import patch
+
+        from pocketpaw.tools.builtin.memory import RememberTool
+
+        remember_tool = RememberTool()
+        manager = self._make_isolated_manager(tmp_path)
+
+        with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=manager):
+            outcome = await asyncio.wait_for(
+                remember_tool.execute(content="User name is Ade", tags=["personal"]),
+                timeout=5.0,
+            )
+
+        assert outcome is not None, "RememberTool returned None"
+        assert isinstance(outcome, (dict, str)), f"Unexpected result type: {type(outcome)}"
+
+    @pytest.mark.asyncio
+    async def test_recall_tool_completes_within_timeout(self, tmp_path):
+        """RecallTool.execute() has to return within 5 seconds, even with nothing stored."""
+        from unittest.mock import patch
+
+        from pocketpaw.tools.builtin.memory import RecallTool
+
+        recall_tool = RecallTool()
+        manager = self._make_isolated_manager(tmp_path)
+
+        with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=manager):
+            outcome = await asyncio.wait_for(
+                recall_tool.execute(query="anything"),
+                timeout=5.0,
+            )
+
+        assert outcome is not None
+
+    @pytest.mark.asyncio
+    async def test_forget_tool_completes_within_timeout(self, tmp_path):
+        """ForgetTool.execute() has to return within 5 seconds."""
+        from unittest.mock import patch
+
+        from pocketpaw.tools.builtin.memory import ForgetTool
+
+        forget_tool = ForgetTool()
+        manager = self._make_isolated_manager(tmp_path)
+
+        with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=manager):
+            outcome = await asyncio.wait_for(
+                forget_tool.execute(query="nonexistent"),
+                timeout=5.0,
+            )
+
+        assert outcome is not None
+
+    @pytest.mark.asyncio
+    async def test_remember_recall_roundtrip_within_timeout(self, tmp_path):
+        """Save a fact, then recall it; each step gets its own 5-second budget."""
+        from unittest.mock import patch
+
+        from pocketpaw.tools.builtin.memory import RecallTool, RememberTool
+
+        remember_tool = RememberTool()
+        recall_tool = RecallTool()
+        manager = self._make_isolated_manager(tmp_path)
+
+        with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=manager):
+            saved = await asyncio.wait_for(
+                remember_tool.execute(content="User name is Ade", tags=["personal"]),
+                timeout=5.0,
+            )
+            recalled = await asyncio.wait_for(
+                recall_tool.execute(query="Ade"),
+                timeout=5.0,
+            )
+
+        assert saved is not None
+        assert recalled is not None
+
+    @pytest.mark.asyncio
+    async def test_concurrent_tool_calls_complete_within_timeout(self, tmp_path):
+        """Several tool calls issued together must all finish within 5 seconds.
+
+        The permission hang bug was especially bad under concurrent load — a
+        single blocked tool call could starve the entire event loop.
+        """
+        from unittest.mock import patch
+
+        from pocketpaw.tools.builtin.memory import RecallTool, RememberTool
+
+        remember_tool = RememberTool()
+        recall_tool = RecallTool()
+        manager = self._make_isolated_manager(tmp_path)
+
+        async def _run_batch():
+            with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=manager):
+                await asyncio.gather(
+                    remember_tool.execute(content="fact one", tags=[]),
+                    remember_tool.execute(content="fact two", tags=[]),
+                    recall_tool.execute(query="fact"),
+                )
+
+        try:
+            await asyncio.wait_for(_run_batch(), timeout=5.0)
+        except asyncio.TimeoutError:
+            pytest.fail(
+                "Concurrent tool calls timed out after 5s — "
+                "possible event loop starvation from a blocking tool call."
+            )
+
+
+# ---------------------------------------------------------------------------
+# 5. Tool bridge integration — full pipeline without real SDK
+# ---------------------------------------------------------------------------
+
+
+class TestToolBridgePipelineIntegration:
+    """End-to-end tool bridge pipeline tests: policy → registry → tool list.
+
+    These tests verify the full path that agent backends take when building
+    their tool lists, without requiring the actual OpenAI/ADK SDKs to be
+    installed.
+    """
+
+    def test_instantiate_all_tools_full_profile_returns_memory_tools(self):
+        """With profile='full', all memory tools are instantiated."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        tools = _instantiate_all_tools(backend="openai_agents")
+        names = {t.name for t in tools}
+
+        assert "remember" in names
+        assert "recall" in names
+        assert "forget" in names
+
+    def test_tool_definition_schema_is_valid_for_memory_tools(self):
+        """Each memory tool's definition must have name, description, and parameters."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        tools = {t.name: t for t in _instantiate_all_tools(backend="openai_agents")}
+
+        for tool_name in ("remember", "recall", "forget"):
+            assert tool_name in tools, f"Tool '{tool_name}' not found"
+            tool = tools[tool_name]
+            defn = tool.definition
+
+            assert defn.name, f"Tool '{tool_name}' has empty name"
+            assert defn.description, f"Tool '{tool_name}' has empty description"
+            assert defn.parameters is not None, f"Tool '{tool_name}' has no parameters schema"
+            assert "properties" in defn.parameters, (
+                f"Tool '{tool_name}' parameters schema missing 'properties' key"
+            )
+
+    def test_tool_count_is_consistent_across_backends(self):
+        """The number of tools for non-SDK backends should be equal.
+
+        All non-claude_agent_sdk backends use the same exclusion set
+        (_ALWAYS_EXCLUDED only), so they should return the same tool count.
+        """
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        non_sdk_backends = ["openai_agents", "google_adk", "opencode", "codex_cli", "copilot_sdk"]
+        counts = {b: len(_instantiate_all_tools(backend=b)) for b in non_sdk_backends}
+
+        # All non-SDK backends must return the same count
+        unique_counts = set(counts.values())
+        assert len(unique_counts) == 1, (
+            f"Non-SDK backends returned different tool counts: {counts}. "
+            "This means backend-specific exclusions were accidentally added."
+        )
+
+    def test_claude_sdk_backend_has_fewer_tools_than_others(self):
+        """claude_agent_sdk excludes shell/fs tools — it must return fewer tools."""
+        from pocketpaw.agents.tool_bridge import _instantiate_all_tools
+
+        sdk_count = len(_instantiate_all_tools(backend="claude_agent_sdk"))
+        openai_count = len(_instantiate_all_tools(backend="openai_agents"))
+
+        assert sdk_count < openai_count, (
+            f"Expected claude_agent_sdk ({sdk_count} tools) to have fewer tools than "
+            f"openai_agents ({openai_count} tools). "
+            "Shell/fs tools should be excluded from the SDK backend (provided natively)."
+        )