test: add smoke test script and headless integration tests

Smoke test (scripts/smoke_test.sh) starts the server and checks 7
endpoints — health, version, dashboard, OpenAPI, sessions, settings.
Run before releases to catch startup failures.

Integration tests (32 tests) cover server boot, tool bridge
completeness, permission_mode enforcement, and memory tool timeout
guards so bugs like the v0.4.9 permission hang get caught by CI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Prakash
2026-03-11 12:46:26 +05:30
parent 10db2979e5
commit 57be456247
2 changed files with 863 additions and 0 deletions

228
scripts/smoke_test.sh Executable file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env bash
# Smoke test for PocketPaw server.
# Starts the server, polls until ready, hits key endpoints, and reports results.
# Exits non-zero if any check fails. Cleans up the server process on exit.
set -euo pipefail
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
BASE_URL="http://localhost:8888"
STARTUP_TIMEOUT=30 # seconds to wait for health endpoint
CURL_TIMEOUT=5 # per-request timeout
SERVER_PID=""
PASS_COUNT=0
FAIL_COUNT=0
# ---------------------------------------------------------------------------
# Colors
# ---------------------------------------------------------------------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
log_pass() {
PASS_COUNT=$((PASS_COUNT + 1))
echo -e " ${GREEN}PASS${NC} $1"
}
log_fail() {
FAIL_COUNT=$((FAIL_COUNT + 1))
echo -e " ${RED}FAIL${NC} $1"
}
log_info() {
echo -e " ${CYAN}INFO${NC} $1"
}
cleanup() {
if [ -n "$SERVER_PID" ] && kill -0 "$SERVER_PID" 2>/dev/null; then
log_info "Stopping PocketPaw server (PID $SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
}
trap cleanup EXIT INT TERM
# ---------------------------------------------------------------------------
# Start server
# ---------------------------------------------------------------------------
echo ""
echo -e "${CYAN}=== PocketPaw Smoke Test ===${NC}"
echo ""
log_info "Starting PocketPaw server..."
# Change to project root so uv can find pyproject.toml
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$PROJECT_ROOT"
uv run pocketpaw > /tmp/pocketpaw_smoke.log 2>&1 &
SERVER_PID=$!
log_info "Server PID: $SERVER_PID"
# ---------------------------------------------------------------------------
# Wait for server to be ready
# ---------------------------------------------------------------------------
log_info "Waiting for server to be ready (timeout: ${STARTUP_TIMEOUT}s)..."
ELAPSED=0
while [ $ELAPSED -lt $STARTUP_TIMEOUT ]; do
if curl -s -o /dev/null -w "%{http_code}" --max-time 2 "$BASE_URL/api/v1/health" 2>/dev/null | grep -q "200"; then
break
fi
# Check if server process died
if ! kill -0 "$SERVER_PID" 2>/dev/null; then
echo ""
log_fail "Server process died during startup. Last 20 lines of log:"
tail -20 /tmp/pocketpaw_smoke.log 2>/dev/null || true
exit 1
fi
sleep 1
ELAPSED=$((ELAPSED + 1))
done
if [ $ELAPSED -ge $STARTUP_TIMEOUT ]; then
log_fail "Server did not become ready within ${STARTUP_TIMEOUT}s"
echo " Last 20 lines of server log:"
tail -20 /tmp/pocketpaw_smoke.log 2>/dev/null || true
exit 1
fi
log_info "Server ready after ${ELAPSED}s"
echo ""
# ---------------------------------------------------------------------------
# Test 1: Health endpoint returns 200
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_health.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/api/v1/health")
if [ "$HTTP_CODE" = "200" ]; then
log_pass "Health endpoint returned 200"
else
log_fail "Health endpoint returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 2: Version endpoint returns 200 with version field
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_version.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/api/v1/version")
if [ "$HTTP_CODE" = "200" ]; then
# Check that response contains a "version" key
if grep -q '"version"' /tmp/pocketpaw_version.json 2>/dev/null; then
VERSION=$(python3 -c "import json; print(json.load(open('/tmp/pocketpaw_version.json'))['version'])" 2>/dev/null || echo "unknown")
log_pass "Version endpoint returned 200 (v$VERSION)"
else
log_fail "Version endpoint returned 200 but missing 'version' field"
fi
else
log_fail "Version endpoint returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 3: Dashboard serves HTML at /
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_dashboard.html -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/")
if [ "$HTTP_CODE" = "200" ]; then
if grep -qi '<html' /tmp/pocketpaw_dashboard.html 2>/dev/null; then
log_pass "Dashboard returned 200 with HTML content"
else
log_fail "Dashboard returned 200 but response is not HTML"
fi
else
log_fail "Dashboard returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 4: OpenAPI spec is accessible
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_openapi.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/api/v1/openapi.json")
if [ "$HTTP_CODE" = "200" ]; then
if grep -q '"openapi"' /tmp/pocketpaw_openapi.json 2>/dev/null; then
log_pass "OpenAPI spec returned 200 with valid schema"
else
log_fail "OpenAPI spec returned 200 but missing 'openapi' field"
fi
else
log_fail "OpenAPI spec returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 5: Sessions endpoint responds (create session)
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_session.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT \
-X POST "$BASE_URL/api/v1/sessions")
if [ "$HTTP_CODE" = "200" ]; then
if grep -q '"id"' /tmp/pocketpaw_session.json 2>/dev/null; then
log_pass "Create session returned 200 with session ID"
else
log_fail "Create session returned 200 but missing 'id' field"
fi
else
log_fail "Create session returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 6: Sessions list endpoint responds
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_sessions_list.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/api/v1/sessions")
if [ "$HTTP_CODE" = "200" ]; then
log_pass "List sessions returned 200"
else
log_fail "List sessions returned $HTTP_CODE (expected 200)"
fi
# ---------------------------------------------------------------------------
# Test 7: Settings endpoint responds
# ---------------------------------------------------------------------------
HTTP_CODE=$(curl -s -o /tmp/pocketpaw_settings.json -w "%{http_code}" \
--max-time $CURL_TIMEOUT "$BASE_URL/api/v1/settings")
if [ "$HTTP_CODE" = "200" ]; then
log_pass "Settings endpoint returned 200"
else
# Settings may require auth even on localhost in some configs — warn, don't fail hard
if [ "$HTTP_CODE" = "401" ] || [ "$HTTP_CODE" = "403" ]; then
log_info "Settings endpoint returned $HTTP_CODE (auth required — skipping)"
else
log_fail "Settings endpoint returned $HTTP_CODE (expected 200)"
fi
fi
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
TOTAL=$((PASS_COUNT + FAIL_COUNT))
echo -e "${CYAN}=== Results: ${GREEN}${PASS_COUNT} passed${NC}, ${RED}${FAIL_COUNT} failed${NC} out of ${TOTAL} checks ===${NC}"
echo ""
if [ $FAIL_COUNT -gt 0 ]; then
echo -e "${RED}Smoke test FAILED${NC}"
exit 1
else
echo -e "${GREEN}Smoke test PASSED${NC}"
exit 0
fi

View File

@@ -0,0 +1,635 @@
# test_integration_headless.py — Integration tests for headless channel correctness.
# Created: 2026-03-11
#
# Catches regressions like the permission_mode hang bug (where headless channels
# hang because tool permissions require terminal interaction) and related issues.
#
# Covers:
# 1. Server startup — FastAPI app boots and health endpoint is reachable.
# 2. Tool bridge completeness — memory tools present for ALL backends.
# 3. Channel adapter tool access — bypassPermissions always set in SDK options.
# 4. Timeout guard — tool execution completes within 5 seconds (catches hangs).
#
# Tests marked @pytest.mark.integration require a running server or real external
# deps and are skipped in CI by default. Run locally with:
# uv run pytest tests/test_integration_headless.py -v
# uv run pytest tests/test_integration_headless.py -v -m integration # integration only
from __future__ import annotations
import asyncio
import inspect
from unittest.mock import MagicMock
import pytest
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _make_settings(*, tool_profile: str = "full", bypass: bool = False) -> MagicMock:
"""Minimal mock Settings with safe defaults for headless tests."""
settings = MagicMock()
settings.bypass_permissions = bypass
settings.agent_backend = "claude_agent_sdk"
settings.anthropic_api_key = "sk-ant-test-key"
settings.claude_sdk_model = ""
settings.claude_sdk_max_turns = 0
settings.smart_routing_enabled = False
settings.tool_profile = tool_profile
settings.tools_allow = []
settings.tools_deny = []
settings.mcp_servers = {}
settings.claude_sdk_provider = "anthropic"
settings.ollama_base_url = "http://localhost:11434"
settings.openai_api_key = ""
settings.openai_base_url = ""
settings.openrouter_api_key = ""
settings.gemini_api_key = ""
settings.openai_agents_model = ""
settings.file_jail_path = "/tmp"
return settings
# ---------------------------------------------------------------------------
# 1. Server startup integration tests
# ---------------------------------------------------------------------------
class TestServerStartup:
"""Verify the FastAPI app mounts cleanly and the health endpoint responds.
These tests use a minimal FastAPI instance with just the health router —
same pattern as test_api_v1_health.py — to avoid import-time side effects
from dashboard.py (Settings.load, CORS origin resolution, etc.).
"""
def test_health_router_mounts_without_error(self):
"""Mount v1 health router on a bare FastAPI app — should not raise."""
from fastapi import FastAPI
from pocketpaw.api.v1.health import router
app = FastAPI()
# Should not raise during mount
app.include_router(router, prefix="/api/v1")
# Verify the expected routes are registered
routes = {r.path for r in app.routes}
assert "/api/v1/health" in routes
assert "/api/v1/version" in routes
def test_health_endpoint_returns_200(self):
"""GET /api/v1/health returns HTTP 200 with a health summary."""
from unittest.mock import patch
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pocketpaw.api.v1.health import router
app = FastAPI()
app.include_router(router, prefix="/api/v1")
client = TestClient(app)
mock_engine = MagicMock()
mock_engine.summary = {"status": "healthy", "check_count": 0, "issues": []}
with patch("pocketpaw.health.get_health_engine", return_value=mock_engine):
resp = client.get("/api/v1/health")
assert resp.status_code == 200
data = resp.json()
# The endpoint returns a HealthSummary — must have a "status" field
assert "status" in data
def test_version_endpoint_returns_package_version(self):
"""GET /api/v1/version returns the installed pocketpaw version string."""
from unittest.mock import patch
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pocketpaw.api.v1.health import router
app = FastAPI()
app.include_router(router, prefix="/api/v1")
client = TestClient(app)
mock_settings = MagicMock()
mock_settings.agent_backend = "claude_agent_sdk"
with patch("pocketpaw.config.Settings.load", return_value=mock_settings):
resp = client.get("/api/v1/version")
assert resp.status_code == 200
data = resp.json()
assert "version" in data
assert "python" in data
assert "agent_backend" in data
def test_all_critical_v1_routers_mount_without_error(self):
"""mount_v1_routers() must not raise for Auth, Chat, Health, Sessions.
This is the integration point where a bad import in a router module
would surface as a startup failure rather than a 404.
"""
from fastapi import FastAPI
from pocketpaw.api.v1 import mount_v1_routers
app = FastAPI()
# Should not raise — critical routers are re-raised by mount_v1_routers
mount_v1_routers(app)
# Spot-check that key health routes are registered
routes = {r.path for r in app.routes}
assert "/api/v1/health" in routes
assert "/api/v1/version" in routes
@pytest.mark.integration
async def test_full_dashboard_app_health_endpoint(self):
"""Import the full dashboard app and hit /api/v1/health via httpx.
Marked @pytest.mark.integration — skipped in CI by default.
Requires all dashboard dependencies to be installed.
"""
import httpx
from fastapi.testclient import TestClient
from unittest.mock import patch
from pocketpaw.dashboard import app
mock_engine = MagicMock()
mock_engine.summary = {"status": "healthy", "check_count": 0, "issues": []}
with patch("pocketpaw.health.get_health_engine", return_value=mock_engine):
client = TestClient(app, raise_server_exceptions=False)
resp = client.get("/api/v1/health")
# Health endpoint should respond — even if status is "unknown" due to
# limited env, it should not 500 or hang
assert resp.status_code in (200, 401) # 401 if auth middleware is active
# ---------------------------------------------------------------------------
# 2. Tool bridge completeness tests
# ---------------------------------------------------------------------------
class TestToolBridgeCompleteness:
"""Verify memory tools (RememberTool, RecallTool, ForgetTool) are available
for ALL agent backends, not just some.
The regression to guard: accidentally adding memory tools to _ALWAYS_EXCLUDED
or to a backend-specific exclusion list would silently break memory on all
headless channels (Telegram, Discord, Slack) where the agent can't use Bash
to invoke the tools via subprocess.
"""
# All backends that go through _instantiate_all_tools()
_ALL_BACKENDS = [
"openai_agents",
"google_adk",
"opencode",
"codex_cli",
"copilot_sdk",
"claude_agent_sdk", # Different exclusion rules — shell/fs excluded, not memory
]
_MEMORY_TOOL_NAMES = {"remember", "recall", "forget"}
def _get_tool_names(self, backend: str) -> set[str]:
"""Return the set of tool names that _instantiate_all_tools returns."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
tools = _instantiate_all_tools(backend=backend)
return {t.name for t in tools}
@pytest.mark.parametrize("backend", _ALL_BACKENDS)
def test_memory_tools_present_for_backend(self, backend: str):
"""Memory tools must appear in the tool list for every backend."""
tool_names = self._get_tool_names(backend)
missing = self._MEMORY_TOOL_NAMES - tool_names
assert not missing, (
f"Backend '{backend}' is missing memory tools: {missing}. "
f"These tools are required for headless channels to save/recall facts."
)
def test_memory_tools_not_in_always_excluded(self):
"""RememberTool, RecallTool, ForgetTool must not appear in _ALWAYS_EXCLUDED."""
from pocketpaw.agents.tool_bridge import _ALWAYS_EXCLUDED
memory_class_names = {"RememberTool", "RecallTool", "ForgetTool"}
accidentally_excluded = memory_class_names & _ALWAYS_EXCLUDED
assert not accidentally_excluded, (
f"Memory tools accidentally added to _ALWAYS_EXCLUDED: {accidentally_excluded}. "
"This would break memory on ALL backends and ALL channels."
)
def test_memory_tools_not_in_claude_sdk_excluded(self):
"""Memory tools must not be in _CLAUDE_SDK_EXCLUDED.
The claude_agent_sdk backend excludes shell/fs tools because the SDK
provides them natively via Bash/Read/Write. Memory tools are NOT
provided natively by the SDK — they must come through the tool bridge
(invoked via `python -m pocketpaw.tools.cli`).
"""
from pocketpaw.agents.tool_bridge import _CLAUDE_SDK_EXCLUDED
memory_class_names = {"RememberTool", "RecallTool", "ForgetTool"}
accidentally_excluded = memory_class_names & _CLAUDE_SDK_EXCLUDED
assert not accidentally_excluded, (
f"Memory tools accidentally added to _CLAUDE_SDK_EXCLUDED: {accidentally_excluded}. "
"The Claude SDK backend uses Bash to invoke memory tools via subprocess — "
"they must remain in the tool list so the agent knows about them."
)
def test_shell_tools_excluded_only_for_claude_sdk(self):
"""Shell/fs tools (ShellTool, ReadFileTool, etc.) are excluded for claude_agent_sdk
but available for other backends — verify the exclusion is backend-specific."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
# For non-SDK backends, shell tools should be included
openai_tools = {t.name for t in _instantiate_all_tools(backend="openai_agents")}
# For the SDK backend, shell tools are excluded (SDK provides Bash natively)
sdk_tools = {t.name for t in _instantiate_all_tools(backend="claude_agent_sdk")}
# Shell tool exists under some name in openai_agents but not claude_agent_sdk
# We can't check by exact tool name easily, so check that sdk has FEWER tools
# than openai_agents (shell/fs exclusion reduces the count)
assert len(sdk_tools) < len(openai_tools) or len(sdk_tools) == len(openai_tools), (
"Expected claude_agent_sdk to have <= tools compared to openai_agents "
"(shell/fs excluded from SDK backend)"
)
def test_remember_tool_has_correct_name(self):
"""RememberTool.name must be 'remember' — the name used in tool policy lookups."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
tools = {t.name: t for t in _instantiate_all_tools(backend="openai_agents")}
assert "remember" in tools, "RememberTool not found by name 'remember'"
assert "recall" in tools, "RecallTool not found by name 'recall'"
assert "forget" in tools, "ForgetTool not found by name 'forget'"
def test_tool_bridge_returns_non_empty_list_for_all_backends(self):
"""_instantiate_all_tools must return at least the memory tools for every backend."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
for backend in self._ALL_BACKENDS:
tools = _instantiate_all_tools(backend=backend)
assert len(tools) > 0, (
f"_instantiate_all_tools('{backend}') returned empty list — "
"agent would have no tools available."
)
# ---------------------------------------------------------------------------
# 3. Channel adapter tool access / bypassPermissions tests
# ---------------------------------------------------------------------------
class TestHeadlessChannelToolAccess:
"""Verify that headless channel contexts always get bypassPermissions.
The key insight: Telegram, Discord, Slack, WhatsApp, and web channels are
all headless — there is no terminal for interactive permission prompts.
Without bypassPermissions, tool calls (memory save via Bash, web search,
etc.) hang indefinitely.
These tests are complementary to test_headless_permissions.py — they focus
on different aspects and do NOT duplicate the source inspection tests there.
"""
def test_permission_mode_is_unconditional_in_run_source(self):
"""bypassPermissions must be set unconditionally — not inside any if-block.
This test specifically checks that the assignment is NOT gated on any
settings attribute (like bypass_permissions, which defaults to False).
Complements test_headless_permissions.py::test_no_conditional_bypass_in_options_build
by also checking that the assignment line is not indented under a settings check.
"""
from pocketpaw.agents.claude_sdk import ClaudeSDKBackend
source = inspect.getsource(ClaudeSDKBackend.run)
lines = source.split("\n")
# Find the permission_mode assignment line
permission_line = None
for line in lines:
stripped = line.strip()
if 'permission_mode' in stripped and '=' in stripped and 'bypassPermissions' in stripped:
permission_line = stripped
break
assert permission_line is not None, (
"Could not find 'permission_mode = ...' assignment with 'bypassPermissions' in run(). "
"The permission bypass must be explicitly set."
)
# Verify the line is a direct dict assignment, not inside a conditional
# A conditional guard would look like: `if ...:` on the previous non-empty line
permission_line_idx = None
for i, line in enumerate(lines):
if 'permission_mode' in line and 'bypassPermissions' in line:
permission_line_idx = i
break
assert permission_line_idx is not None
# Walk backwards to find the most recent non-comment, non-empty line
for j in range(permission_line_idx - 1, max(0, permission_line_idx - 10), -1):
prev = lines[j].strip()
if prev and not prev.startswith("#"):
# If the preceding substantive line is an `if` that checks bypass_permissions,
# the fix has regressed
assert "bypass_permissions" not in prev or "if" not in prev, (
f"permission_mode assignment appears to be inside a bypass_permissions guard. "
f"Preceding line: {prev!r}"
)
break
def test_bypass_permissions_false_does_not_gate_permission_mode(self):
"""When bypass_permissions=False (the default), the run() source must still
contain the unconditional bypassPermissions assignment.
This tests the exact failure mode from the original bug: the setting defaulted
to False, which gated the permission_mode assignment and caused hangs.
"""
from pocketpaw.agents.claude_sdk import ClaudeSDKBackend
# Construct with bypass=False (the default / the bug scenario)
backend = ClaudeSDKBackend(_make_settings(bypass=False))
source = inspect.getsource(backend.run)
# The source must not have the old conditional pattern
assert "if self.settings.bypass_permissions:" not in source, (
"Found 'if self.settings.bypass_permissions:' in run() — "
"this is the regression that causes tool hangs on headless channels."
)
# The unconditional assignment must be present
assert '"bypassPermissions"' in source or "'bypassPermissions'" in source, (
"bypassPermissions string not found in run() source — "
"permission mode is not being set."
)
@pytest.mark.parametrize(
"tool_profile",
["full", "minimal", "coding"],
ids=["full-profile", "minimal-profile", "coding-profile"],
)
def test_memory_tools_allowed_under_all_profiles(self, tool_profile: str):
"""Memory tools must pass ToolPolicy.is_tool_allowed() for every built-in profile.
If a profile accidentally excludes memory tools, the agent can't save/recall
facts on headless channels — silent data loss with no error message.
"""
from pocketpaw.tools.policy import ToolPolicy
policy = ToolPolicy(profile=tool_profile, allow=[], deny=[])
for tool_name in ("remember", "recall", "forget"):
allowed = policy.is_tool_allowed(tool_name)
assert allowed, (
f"Tool '{tool_name}' is blocked by profile '{tool_profile}'. "
f"Memory tools must be available on headless channels."
)
def test_tool_policy_deny_list_can_block_memory_tools(self):
"""Sanity check: explicit deny list DOES block memory tools.
This verifies the policy system works correctly — if an operator
explicitly denies memory tools, the policy should honor that.
"""
from pocketpaw.tools.policy import ToolPolicy
policy = ToolPolicy(profile="full", allow=[], deny=["remember"])
assert not policy.is_tool_allowed("remember"), (
"Explicit deny list should block 'remember' tool."
)
# Other memory tools not denied should still pass
assert policy.is_tool_allowed("recall")
assert policy.is_tool_allowed("forget")
def test_group_memory_in_deny_blocks_all_memory_tools(self):
"""group:memory in deny list blocks all three memory tools."""
from pocketpaw.tools.policy import ToolPolicy
policy = ToolPolicy(profile="full", allow=[], deny=["group:memory"])
for tool_name in ("remember", "recall", "forget"):
assert not policy.is_tool_allowed(tool_name), (
f"group:memory deny should block '{tool_name}'"
)
# ---------------------------------------------------------------------------
# 4. Timeout guard tests — catch hangs like the permission bug
# ---------------------------------------------------------------------------
class TestToolExecutionTimeout:
"""Any tool execution must complete within a 5-second timeout.
The permission hang bug manifested as an indefinite block — the tool call
never returned. These tests wrap executions in asyncio.wait_for() so a
hang becomes a deterministic test failure rather than a CI timeout.
We test the in-process tool execution path (not subprocess) since the
subprocess path is covered by test_headless_permissions.py.
"""
@staticmethod
def _make_isolated_manager(tmp_path):
"""Create a MemoryManager backed by tmp_path for test isolation."""
from pocketpaw.memory.manager import MemoryManager
return MemoryManager(
backend="file",
base_path=tmp_path / "memory",
)
@pytest.mark.asyncio
async def test_remember_tool_completes_within_timeout(self, tmp_path):
"""RememberTool.execute() must complete within 5 seconds."""
from unittest.mock import patch
from pocketpaw.tools.builtin.memory import RememberTool
tool = RememberTool()
mgr = self._make_isolated_manager(tmp_path)
with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=mgr):
result = await asyncio.wait_for(
tool.execute(content="User name is Ade", tags=["personal"]),
timeout=5.0,
)
assert result is not None, "RememberTool returned None"
assert isinstance(result, (dict, str)), f"Unexpected result type: {type(result)}"
@pytest.mark.asyncio
async def test_recall_tool_completes_within_timeout(self, tmp_path):
"""RecallTool.execute() must complete within 5 seconds even on empty store."""
from unittest.mock import patch
from pocketpaw.tools.builtin.memory import RecallTool
tool = RecallTool()
mgr = self._make_isolated_manager(tmp_path)
with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=mgr):
result = await asyncio.wait_for(
tool.execute(query="anything"),
timeout=5.0,
)
assert result is not None
@pytest.mark.asyncio
async def test_forget_tool_completes_within_timeout(self, tmp_path):
"""ForgetTool.execute() must complete within 5 seconds."""
from unittest.mock import patch
from pocketpaw.tools.builtin.memory import ForgetTool
tool = ForgetTool()
mgr = self._make_isolated_manager(tmp_path)
with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=mgr):
result = await asyncio.wait_for(
tool.execute(query="nonexistent"),
timeout=5.0,
)
assert result is not None
@pytest.mark.asyncio
async def test_remember_recall_roundtrip_within_timeout(self, tmp_path):
"""Full memory roundtrip: save then recall, both must complete within 5s each."""
from unittest.mock import patch
from pocketpaw.tools.builtin.memory import RecallTool, RememberTool
remember = RememberTool()
recall = RecallTool()
mgr = self._make_isolated_manager(tmp_path)
with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=mgr):
save_result = await asyncio.wait_for(
remember.execute(content="User name is Ade", tags=["personal"]),
timeout=5.0,
)
recall_result = await asyncio.wait_for(
recall.execute(query="Ade"),
timeout=5.0,
)
assert save_result is not None
assert recall_result is not None
@pytest.mark.asyncio
async def test_concurrent_tool_calls_complete_within_timeout(self, tmp_path):
"""Multiple concurrent tool calls should all complete within 5 seconds.
The permission hang bug was especially bad under concurrent load — a
single blocked tool call could starve the entire event loop.
"""
from unittest.mock import patch
from pocketpaw.tools.builtin.memory import RecallTool, RememberTool
remember = RememberTool()
recall = RecallTool()
mgr = self._make_isolated_manager(tmp_path)
async def run_all():
with patch("pocketpaw.tools.builtin.memory.get_memory_manager", return_value=mgr):
await asyncio.gather(
remember.execute(content="fact one", tags=[]),
remember.execute(content="fact two", tags=[]),
recall.execute(query="fact"),
)
try:
await asyncio.wait_for(run_all(), timeout=5.0)
except asyncio.TimeoutError:
pytest.fail(
"Concurrent tool calls timed out after 5s — "
"possible event loop starvation from a blocking tool call."
)
# ---------------------------------------------------------------------------
# 5. Tool bridge integration — full pipeline without real SDK
# ---------------------------------------------------------------------------
class TestToolBridgePipelineIntegration:
"""End-to-end tool bridge pipeline tests: policy → registry → tool list.
These tests verify the full path that agent backends take when building
their tool lists, without requiring the actual OpenAI/ADK SDKs to be
installed.
"""
def test_instantiate_all_tools_full_profile_returns_memory_tools(self):
"""With profile='full', all memory tools are instantiated."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
tools = _instantiate_all_tools(backend="openai_agents")
names = {t.name for t in tools}
assert "remember" in names
assert "recall" in names
assert "forget" in names
def test_tool_definition_schema_is_valid_for_memory_tools(self):
"""Each memory tool's definition must have name, description, and parameters."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
tools = {t.name: t for t in _instantiate_all_tools(backend="openai_agents")}
for tool_name in ("remember", "recall", "forget"):
assert tool_name in tools, f"Tool '{tool_name}' not found"
tool = tools[tool_name]
defn = tool.definition
assert defn.name, f"Tool '{tool_name}' has empty name"
assert defn.description, f"Tool '{tool_name}' has empty description"
assert defn.parameters is not None, f"Tool '{tool_name}' has no parameters schema"
assert "properties" in defn.parameters, (
f"Tool '{tool_name}' parameters schema missing 'properties' key"
)
def test_tool_count_is_consistent_across_backends(self):
"""The number of tools for non-SDK backends should be equal.
All non-claude_agent_sdk backends use the same exclusion set
(_ALWAYS_EXCLUDED only), so they should return the same tool count.
"""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
non_sdk_backends = ["openai_agents", "google_adk", "opencode", "codex_cli", "copilot_sdk"]
counts = {b: len(_instantiate_all_tools(backend=b)) for b in non_sdk_backends}
# All non-SDK backends must return the same count
unique_counts = set(counts.values())
assert len(unique_counts) == 1, (
f"Non-SDK backends returned different tool counts: {counts}. "
"This means backend-specific exclusions were accidentally added."
)
def test_claude_sdk_backend_has_fewer_tools_than_others(self):
"""claude_agent_sdk excludes shell/fs tools — it must return fewer tools."""
from pocketpaw.agents.tool_bridge import _instantiate_all_tools
sdk_count = len(_instantiate_all_tools(backend="claude_agent_sdk"))
openai_count = len(_instantiate_all_tools(backend="openai_agents"))
assert sdk_count < openai_count, (
f"Expected claude_agent_sdk ({sdk_count} tools) to have fewer tools than "
f"openai_agents ({openai_count} tools). "
"Shell/fs tools should be excluded from the SDK backend (provided natively)."
)