fix(container): poll readiness probe within descriptor budget on start

ManagedContainer.start was firing the subclass `readinessProbe()`
exactly once, the moment containerd reported the container as Up.
For OpenClaw this raced the Node.js gateway's HTTP listener bind —
containerd flips status as soon as the entrypoint process spawns, but
the Express server takes a few hundred ms to start serving /readyz.
With a single-shot probe, unlucky timing landed the container in
state='errored' with "Readiness probe failed after container reached
running state".

Pre-refactor (dev branch) didn't hit this because openclaw used a
two-phase flow: `runtime.startGateway` (no probe) then
`service.waitForReady` (polled /readyz for 30s). When the new
runtime architecture folded openclaw under ManagedContainer, the
polling was lost.

Bring it into the base class: `ManagedContainer.start` now polls
`readinessProbe()` within `descriptor.readinessProbe.timeoutMs` at
`intervalMs` cadence. Deterministic probes (Hermes' `--version` exec)
succeed on the first call and exit immediately — no extra latency.
HTTP probes get the full budget they need.
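
Concretely, a subclass opts in through its descriptor. Hypothetical
fragments below (names and values are illustrative, not the actual
openclaw/hermes descriptors):

    // Hypothetical descriptor fragments, not the real subclass code.
    const openclawDescriptor = {
      containerName: 'openclaw-gateway', // assumed name
      // HTTP /readyz can lag containerd's "Up" by a few hundred ms,
      // so give the poll loop a real budget.
      readinessProbe: { timeoutMs: 30_000, intervalMs: 500 },
    }
    const hermesDescriptor = {
      containerName: 'hermes', // assumed name
      // Deterministic `--version` exec probe: the first poll succeeds,
      // so omitting readinessProbe and falling back to the 30s/500ms
      // defaults adds no start-up latency.
    }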

Also stop misapplying `descriptor.readinessProbe` to the containerd
"Up" wait (which only takes ~50ms anyway; the defaults are fine).
DaniAkash
2026-05-11 20:41:43 +05:30
parent b6172a4109
commit e77031025b
3 changed files with 83 additions and 4 deletions


@@ -200,14 +200,20 @@ export abstract class ManagedContainer {
       })
       await this.deps.cli.createContainer(spec, log)
       await this.deps.cli.startContainer(spec.name, log)
+      await this.deps.cli.waitForContainerRunning(spec.name)
+      // Poll the subclass-defined readiness probe within the
+      // descriptor's budget. The container being "Up" in containerd
+      // only means the entrypoint process spawned — for HTTP probes
+      // (e.g. openclaw's /readyz) the listener can take a few hundred
+      // ms after that to bind. For deterministic probes (e.g. hermes'
+      // `--version` exec) the first call succeeds and the loop exits
+      // immediately. Subclasses without a transient-readiness phase
+      // pay nothing extra.
       const probeOpts = this.descriptor.readinessProbe
-      await this.deps.cli.waitForContainerRunning(spec.name, {
+      const probeOk = await this.pollReadinessProbe({
         timeoutMs: probeOpts?.timeoutMs ?? 30_000,
         intervalMs: probeOpts?.intervalMs ?? 500,
       })
-      // Run the subclass-defined probe — usually a `--version` exec
-      // or HTTP /readyz call. Failing this is errored, not stopped.
-      const probeOk = await this.readinessProbe()
       if (!probeOk) {
         this.setState(
           'errored',
@@ -228,6 +234,26 @@ export abstract class ManagedContainer {
     })
   }
 
+  /** Poll the subclass `readinessProbe` until it returns true or the
+   * timeout elapses; thrown errors are swallowed (treated as not-yet-ready). */
+  private async pollReadinessProbe(opts: {
+    timeoutMs: number
+    intervalMs: number
+  }): Promise<boolean> {
+    const startedAt = Date.now()
+    while (Date.now() - startedAt <= opts.timeoutMs) {
+      try {
+        if (await this.readinessProbe()) return true
+      } catch {
+        // Treat thrown probes as transient — keep polling within budget.
+      }
+      const remainingMs = opts.timeoutMs - (Date.now() - startedAt)
+      if (remainingMs <= 0) break
+      await Bun.sleep(Math.min(opts.intervalMs, remainingMs))
+    }
+    return false
+  }
+
   /** Stop and remove the container. Image and per-agent data preserved. */
   async stop(): Promise<void> {
     return this.withLifecycleLock('stop', async () => {


@@ -161,6 +161,12 @@ describe('HermesContainerRuntime', () => {
       browserosDir: '/host/browseros',
       hermesHarnessHostDir: '/host/browseros/vm/hermes/harness',
     })
+    // Tight budget so the polling loop doesn't drag the test out for
+    // the full 30s readinessProbe.timeoutMs.
+    ;(runtime.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 100,
+      intervalMs: 10,
+    }
     await expect(runtime.start()).rejects.toThrow(/probe failed/i)
     expect(runtime.getState()).toBe('errored')
   })


@@ -48,6 +48,8 @@ class TestContainer extends ManagedContainer {
     defaultImage: 'docker.io/test:latest',
     containerName: 'test-container',
     platforms: ['darwin' as NodeJS.Platform],
+    // Snappy budget so probe-fails tests don't drag the suite.
+    readinessProbe: { timeoutMs: 100, intervalMs: 10 },
   }
 
   probeOutcome: boolean | Error = true
@@ -170,6 +172,51 @@ describe('ManagedContainer', () => {
     expect(c.getStatusSnapshot().lastError).toMatch(/probe failed/i)
   })
 
+  it('polls the readiness probe until it succeeds within the descriptor budget', async () => {
+    const lockDir = mkTempDir()
+    const deps = makeFakeDeps({ lockDir })
+    const c = new TestContainer(deps)
+    // Tight budget so the test stays snappy even on slow CI.
+    ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 200,
+      intervalMs: 20,
+    }
+    // Probe fails the first two calls (mimics the HTTP-listener race
+    // openclaw's /readyz hits), then flips to success.
+    c.probeOutcome = false
+    let calls = 0
+    const original = c.readinessProbe.bind(c)
+    ;(
+      c as unknown as { readinessProbe: () => Promise<boolean> }
+    ).readinessProbe = async () => {
+      calls += 1
+      if (calls < 3) return false
+      return original.call(c)
+    }
+    c.probeOutcome = true
+    await c.start()
+    expect(c.getState()).toBe('running')
+    expect(calls).toBeGreaterThanOrEqual(3)
+  })
+
+  it('times out when the probe never succeeds, with errored state', async () => {
+    const lockDir = mkTempDir()
+    const deps = makeFakeDeps({ lockDir })
+    const c = new TestContainer(deps)
+    ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 80,
+      intervalMs: 20,
+    }
+    c.probeOutcome = false
+    await expect(c.start()).rejects.toThrow(/probe failed/i)
+    expect(c.getState()).toBe('errored')
+    // Should have polled multiple times within the budget, not just once.
+    expect(c.probeCalls).toBeGreaterThanOrEqual(2)
+  })
+
   it('stop() force-transitions to stopped even from errored', async () => {
     const lockDir = mkTempDir()
     const deps = makeFakeDeps({ lockDir })