From e77031025b43bac2dc89f85bb8050472fdf6e8d1 Mon Sep 17 00:00:00 2001 From: DaniAkash Date: Mon, 11 May 2026 20:41:43 +0530 Subject: [PATCH] fix(container): poll readiness probe within descriptor budget on start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ManagedContainer.start was firing the subclass `readinessProbe()` exactly once, the moment containerd reported the container as Up. For OpenClaw this raced the Node.js gateway's HTTP listener bind — containerd flips status as soon as the entrypoint process spawns, but the Express server takes a few hundred ms to start serving /readyz. Single-shot probe → unlucky → state='errored' with "Readiness probe failed after container reached running state". Pre-refactor (dev branch) didn't hit this because openclaw used a two-phase flow: `runtime.startGateway` (no probe) then `service.waitForReady` (polled /readyz for 30s). When the new runtime architecture folded openclaw under ManagedContainer, the polling was lost. Bring it into the base class: `ManagedContainer.start` now polls `readinessProbe()` within `descriptor.readinessProbe.timeoutMs` at `intervalMs` cadence. Deterministic probes (Hermes' `--version` exec) succeed on the first call and exit immediately — no extra latency. HTTP probes get the full budget they need. Also stops misapplying `descriptor.readinessProbe` to the containerd "Up" wait (which only takes ~50ms anyway — defaults are fine). --- .../container/managed/managed-container.ts | 34 ++++++++++++-- .../runtime/hermes-container-runtime.test.ts | 6 +++ .../managed/managed-container.test.ts | 47 +++++++++++++++++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/packages/browseros-agent/apps/server/src/lib/container/managed/managed-container.ts b/packages/browseros-agent/apps/server/src/lib/container/managed/managed-container.ts index 388e11da3..5c98480aa 100644 --- a/packages/browseros-agent/apps/server/src/lib/container/managed/managed-container.ts +++ b/packages/browseros-agent/apps/server/src/lib/container/managed/managed-container.ts @@ -200,14 +200,20 @@ export abstract class ManagedContainer { }) await this.deps.cli.createContainer(spec, log) await this.deps.cli.startContainer(spec.name, log) + await this.deps.cli.waitForContainerRunning(spec.name) + // Poll the subclass-defined readiness probe within the + // descriptor's budget. The container being "Up" in containerd + // only means the entrypoint process spawned — for HTTP probes + // (e.g. openclaw's /readyz) the listener can take a few hundred + // ms after that to bind. For deterministic probes (e.g. hermes' + // `--version` exec) the first call succeeds and the loop exits + // immediately. Subclasses without a transient-readiness phase + // pay nothing extra. const probeOpts = this.descriptor.readinessProbe - await this.deps.cli.waitForContainerRunning(spec.name, { + const probeOk = await this.pollReadinessProbe({ timeoutMs: probeOpts?.timeoutMs ?? 30_000, intervalMs: probeOpts?.intervalMs ?? 500, }) - // Run the subclass-defined probe — usually a `--version` exec - // or HTTP /readyz call. Failing this is errored, not stopped. - const probeOk = await this.readinessProbe() if (!probeOk) { this.setState( 'errored', @@ -228,6 +234,26 @@ export abstract class ManagedContainer { }) } + /** Poll the subclass `readinessProbe` until it returns true, errors + * swallowed (treated as not-yet-ready), or the timeout elapses. */ + private async pollReadinessProbe(opts: { + timeoutMs: number + intervalMs: number + }): Promise { + const startedAt = Date.now() + while (Date.now() - startedAt <= opts.timeoutMs) { + try { + if (await this.readinessProbe()) return true + } catch { + // Treat thrown probes as transient — keep polling within budget. + } + const remainingMs = opts.timeoutMs - (Date.now() - startedAt) + if (remainingMs <= 0) break + await Bun.sleep(Math.min(opts.intervalMs, remainingMs)) + } + return false + } + /** Stop and remove the container. Image and per-agent data preserved. */ async stop(): Promise { return this.withLifecycleLock('stop', async () => { diff --git a/packages/browseros-agent/apps/server/tests/lib/agents/runtime/hermes-container-runtime.test.ts b/packages/browseros-agent/apps/server/tests/lib/agents/runtime/hermes-container-runtime.test.ts index fe792fc30..2f2b2d25d 100644 --- a/packages/browseros-agent/apps/server/tests/lib/agents/runtime/hermes-container-runtime.test.ts +++ b/packages/browseros-agent/apps/server/tests/lib/agents/runtime/hermes-container-runtime.test.ts @@ -161,6 +161,12 @@ describe('HermesContainerRuntime', () => { browserosDir: '/host/browseros', hermesHarnessHostDir: '/host/browseros/vm/hermes/harness', }) + // Tight budget so the polling loop doesn't drag the test out for + // the full 30s readinessProbe.timeoutMs. + ;(runtime.descriptor as { readinessProbe?: unknown }).readinessProbe = { + timeoutMs: 100, + intervalMs: 10, + } await expect(runtime.start()).rejects.toThrow(/probe failed/i) expect(runtime.getState()).toBe('errored') }) diff --git a/packages/browseros-agent/apps/server/tests/lib/container/managed/managed-container.test.ts b/packages/browseros-agent/apps/server/tests/lib/container/managed/managed-container.test.ts index f16ad0344..d494f90c4 100644 --- a/packages/browseros-agent/apps/server/tests/lib/container/managed/managed-container.test.ts +++ b/packages/browseros-agent/apps/server/tests/lib/container/managed/managed-container.test.ts @@ -48,6 +48,8 @@ class TestContainer extends ManagedContainer { defaultImage: 'docker.io/test:latest', containerName: 'test-container', platforms: ['darwin' as NodeJS.Platform], + // Snappy budget so probe-fails tests don't drag the suite. + readinessProbe: { timeoutMs: 100, intervalMs: 10 }, } probeOutcome: boolean | Error = true @@ -170,6 +172,51 @@ describe('ManagedContainer', () => { expect(c.getStatusSnapshot().lastError).toMatch(/probe failed/i) }) + it('polls the readiness probe until it succeeds within the descriptor budget', async () => { + const lockDir = mkTempDir() + const deps = makeFakeDeps({ lockDir }) + const c = new TestContainer(deps) + // Tight budget so the test stays snappy even on slow CI. + ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = { + timeoutMs: 200, + intervalMs: 20, + } + // Probe fails the first two calls (mimics the HTTP-listener race + // openclaw's /readyz hits), then flips to success. + c.probeOutcome = false + let calls = 0 + const original = c.readinessProbe.bind(c) + ;( + c as unknown as { readinessProbe: () => Promise } + ).readinessProbe = async () => { + calls += 1 + if (calls < 3) return false + return original.call(c) + } + c.probeOutcome = true + + await c.start() + + expect(c.getState()).toBe('running') + expect(calls).toBeGreaterThanOrEqual(3) + }) + + it('times out when the probe never succeeds, with errored state', async () => { + const lockDir = mkTempDir() + const deps = makeFakeDeps({ lockDir }) + const c = new TestContainer(deps) + ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = { + timeoutMs: 80, + intervalMs: 20, + } + c.probeOutcome = false + + await expect(c.start()).rejects.toThrow(/probe failed/i) + expect(c.getState()).toBe('errored') + // Should have polled multiple times within the budget, not just once. + expect(c.probeCalls).toBeGreaterThanOrEqual(2) + }) + it('stop() force-transitions to stopped even from errored', async () => { const lockDir = mkTempDir() const deps = makeFakeDeps({ lockDir })