fix(container): poll readiness probe within descriptor budget on start

ManagedContainer.start was firing the subclass `readinessProbe()`
exactly once, the moment containerd reported the container as Up.
For OpenClaw this raced the Node.js gateway's HTTP listener bind —
containerd flips status as soon as the entrypoint process spawns, but
the Express server takes a few hundred ms to start serving /readyz.
With a single-shot probe, unlucky timing landed the container in
state='errored' with "Readiness probe failed after container reached
running state".

Pre-refactor (dev branch) didn't hit this because openclaw used a
two-phase flow: `runtime.startGateway` (no probe) then
`service.waitForReady` (polled /readyz for 30s). When the new
runtime architecture folded openclaw under ManagedContainer, the
polling was lost.

Bring it into the base class: `ManagedContainer.start` now polls
`readinessProbe()` within `descriptor.readinessProbe.timeoutMs` at
`intervalMs` cadence. Deterministic probes (Hermes' `--version` exec)
succeed on the first call and exit immediately — no extra latency.
HTTP probes get the full budget they need.
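
Concretely, a subclass opts in through its descriptor. Hypothetical
fragments below (names and values are illustrative, not the actual
openclaw/hermes descriptors):

    // Hypothetical descriptor fragments, not the real subclass code.
    const openclawDescriptor = {
      containerName: 'openclaw-gateway', // assumed name
      // HTTP /readyz can lag containerd's "Up" by a few hundred ms,
      // so give the poll loop a real budget.
      readinessProbe: { timeoutMs: 30_000, intervalMs: 500 },
    }
    const hermesDescriptor = {
      containerName: 'hermes', // assumed name
      // Deterministic `--version` exec probe: the first poll succeeds,
      // so omitting readinessProbe and falling back to the 30s/500ms
      // defaults adds no start-up latency.
    }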

Also stop misapplying `descriptor.readinessProbe` to the containerd
"Up" wait (which only takes ~50ms anyway; the defaults are fine).
DaniAkash
2026-05-11 20:41:43 +05:30
parent b6172a4109
commit e77031025b
3 changed files with 83 additions and 4 deletions


@@ -200,14 +200,20 @@ export abstract class ManagedContainer {
       })
       await this.deps.cli.createContainer(spec, log)
       await this.deps.cli.startContainer(spec.name, log)
+      await this.deps.cli.waitForContainerRunning(spec.name)
+      // Poll the subclass-defined readiness probe within the
+      // descriptor's budget. The container being "Up" in containerd
+      // only means the entrypoint process spawned — for HTTP probes
+      // (e.g. openclaw's /readyz) the listener can take a few hundred
+      // ms after that to bind. For deterministic probes (e.g. hermes'
+      // `--version` exec) the first call succeeds and the loop exits
+      // immediately. Subclasses without a transient-readiness phase
+      // pay nothing extra.
       const probeOpts = this.descriptor.readinessProbe
-      await this.deps.cli.waitForContainerRunning(spec.name, {
+      const probeOk = await this.pollReadinessProbe({
         timeoutMs: probeOpts?.timeoutMs ?? 30_000,
         intervalMs: probeOpts?.intervalMs ?? 500,
       })
-      // Run the subclass-defined probe — usually a `--version` exec
-      // or HTTP /readyz call. Failing this is errored, not stopped.
-      const probeOk = await this.readinessProbe()
       if (!probeOk) {
         this.setState(
           'errored',
@@ -228,6 +234,26 @@ export abstract class ManagedContainer {
     })
   }
 
+  /** Poll the subclass `readinessProbe` until it returns true or the
+   * timeout elapses; thrown errors are swallowed (treated as not-yet-ready). */
+  private async pollReadinessProbe(opts: {
+    timeoutMs: number
+    intervalMs: number
+  }): Promise<boolean> {
+    const startedAt = Date.now()
+    while (Date.now() - startedAt <= opts.timeoutMs) {
+      try {
+        if (await this.readinessProbe()) return true
+      } catch {
+        // Treat thrown probes as transient — keep polling within budget.
+      }
+      const remainingMs = opts.timeoutMs - (Date.now() - startedAt)
+      if (remainingMs <= 0) break
+      await Bun.sleep(Math.min(opts.intervalMs, remainingMs))
+    }
+    return false
+  }
+
   /** Stop and remove the container. Image and per-agent data preserved. */
   async stop(): Promise<void> {
     return this.withLifecycleLock('stop', async () => {


@@ -161,6 +161,12 @@ describe('HermesContainerRuntime', () => {
       browserosDir: '/host/browseros',
       hermesHarnessHostDir: '/host/browseros/vm/hermes/harness',
     })
+    // Tight budget so the polling loop doesn't drag the test out for
+    // the full 30s readinessProbe.timeoutMs.
+    ;(runtime.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 100,
+      intervalMs: 10,
+    }
     await expect(runtime.start()).rejects.toThrow(/probe failed/i)
     expect(runtime.getState()).toBe('errored')
   })


@@ -48,6 +48,8 @@ class TestContainer extends ManagedContainer {
     defaultImage: 'docker.io/test:latest',
     containerName: 'test-container',
     platforms: ['darwin' as NodeJS.Platform],
+    // Snappy budget so probe-fails tests don't drag the suite.
+    readinessProbe: { timeoutMs: 100, intervalMs: 10 },
   }
 
   probeOutcome: boolean | Error = true
@@ -170,6 +172,51 @@ describe('ManagedContainer', () => {
     expect(c.getStatusSnapshot().lastError).toMatch(/probe failed/i)
   })
 
+  it('polls the readiness probe until it succeeds within the descriptor budget', async () => {
+    const lockDir = mkTempDir()
+    const deps = makeFakeDeps({ lockDir })
+    const c = new TestContainer(deps)
+    // Tight budget so the test stays snappy even on slow CI.
+    ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 200,
+      intervalMs: 20,
+    }
+    // Probe fails the first two calls (mimics the HTTP-listener race
+    // openclaw's /readyz hits), then flips to success.
+    c.probeOutcome = false
+    let calls = 0
+    const original = c.readinessProbe.bind(c)
+    ;(
+      c as unknown as { readinessProbe: () => Promise<boolean> }
+    ).readinessProbe = async () => {
+      calls += 1
+      if (calls < 3) return false
+      return original.call(c)
+    }
+    c.probeOutcome = true
+    await c.start()
+    expect(c.getState()).toBe('running')
+    expect(calls).toBeGreaterThanOrEqual(3)
+  })
+
+  it('times out when the probe never succeeds, with errored state', async () => {
+    const lockDir = mkTempDir()
+    const deps = makeFakeDeps({ lockDir })
+    const c = new TestContainer(deps)
+    ;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
+      timeoutMs: 80,
+      intervalMs: 20,
+    }
+    c.probeOutcome = false
+    await expect(c.start()).rejects.toThrow(/probe failed/i)
+    expect(c.getState()).toBe('errored')
+    // Should have polled multiple times within the budget, not just once.
+    expect(c.probeCalls).toBeGreaterThanOrEqual(2)
+  })
+
   it('stop() force-transitions to stopped even from errored', async () => {
     const lockDir = mkTempDir()
     const deps = makeFakeDeps({ lockDir })