mirror of
https://github.com/browseros-ai/BrowserOS.git
synced 2026-05-13 15:46:22 +00:00
fix(container): poll readiness probe within descriptor budget on start
ManagedContainer.start was firing the subclass `readinessProbe()` exactly once, the moment containerd reported the container as Up. For OpenClaw this raced the Node.js gateway's HTTP listener bind — containerd flips status as soon as the entrypoint process spawns, but the Express server takes a few hundred ms to start serving /readyz. Single-shot probe → unlucky → state='errored' with "Readiness probe failed after container reached running state". Pre-refactor (dev branch) didn't hit this because openclaw used a two-phase flow: `runtime.startGateway` (no probe) then `service.waitForReady` (polled /readyz for 30s). When the new runtime architecture folded openclaw under ManagedContainer, the polling was lost. Bring it into the base class: `ManagedContainer.start` now polls `readinessProbe()` within `descriptor.readinessProbe.timeoutMs` at `intervalMs` cadence. Deterministic probes (Hermes' `--version` exec) succeed on the first call and exit immediately — no extra latency. HTTP probes get the full budget they need. Also stops misapplying `descriptor.readinessProbe` to the containerd "Up" wait (which only takes ~50ms anyway — defaults are fine).
This commit is contained in:
@@ -200,14 +200,20 @@ export abstract class ManagedContainer {
|
||||
})
|
||||
await this.deps.cli.createContainer(spec, log)
|
||||
await this.deps.cli.startContainer(spec.name, log)
|
||||
await this.deps.cli.waitForContainerRunning(spec.name)
|
||||
// Poll the subclass-defined readiness probe within the
|
||||
// descriptor's budget. The container being "Up" in containerd
|
||||
// only means the entrypoint process spawned — for HTTP probes
|
||||
// (e.g. openclaw's /readyz) the listener can take a few hundred
|
||||
// ms after that to bind. For deterministic probes (e.g. hermes'
|
||||
// `--version` exec) the first call succeeds and the loop exits
|
||||
// immediately. Subclasses without a transient-readiness phase
|
||||
// pay nothing extra.
|
||||
const probeOpts = this.descriptor.readinessProbe
|
||||
await this.deps.cli.waitForContainerRunning(spec.name, {
|
||||
const probeOk = await this.pollReadinessProbe({
|
||||
timeoutMs: probeOpts?.timeoutMs ?? 30_000,
|
||||
intervalMs: probeOpts?.intervalMs ?? 500,
|
||||
})
|
||||
// Run the subclass-defined probe — usually a `--version` exec
|
||||
// or HTTP /readyz call. Failing this is errored, not stopped.
|
||||
const probeOk = await this.readinessProbe()
|
||||
if (!probeOk) {
|
||||
this.setState(
|
||||
'errored',
|
||||
@@ -228,6 +234,26 @@ export abstract class ManagedContainer {
|
||||
})
|
||||
}
|
||||
|
||||
/** Poll the subclass `readinessProbe` until it returns true, errors
|
||||
* swallowed (treated as not-yet-ready), or the timeout elapses. */
|
||||
private async pollReadinessProbe(opts: {
|
||||
timeoutMs: number
|
||||
intervalMs: number
|
||||
}): Promise<boolean> {
|
||||
const startedAt = Date.now()
|
||||
while (Date.now() - startedAt <= opts.timeoutMs) {
|
||||
try {
|
||||
if (await this.readinessProbe()) return true
|
||||
} catch {
|
||||
// Treat thrown probes as transient — keep polling within budget.
|
||||
}
|
||||
const remainingMs = opts.timeoutMs - (Date.now() - startedAt)
|
||||
if (remainingMs <= 0) break
|
||||
await Bun.sleep(Math.min(opts.intervalMs, remainingMs))
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
/** Stop and remove the container. Image and per-agent data preserved. */
|
||||
async stop(): Promise<void> {
|
||||
return this.withLifecycleLock('stop', async () => {
|
||||
|
||||
@@ -161,6 +161,12 @@ describe('HermesContainerRuntime', () => {
|
||||
browserosDir: '/host/browseros',
|
||||
hermesHarnessHostDir: '/host/browseros/vm/hermes/harness',
|
||||
})
|
||||
// Tight budget so the polling loop doesn't drag the test out for
|
||||
// the full 30s readinessProbe.timeoutMs.
|
||||
;(runtime.descriptor as { readinessProbe?: unknown }).readinessProbe = {
|
||||
timeoutMs: 100,
|
||||
intervalMs: 10,
|
||||
}
|
||||
await expect(runtime.start()).rejects.toThrow(/probe failed/i)
|
||||
expect(runtime.getState()).toBe('errored')
|
||||
})
|
||||
|
||||
@@ -48,6 +48,8 @@ class TestContainer extends ManagedContainer {
|
||||
defaultImage: 'docker.io/test:latest',
|
||||
containerName: 'test-container',
|
||||
platforms: ['darwin' as NodeJS.Platform],
|
||||
// Snappy budget so probe-fails tests don't drag the suite.
|
||||
readinessProbe: { timeoutMs: 100, intervalMs: 10 },
|
||||
}
|
||||
|
||||
probeOutcome: boolean | Error = true
|
||||
@@ -170,6 +172,51 @@ describe('ManagedContainer', () => {
|
||||
expect(c.getStatusSnapshot().lastError).toMatch(/probe failed/i)
|
||||
})
|
||||
|
||||
it('polls the readiness probe until it succeeds within the descriptor budget', async () => {
|
||||
const lockDir = mkTempDir()
|
||||
const deps = makeFakeDeps({ lockDir })
|
||||
const c = new TestContainer(deps)
|
||||
// Tight budget so the test stays snappy even on slow CI.
|
||||
;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
|
||||
timeoutMs: 200,
|
||||
intervalMs: 20,
|
||||
}
|
||||
// Probe fails the first two calls (mimics the HTTP-listener race
|
||||
// openclaw's /readyz hits), then flips to success.
|
||||
c.probeOutcome = false
|
||||
let calls = 0
|
||||
const original = c.readinessProbe.bind(c)
|
||||
;(
|
||||
c as unknown as { readinessProbe: () => Promise<boolean> }
|
||||
).readinessProbe = async () => {
|
||||
calls += 1
|
||||
if (calls < 3) return false
|
||||
return original.call(c)
|
||||
}
|
||||
c.probeOutcome = true
|
||||
|
||||
await c.start()
|
||||
|
||||
expect(c.getState()).toBe('running')
|
||||
expect(calls).toBeGreaterThanOrEqual(3)
|
||||
})
|
||||
|
||||
it('times out when the probe never succeeds, with errored state', async () => {
|
||||
const lockDir = mkTempDir()
|
||||
const deps = makeFakeDeps({ lockDir })
|
||||
const c = new TestContainer(deps)
|
||||
;(c.descriptor as { readinessProbe?: unknown }).readinessProbe = {
|
||||
timeoutMs: 80,
|
||||
intervalMs: 20,
|
||||
}
|
||||
c.probeOutcome = false
|
||||
|
||||
await expect(c.start()).rejects.toThrow(/probe failed/i)
|
||||
expect(c.getState()).toBe('errored')
|
||||
// Should have polled multiple times within the budget, not just once.
|
||||
expect(c.probeCalls).toBeGreaterThanOrEqual(2)
|
||||
})
|
||||
|
||||
it('stop() force-transitions to stopped even from errored', async () => {
|
||||
const lockDir = mkTempDir()
|
||||
const deps = makeFakeDeps({ lockDir })
|
||||
|
||||
Reference in New Issue
Block a user