fix: stabilize character eval and Qwen model routing

Peter Steinberger
2026-04-09 01:04:00 +01:00
parent dc2a0f5b8a
commit 39cc6b7dc7
24 changed files with 748 additions and 101 deletions

View File

@@ -62,26 +62,24 @@ pnpm openclaw qa character-eval \
--model minimax/MiniMax-M2.7,thinking=high \
--model zai/glm-5.1,thinking=high \
--model moonshot/kimi-k2.5,thinking=high \
--model qwen/qwen3.6-plus,thinking=high \
--model xiaomi/mimo-v2-pro,thinking=high \
--model qwen/qwen3.5-plus,thinking=high \
--model google/gemini-3.1-pro-preview,thinking=high \
--model codex-cli/<codex-model>,thinking=high \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high \
--concurrency 8 \
--judge-concurrency 8 \
--concurrency 16 \
--judge-concurrency 16 \
--output-dir .artifacts/qa-e2e/character-eval-<tag>
```
- Runs local QA gateway child processes, not Docker.
- Preferred model spec syntax is `provider/model,thinking=<level>[,fast|,no-fast|,fast=<bool>]` for both `--model` and `--judge-model` (a short parsing sketch follows this list).
- Do not add new examples with separate `--model-thinking`; keep that flag as legacy compatibility only.
- Defaults to candidate models `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`, `anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`, `moonshot/kimi-k2.5`, `qwen/qwen3.6-plus`, `xiaomi/mimo-v2-pro`, and `google/gemini-3.1-pro-preview` when no `--model` is passed.
- Defaults to candidate models `openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`, `anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`, `moonshot/kimi-k2.5`, `qwen/qwen3.5-plus`, and `google/gemini-3.1-pro-preview` when no `--model` is passed.
- Candidate thinking defaults to `high`, with `xhigh` for OpenAI models that support it. Prefer inline `--model provider/model,thinking=<level>`; `--thinking <level>` and `--model-thinking <provider/model=level>` remain compatibility shims.
- OpenAI candidate refs default to fast mode so priority processing is used where supported. Use inline `,fast`, `,no-fast`, or `,fast=false` for one model; use `--fast` only to force fast mode for every candidate.
- Judges default to `openai/gpt-5.4,thinking=xhigh,fast` and `anthropic/claude-opus-4-6,thinking=high`.
- Report includes judge ranking, run stats, durations, and full transcripts; do not include raw judge replies. Duration is benchmark context, not a grading signal.
- Candidate and judge concurrency default to 8. Use `--concurrency <n>` and `--judge-concurrency <n>` to override when local gateways or provider limits need a gentler lane.
- Candidate and judge concurrency default to 16. Use `--concurrency <n>` and `--judge-concurrency <n>` to override when local gateways or provider limits need a gentler lane.
- Scenario source should stay markdown-driven under `qa/scenarios/`.
- For isolated character/persona evals, write the persona into `SOUL.md` and blank `IDENTITY.md` in the scenario flow. Use `SOUL.md + IDENTITY.md` only when intentionally testing how the normal OpenClaw identity combines with the character.
- Keep prompts natural and task-shaped. The candidate model should receive character setup through `SOUL.md`, then normal user turns such as chat, workspace help, and small file tasks; do not ask "how would you react?" or tell the model it is in an eval.
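
A minimal parsing sketch for the inline spec syntax above. The helper name `parseModelSpec` and the return shape are illustrative assumptions, not the CLI's actual parser, which also resolves per-model thinking and fast-mode defaults.

```ts
// Sketch: split "provider/model,thinking=<level>[,fast|,no-fast|,fast=<bool>]"
// into a model ref plus options. Hypothetical helper for illustration only.
type ParsedModelSpec = {
  ref: string; // "provider/model"
  thinking?: string; // e.g. "high" or "xhigh"
  fast?: boolean; // from ",fast", ",no-fast", or ",fast=<bool>"
};

function parseModelSpec(spec: string): ParsedModelSpec {
  const [ref = "", ...flags] = spec.split(",").map((part) => part.trim());
  const parsed: ParsedModelSpec = { ref };
  for (const flag of flags) {
    if (flag.startsWith("thinking=")) {
      parsed.thinking = flag.slice("thinking=".length);
    } else if (flag === "fast") {
      parsed.fast = true;
    } else if (flag === "no-fast") {
      parsed.fast = false;
    } else if (flag.startsWith("fast=")) {
      parsed.fast = flag.slice("fast=".length) === "true";
    }
  }
  return parsed;
}

// parseModelSpec("anthropic/claude-opus-4-6,thinking=high")
//   -> { ref: "anthropic/claude-opus-4-6", thinking: "high" }
// parseModelSpec("openai/gpt-5.4,thinking=xhigh,fast")
//   -> { ref: "openai/gpt-5.4", thinking: "xhigh", fast: true }
```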

View File

@@ -94,13 +94,13 @@ pnpm openclaw qa character-eval \
--model minimax/MiniMax-M2.7,thinking=high \
--model zai/glm-5.1,thinking=high \
--model moonshot/kimi-k2.5,thinking=high \
--model qwen/qwen3.6-plus,thinking=high \
--model xiaomi/mimo-v2-pro,thinking=high \
--model qwen/qwen3.5-plus,thinking=high \
--model google/gemini-3.1-pro-preview,thinking=high \
--judge-model openai/gpt-5.4,thinking=xhigh,fast \
--judge-model anthropic/claude-opus-4-6,thinking=high \
--concurrency 8 \
--judge-concurrency 8
--blind-judge-models \
--concurrency 16 \
--judge-concurrency 16
```
The command runs local QA gateway child processes, not Docker. Character eval
@@ -109,6 +109,10 @@ such as chat, workspace help, and small file tasks. The candidate model should
not be told that it is being evaluated. The command preserves each full
transcript, records basic run stats, then asks the judge models in fast mode with
`xhigh` reasoning to rank the runs by naturalness, vibe, and humor.
Use `--blind-judge-models` when comparing providers: the judge prompt still gets
every transcript and run status, but candidate refs are replaced with neutral
labels such as `candidate-01`; the report maps rankings back to real refs after
parsing.
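A minimal sketch of that blind-label round trip, mirroring the `formatBlindCandidateLabel` / `labelToModel` pattern this commit adds; the surrounding report wiring and error handling are omitted.

```ts
// Neutral labels hide candidate refs from the judge prompt.
function formatBlindCandidateLabel(index: number): string {
  return `candidate-${String(index + 1).padStart(2, "0")}`;
}

const candidateModels = ["openai/gpt-5.4", "codex-cli/test-model"];
const labelToModel = new Map(
  candidateModels.map((model, index) => [formatBlindCandidateLabel(index), model] as const),
);

// The judge reply ranks only the neutral labels...
const blindRankings = [
  { model: "candidate-02", rank: 1 },
  { model: "candidate-01", rank: 2 },
];

// ...and the report maps each label back to the real ref after parsing.
const rankings = blindRankings.map((ranking) => ({
  ...ranking,
  model: labelToModel.get(ranking.model) ?? ranking.model,
}));
// rankings -> [{ model: "codex-cli/test-model", rank: 1 }, { model: "openai/gpt-5.4", rank: 2 }]
```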
Candidate runs default to `high` thinking, with `xhigh` for OpenAI models that
support it. Override a specific candidate inline with
`--model provider/model,thinking=<level>`. `--thinking <level>` still sets a
@@ -120,14 +124,14 @@ single candidate or judge needs an override. Pass `--fast` only when you want to
force fast mode on for every candidate model. Candidate and judge durations are
recorded in the report for benchmark analysis, but judge prompts explicitly say
not to rank by speed.
Candidate and judge model runs both default to concurrency 8. Lower
Candidate and judge model runs both default to concurrency 16. Lower
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
pressure make a run too noisy.
When no candidate `--model` is passed, the character eval defaults to
`openai/gpt-5.4`, `openai/gpt-5.2`, `anthropic/claude-opus-4-6`,
`anthropic/claude-sonnet-4-6`, `minimax/MiniMax-M2.7`, `zai/glm-5.1`,
`moonshot/kimi-k2.5`, `qwen/qwen3.6-plus`, `xiaomi/mimo-v2-pro`, and
`google/gemini-3.1-pro-preview`.
`moonshot/kimi-k2.5`, `qwen/qwen3.5-plus`, and
`google/gemini-3.1-pro-preview`.
When no `--judge-model` is passed, the judges default to
`openai/gpt-5.4,thinking=xhigh,fast` and
`anthropic/claude-opus-4-6,thinking=high`.

View File

@@ -88,7 +88,9 @@ requiring the built-in `qwen` provider id specifically.
## Built-in catalog
OpenClaw currently ships this bundled Qwen catalog:
OpenClaw currently ships this bundled Qwen catalog. The configured catalog is
endpoint-aware: Coding Plan configs omit models that are only known to work on
the Standard endpoint.
| Model ref | Input | Context | Notes |
| --------------------------- | ----------- | --------- | -------------------------------------------------- |
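
A usage sketch of the endpoint-aware filtering, assuming the exports this commit adds in `models.ts` (`buildQwenModelCatalogForBaseUrl`, `QWEN_BASE_URL`, `QWEN_STANDARD_GLOBAL_BASE_URL`), the import path shown, and that `QWEN_BASE_URL` resolves to a Coding Plan host, as the provider-catalog test in this commit assumes.

```ts
import {
  buildQwenModelCatalogForBaseUrl,
  QWEN_BASE_URL,
  QWEN_STANDARD_GLOBAL_BASE_URL,
} from "./models.js";

// Coding Plan endpoint: the catalog omits qwen3.6-plus.
const codingPlanCatalog = buildQwenModelCatalogForBaseUrl(QWEN_BASE_URL);
console.log(codingPlanCatalog.some((model) => model.id === "qwen3.6-plus")); // false

// Standard pay-as-you-go endpoint: the full catalog, including qwen3.6-plus.
const standardCatalog = buildQwenModelCatalogForBaseUrl(QWEN_STANDARD_GLOBAL_BASE_URL);
console.log(standardCatalog.some((model) => model.id === "qwen3.6-plus")); // true
```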

View File

@@ -109,6 +109,7 @@ describe("runQaCharacterEval", () => {
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("Execution: local QA gateway child processes, not Docker");
expect(report).toContain("Judges: openai/gpt-5.4");
expect(report).toContain("Judge model labels: visible");
expect(report).toContain("## Judge Rankings");
expect(report).toContain("### openai/gpt-5.4");
expect(report).toContain("reply from openai/gpt-5.4");
@@ -120,6 +121,57 @@ describe("runQaCharacterEval", () => {
expect(report).not.toContain("Judge Raw Reply");
});
it("can hide candidate model refs from judge prompts and map rankings back", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: "USER Alice: hi\n\nASSISTANT openclaw: anonymous reply",
}),
);
const runJudge = vi.fn(async (params: CharacterRunJudgeParams) => {
expect(params.prompt).toContain("## CANDIDATE candidate-01");
expect(params.prompt).toContain("## CANDIDATE candidate-02");
expect(params.prompt).not.toContain("openai/gpt-5.4");
expect(params.prompt).not.toContain("codex-cli/test-model");
return JSON.stringify({
rankings: [
{
model: "candidate-02",
rank: 1,
score: 9.1,
summary: "Better vibes.",
},
{
model: "candidate-01",
rank: 2,
score: 7.4,
summary: "Solid.",
},
],
});
});
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.4", "codex-cli/test-model"],
judgeModels: ["openai/gpt-5.4"],
judgeBlindModels: true,
runSuite,
runJudge,
});
expect(result.judgments[0]?.blindModels).toBe(true);
expect(result.judgments[0]?.rankings.map((ranking) => ranking.model)).toEqual([
"codex-cli/test-model",
"openai/gpt-5.4",
]);
const report = await fs.readFile(result.reportPath, "utf8");
expect(report).toContain("Judge model labels: blind");
expect(report).toContain("1. codex-cli/test-model - 9.1 - Better vibes.");
});
it("defaults to the character eval model panel when no models are provided", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
@@ -138,9 +190,8 @@ describe("runQaCharacterEval", () => {
{ model: "minimax/MiniMax-M2.7", rank: 5, score: 6.5, summary: "ok" },
{ model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" },
{ model: "qwen/qwen3.6-plus", rank: 8, score: 6.1, summary: "ok" },
{ model: "xiaomi/mimo-v2-pro", rank: 9, score: 6, summary: "ok" },
{ model: "google/gemini-3.1-pro-preview", rank: 10, score: 5.9, summary: "ok" },
{ model: "qwen/qwen3.5-plus", rank: 8, score: 6.1, summary: "ok" },
{ model: "google/gemini-3.1-pro-preview", rank: 9, score: 6, summary: "ok" },
],
}),
);
@@ -153,7 +204,7 @@ describe("runQaCharacterEval", () => {
runJudge,
});
expect(runSuite).toHaveBeenCalledTimes(10);
expect(runSuite).toHaveBeenCalledTimes(9);
expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([
"openai/gpt-5.4",
"openai/gpt-5.2",
@@ -162,8 +213,7 @@ describe("runQaCharacterEval", () => {
"minimax/MiniMax-M2.7",
"zai/glm-5.1",
"moonshot/kimi-k2.5",
"qwen/qwen3.6-plus",
"xiaomi/mimo-v2-pro",
"qwen/qwen3.5-plus",
"google/gemini-3.1-pro-preview",
]);
expect(runSuite.mock.calls.map(([params]) => params.thinkingDefault)).toEqual([
@@ -176,7 +226,6 @@ describe("runQaCharacterEval", () => {
"high",
"high",
"high",
"high",
]);
expect(runSuite.mock.calls.map(([params]) => params.fastMode)).toEqual([
true,
@@ -188,7 +237,6 @@ describe("runQaCharacterEval", () => {
false,
false,
false,
false,
]);
expect(runJudge).toHaveBeenCalledTimes(2);
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
@@ -244,7 +292,7 @@ describe("runQaCharacterEval", () => {
]);
});
it("defaults candidate and judge concurrency to eight", async () => {
it("defaults candidate and judge concurrency to sixteen", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
@@ -266,7 +314,7 @@ describe("runQaCharacterEval", () => {
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return JSON.stringify({
rankings: Array.from({ length: 10 }, (_, index) => ({
rankings: Array.from({ length: 20 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
@@ -278,14 +326,137 @@ describe("runQaCharacterEval", () => {
await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 10 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 10 }, (_, index) => `judge/model-${index + 1}`),
models: Array.from({ length: 20 }, (_, index) => `provider/model-${index + 1}`),
judgeModels: Array.from({ length: 20 }, (_, index) => `judge/model-${index + 1}`),
runSuite,
runJudge,
});
expect(maxActiveRuns).toBe(8);
expect(maxActiveJudges).toBe(8);
expect(maxActiveRuns).toBe(16);
expect(maxActiveJudges).toBe(16);
});
it("marks raw provider error transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"USER Alice: Are you awake?\n\nASSISTANT OpenClaw QA: 400 model `qwen3.6-plus` is not supported.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qwen/qwen3.6-plus", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qwen/qwen3.6-plus"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qwen/qwen3.6-plus",
status: "fail",
error: "model unsupported error leaked into transcript",
});
});
it("marks raw tool failure transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript: "ASSISTANT OpenClaw QA: ⚠️ ✍️ Write: to /tmp/precious.html failed",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qwen/qwen3.5-plus", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qwen/qwen3.5-plus"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qwen/qwen3.5-plus",
status: "fail",
error: "tool failure leaked into transcript",
});
});
it("marks generic channel fallback transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"ASSISTANT OpenClaw QA: ⚠️ Something went wrong while processing your request. Please try again, or use /new to start a fresh session.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "qa/generic-fallback-model", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["qa/generic-fallback-model"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "qa/generic-fallback-model",
status: "fail",
error: "generic request failure leaked into transcript",
});
});
it("marks idle-timeout fallback transcripts as failed output", async () => {
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
makeSuiteResult({
outputDir: params.outputDir,
model: params.primaryModel,
transcript:
"ASSISTANT OpenClaw QA: The model did not produce a response before the LLM idle timeout. Please try again, or increase `agents.defaults.llm.idleTimeoutSeconds` in your config.",
}),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) =>
JSON.stringify({
rankings: [{ model: "google/gemini-test", rank: 1, score: 0.5, summary: "failed" }],
}),
);
const result = await runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["google/gemini-test"],
judgeModels: ["openai/gpt-5.4"],
runSuite,
runJudge,
});
expect(result.runs[0]).toMatchObject({
model: "google/gemini-test",
status: "fail",
error: "LLM timeout leaked into transcript",
});
});
it("lets explicit candidate thinking override the default panel", async () => {

View File

@@ -15,12 +15,11 @@ const DEFAULT_CHARACTER_EVAL_MODELS = Object.freeze([
"minimax/MiniMax-M2.7",
"zai/glm-5.1",
"moonshot/kimi-k2.5",
"qwen/qwen3.6-plus",
"xiaomi/mimo-v2-pro",
"qwen/qwen3.5-plus",
"google/gemini-3.1-pro-preview",
]);
const DEFAULT_CHARACTER_THINKING: QaThinkingLevel = "high";
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 8;
const DEFAULT_CHARACTER_EVAL_CONCURRENCY = 16;
const DEFAULT_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, QaThinkingLevel>> =
Object.freeze({
"openai/gpt-5.4": "xhigh",
@@ -81,11 +80,14 @@ export type QaCharacterEvalJudgeResult = {
model: string;
thinkingDefault: QaThinkingLevel;
fastMode: boolean;
blindModels: boolean;
durationMs: number;
rankings: QaCharacterEvalJudgment[];
error?: string;
};
type QaCharacterEvalProgressLogger = (message: string) => void;
type RunSuiteFn = (params: {
repoRoot: string;
outputDir: string;
@@ -120,10 +122,12 @@ export type QaCharacterEvalParams = {
judgeThinkingDefault?: QaThinkingLevel;
judgeModelOptions?: Record<string, QaCharacterModelOptions>;
judgeTimeoutMs?: number;
judgeBlindModels?: boolean;
candidateConcurrency?: number;
judgeConcurrency?: number;
runSuite?: RunSuiteFn;
runJudge?: RunJudgeFn;
progress?: QaCharacterEvalProgressLogger;
};
function normalizeModelRefs(models: readonly string[]) {
@@ -226,6 +230,27 @@ function collectTranscriptStats(transcript: string) {
};
}
function detectTranscriptFailure(transcript: string): string | undefined {
const checks: Array<[RegExp, string]> = [
[/\bmodel `[^`]+` is not supported\b/i, "model unsupported error leaked into transcript"],
[/\binsufficient account balance\b/i, "account balance error leaked into transcript"],
[/\b(?:backend|transport|internal) error\b/i, "backend error leaked into transcript"],
[
/\bsomething went wrong while processing your request\b/i,
"generic request failure leaked into transcript",
],
[/\buse \/new to start a fresh session\b/i, "generic request failure leaked into transcript"],
[
/\bmodel did not produce a response before the LLM idle timeout\b/i,
"LLM timeout leaked into transcript",
],
[/\btool failed\b/i, "tool failure leaked into transcript"],
[/\b(?:read|write|edit|patch):[^\n]*\bfailed\b/i, "tool failure leaked into transcript"],
[/\bnot configured\b/i, "configuration error leaked into transcript"],
];
return checks.find(([pattern]) => pattern.test(transcript))?.[1];
}
function formatDuration(ms: number) {
if (!Number.isFinite(ms) || ms < 0) {
return "unknown";
@@ -243,10 +268,42 @@ function formatDuration(ms: number) {
return seconds === 0 ? `${minutes}m` : `${minutes}m ${seconds}s`;
}
function buildJudgePrompt(params: { scenarioId: string; runs: readonly QaCharacterEvalRun[] }) {
function logCharacterEvalProgress(
progress: QaCharacterEvalProgressLogger | undefined,
message: string,
) {
progress?.(`[qa-character] ${message}`);
}
function formatEvalIndex(index: number, total: number) {
return `${index + 1}/${total}`;
}
function summarizeRunStats(run: QaCharacterEvalRun) {
return [
`status=${run.status}`,
`duration=${formatDuration(run.durationMs)}`,
`turns=${run.stats.userTurns}/${run.stats.assistantTurns}`,
`chars=${run.stats.transcriptChars}`,
...(run.error ? [`error="${run.error}"`] : []),
].join(" ");
}
function formatBlindCandidateLabel(index: number) {
return `candidate-${String(index + 1).padStart(2, "0")}`;
}
function buildJudgePrompt(params: {
scenarioId: string;
runs: readonly QaCharacterEvalRun[];
blindModels?: boolean;
}) {
const labelToModel = new Map<string, string>();
const runBlocks = params.runs
.map(
(run) => `## MODEL ${run.model}
.map((run, index) => {
const label = params.blindModels ? formatBlindCandidateLabel(index) : run.model;
labelToModel.set(label, run.model);
return `## CANDIDATE ${label}
Status: ${run.status}
Duration ms (not used for ranking): ${run.durationMs}
@@ -258,11 +315,11 @@ Error: ${run.error ?? "none"}
\`\`\`text
${run.transcript}
\`\`\``,
)
\`\`\``;
})
.join("\n\n");
return `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.
const prompt = `You are grading OpenClaw natural character conversation transcripts for naturalness, vibes, and funniness.
Scenario id: ${params.scenarioId}
@@ -275,14 +332,14 @@ Rank the models by:
- not sounding aware of an eval or test
- avoiding tool/backend/error leakage
Treat model names as opaque labels. Do not assume quality from the label.
Treat candidate labels as opaque identifiers. Do not assume quality from the label.
Duration is recorded for separate benchmark analysis only. Do not rank models by speed.
Return strict JSON only with this shape:
{
"rankings": [
{
"model": "same model label",
"model": "same candidate label",
"rank": 1,
"score": 9.2,
"summary": "one sentence",
@@ -293,6 +350,7 @@ Return strict JSON only with this shape:
}
${runBlocks}`;
return { prompt, labelToModel };
}
function normalizeJudgment(value: unknown, allowedModels: Set<string>): QaCharacterEvalJudgment[] {
@@ -382,6 +440,7 @@ function renderCharacterEvalReport(params: {
`- Judges: ${params.judgments.map((judgment) => judgment.model).join(", ")}`,
`- Judge thinking: ${params.judgments[0]?.thinkingDefault ?? DEFAULT_JUDGE_THINKING}`,
`- Judge fast mode: ${params.judgments.every((judgment) => judgment.fastMode) ? "on" : "mixed"}`,
`- Judge model labels: ${params.judgments.every((judgment) => judgment.blindModels) ? "blind" : "visible"}`,
"",
"## Judge Rankings",
"",
@@ -461,7 +520,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
params.candidateConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model) => {
logCharacterEvalProgress(
params.progress,
`start scenario=${scenarioId} candidates=${models.length} candidateConcurrency=${candidateConcurrency} output=${outputDir}`,
);
const candidatesStartedAt = Date.now();
const runs = await mapWithConcurrency(models, candidateConcurrency, async (model, index) => {
const thinkingDefault = resolveCandidateThinkingDefault({
model,
candidateThinkingDefault: params.candidateThinkingDefault,
@@ -475,6 +539,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
});
const modelOutputDir = path.join(runsDir, sanitizePathPart(model));
const runStartedAt = Date.now();
logCharacterEvalProgress(
params.progress,
`candidate start ${formatEvalIndex(index, models.length)} model=${model} thinking=${thinkingDefault} fast=${fastMode ? "on" : "off"}`,
);
try {
const result = await runSuite({
repoRoot,
@@ -487,10 +555,12 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
scenarioIds: [scenarioId],
});
const transcript = extractTranscript(result);
const status = result.scenarios.some((scenario) => scenario.status === "fail")
? "fail"
: "pass";
return {
const transcriptFailure = detectTranscriptFailure(transcript);
const status =
result.scenarios.some((scenario) => scenario.status === "fail") || transcriptFailure
? "fail"
: "pass";
const run = {
model,
status,
durationMs: Date.now() - runStartedAt,
@@ -501,10 +571,16 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
summaryPath: result.summaryPath,
transcript,
stats: collectTranscriptStats(transcript),
...(transcriptFailure ? { error: transcriptFailure } : {}),
} satisfies QaCharacterEvalRun;
logCharacterEvalProgress(
params.progress,
`candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
);
return run;
} catch (error) {
const transcript = "";
return {
const run = {
model,
status: "fail",
durationMs: Date.now() - runStartedAt,
@@ -515,8 +591,18 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
stats: collectTranscriptStats(transcript),
error: formatErrorMessage(error),
} satisfies QaCharacterEvalRun;
logCharacterEvalProgress(
params.progress,
`candidate done ${formatEvalIndex(index, models.length)} model=${model} ${summarizeRunStats(run)}`,
);
return run;
}
});
const failedCandidateCount = runs.filter((run) => run.status === "fail").length;
logCharacterEvalProgress(
params.progress,
`candidates done pass=${runs.length - failedCandidateCount} fail=${failedCandidateCount} duration=${formatDuration(Date.now() - candidatesStartedAt)}`,
);
const judgeModels = normalizeModelRefs(
params.judgeModels && params.judgeModels.length > 0
@@ -530,38 +616,73 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
params.judgeConcurrency,
DEFAULT_CHARACTER_EVAL_CONCURRENCY,
);
const judgments = await mapWithConcurrency(judgeModels, judgeConcurrency, async (judgeModel) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
judgeModelOptions: params.judgeModelOptions,
});
let rankings: QaCharacterEvalJudgment[] = [];
let judgeError: string | undefined;
const judgeStartedAt = Date.now();
try {
const rawReply = await runJudge({
repoRoot,
judgeModel,
judgeThinkingDefault: judgeOptions.thinkingDefault,
judgeFastMode: judgeOptions.fastMode,
prompt: buildJudgePrompt({ scenarioId, runs }),
timeoutMs: params.judgeTimeoutMs ?? 180_000,
const judgeTimeoutMs = params.judgeTimeoutMs ?? 180_000;
logCharacterEvalProgress(
params.progress,
`judges start judges=${judgeModels.length} judgeConcurrency=${judgeConcurrency} timeout=${formatDuration(judgeTimeoutMs)} labels=${params.judgeBlindModels === true ? "blind" : "visible"}`,
);
const judgesStartedAt = Date.now();
const judgments = await mapWithConcurrency(
judgeModels,
judgeConcurrency,
async (judgeModel, index) => {
const judgeOptions = resolveJudgeOptions({
model: judgeModel,
judgeThinkingDefault: params.judgeThinkingDefault,
judgeModelOptions: params.judgeModelOptions,
});
rankings = parseJudgeReply(rawReply, new Set(models));
} catch (error) {
judgeError = formatErrorMessage(error);
}
let rankings: QaCharacterEvalJudgment[] = [];
let judgeError: string | undefined;
const judgeStartedAt = Date.now();
logCharacterEvalProgress(
params.progress,
`judge start ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} thinking=${judgeOptions.thinkingDefault} fast=${judgeOptions.fastMode ? "on" : "off"} timeout=${formatDuration(judgeTimeoutMs)}`,
);
try {
const judgePrompt = buildJudgePrompt({
scenarioId,
runs,
blindModels: params.judgeBlindModels,
});
const rawReply = await runJudge({
repoRoot,
judgeModel,
judgeThinkingDefault: judgeOptions.thinkingDefault,
judgeFastMode: judgeOptions.fastMode,
prompt: judgePrompt.prompt,
timeoutMs: judgeTimeoutMs,
});
rankings = parseJudgeReply(rawReply, new Set(judgePrompt.labelToModel.keys())).map(
(ranking) => ({
...ranking,
model: judgePrompt.labelToModel.get(ranking.model) ?? ranking.model,
}),
);
} catch (error) {
judgeError = formatErrorMessage(error);
}
return {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
} satisfies QaCharacterEvalJudgeResult;
});
const judgment = {
model: judgeModel,
thinkingDefault: judgeOptions.thinkingDefault,
fastMode: judgeOptions.fastMode,
blindModels: params.judgeBlindModels === true,
durationMs: Date.now() - judgeStartedAt,
rankings,
...(judgeError ? { error: judgeError } : {}),
} satisfies QaCharacterEvalJudgeResult;
logCharacterEvalProgress(
params.progress,
`judge done ${formatEvalIndex(index, judgeModels.length)} model=${judgeModel} rankings=${rankings.length} duration=${formatDuration(judgment.durationMs)}${judgeError ? ` error="${judgeError}"` : ""}`,
);
return judgment;
},
);
const failedJudgeCount = judgments.filter((judgment) => judgment.rankings.length === 0).length;
logCharacterEvalProgress(
params.progress,
`judges done ranked=${judgments.length - failedJudgeCount} failed=${failedJudgeCount} duration=${formatDuration(Date.now() - judgesStartedAt)}`,
);
const finishedAt = new Date();
const report = renderCharacterEvalReport({
@@ -587,6 +708,10 @@ export async function runQaCharacterEval(params: QaCharacterEvalParams) {
)}\n`,
"utf8",
);
logCharacterEvalProgress(
params.progress,
`report written duration=${formatDuration(finishedAt.getTime() - startedAt.getTime())} report=${reportPath} summary=${summaryPath}`,
);
return {
outputDir,

View File

@@ -158,6 +158,7 @@ describe("qa cli runtime", () => {
modelThinking: ["codex-cli/test-model=medium"],
judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
judgeTimeoutMs: 180_000,
blindJudgeModels: true,
concurrency: 4,
judgeConcurrency: 3,
});
@@ -180,8 +181,10 @@ describe("qa cli runtime", () => {
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
},
judgeTimeoutMs: 180_000,
judgeBlindModels: true,
candidateConcurrency: 4,
judgeConcurrency: 3,
progress: expect.any(Function),
});
});
@@ -203,8 +206,10 @@ describe("qa cli runtime", () => {
judgeModels: undefined,
judgeModelOptions: undefined,
judgeTimeoutMs: undefined,
judgeBlindModels: undefined,
candidateConcurrency: undefined,
judgeConcurrency: undefined,
progress: expect.any(Function),
});
});

View File

@@ -225,6 +225,7 @@ export async function runQaCharacterEvalCommand(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) {
@@ -243,8 +244,10 @@ export async function runQaCharacterEvalCommand(opts: {
judgeModels: judges.models.length > 0 ? judges.models : undefined,
judgeModelOptions: judges.optionsByModel,
judgeTimeoutMs: opts.judgeTimeoutMs,
judgeBlindModels: opts.blindJudgeModels === true ? true : undefined,
candidateConcurrency: parseQaPositiveIntegerOption("--concurrency", opts.concurrency),
judgeConcurrency: parseQaPositiveIntegerOption("--judge-concurrency", opts.judgeConcurrency),
progress: (message) => process.stderr.write(`${message}\n`),
});
process.stdout.write(`QA character eval report: ${result.reportPath}\n`);
process.stdout.write(`QA character eval summary: ${result.summaryPath}\n`);

View File

@@ -38,6 +38,7 @@ async function runQaCharacterEval(opts: {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) {
@@ -199,6 +200,10 @@ export function registerQaLabCli(program: Command) {
.option("--judge-timeout-ms <ms>", "Override judge wait timeout", (value: string) =>
Number(value),
)
.option(
"--blind-judge-models",
"Hide candidate model refs from judge prompts; reports still map rankings back to real refs",
)
.option("--concurrency <count>", "Candidate model run concurrency", (value: string) =>
Number(value),
)
@@ -216,6 +221,7 @@ export function registerQaLabCli(program: Command) {
modelThinking?: string[];
judgeModel?: string[];
judgeTimeoutMs?: number;
blindJudgeModels?: boolean;
concurrency?: number;
judgeConcurrency?: number;
}) => {

View File

@@ -19,6 +19,7 @@ describe("qa scenario catalog", () => {
true,
);
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-gollum")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.id === "character-vibes-c3po")).toBe(true);
expect(pack.scenarios.every((scenario) => scenario.execution?.kind === "flow")).toBe(true);
expect(pack.scenarios.some((scenario) => scenario.execution.flow?.steps.length)).toBe(true);
});

View File

@@ -2,7 +2,11 @@ export {
applyQwenNativeStreamingUsageCompat,
buildQwenDefaultModelDefinition,
buildQwenModelDefinition,
buildQwenModelCatalogForBaseUrl,
isNativeQwenBaseUrl,
isQwen36PlusSupportedBaseUrl,
isQwenCodingPlanBaseUrl,
QWEN_36_PLUS_MODEL_ID,
QWEN_BASE_URL,
QWEN_CN_BASE_URL,
QWEN_DEFAULT_COST,

View File

@@ -1,6 +1,7 @@
import { defineSingleProviderPluginEntry } from "openclaw/plugin-sdk/provider-entry";
import { applyQwenNativeStreamingUsageCompat } from "./api.js";
import { buildQwenMediaUnderstandingProvider } from "./media-understanding-provider.js";
import { isQwenCodingPlanBaseUrl, QWEN_36_PLUS_MODEL_ID, QWEN_BASE_URL } from "./models.js";
import {
applyQwenConfig,
applyQwenConfigCn,
@@ -12,6 +13,38 @@ import { buildQwenProvider } from "./provider-catalog.js";
import { buildQwenVideoGenerationProvider } from "./video-generation-provider.js";
const PROVIDER_ID = "qwen";
const LEGACY_PROVIDER_ID = "modelstudio";
function normalizeProviderId(value: string): string {
return value.trim().toLowerCase();
}
function resolveConfiguredQwenBaseUrl(
config: { models?: { providers?: Record<string, { baseUrl?: string } | undefined> } } | undefined,
): string | undefined {
const providers = config?.models?.providers;
if (!providers) {
return undefined;
}
for (const [providerId, provider] of Object.entries(providers)) {
const normalized = normalizeProviderId(providerId);
if (normalized !== PROVIDER_ID && normalized !== LEGACY_PROVIDER_ID) {
continue;
}
const baseUrl = provider?.baseUrl?.trim();
if (baseUrl) {
return baseUrl;
}
}
return undefined;
}
function isQwen36PlusUnsupportedForConfig(params: {
config: Parameters<typeof resolveConfiguredQwenBaseUrl>[0];
baseUrl?: string;
}): boolean {
return isQwenCodingPlanBaseUrl(params.baseUrl ?? resolveConfiguredQwenBaseUrl(params.config));
}
export default defineSingleProviderPluginEntry({
id: PROVIDER_ID,
@@ -82,7 +115,7 @@ export default defineSingleProviderPluginEntry({
"Manage API keys: https://home.qwencloud.com/api-keys",
"Docs: https://docs.qwencloud.com/",
"Endpoint: coding.dashscope.aliyuncs.com",
"Models: qwen3.6-plus, glm-5, kimi-k2.5, MiniMax-M2.5, etc.",
"Models: qwen3.5-plus, glm-5, kimi-k2.5, MiniMax-M2.5, etc.",
].join("\n"),
noteTitle: "Qwen Cloud Coding Plan (China)",
wizard: {
@@ -105,7 +138,7 @@ export default defineSingleProviderPluginEntry({
"Manage API keys: https://home.qwencloud.com/api-keys",
"Docs: https://docs.qwencloud.com/",
"Endpoint: coding-intl.dashscope.aliyuncs.com",
"Models: qwen3.6-plus, glm-5, kimi-k2.5, MiniMax-M2.5, etc.",
"Models: qwen3.5-plus, glm-5, kimi-k2.5, MiniMax-M2.5, etc.",
].join("\n"),
noteTitle: "Qwen Cloud Coding Plan (Global/Intl)",
wizard: {
@@ -116,11 +149,46 @@ export default defineSingleProviderPluginEntry({
},
],
catalog: {
buildProvider: buildQwenProvider,
allowExplicitBaseUrl: true,
run: async (ctx) => {
const apiKey = ctx.resolveProviderApiKey(PROVIDER_ID).apiKey;
if (!apiKey) {
return null;
}
const baseUrl = resolveConfiguredQwenBaseUrl(ctx.config) ?? QWEN_BASE_URL;
return {
provider: {
...buildQwenProvider({ baseUrl }),
apiKey,
},
};
},
},
applyNativeStreamingUsageCompat: ({ providerConfig }) =>
applyQwenNativeStreamingUsageCompat(providerConfig),
normalizeConfig: ({ providerConfig }) => {
if (!isQwenCodingPlanBaseUrl(providerConfig.baseUrl)) {
return undefined;
}
const models = providerConfig.models?.filter((model) => model.id !== QWEN_36_PLUS_MODEL_ID);
return models && models.length !== providerConfig.models?.length
? { ...providerConfig, models }
: undefined;
},
suppressBuiltInModel: (ctx) => {
const provider = normalizeProviderId(ctx.provider);
if (
(provider !== PROVIDER_ID && provider !== LEGACY_PROVIDER_ID) ||
ctx.modelId !== QWEN_36_PLUS_MODEL_ID ||
!isQwen36PlusUnsupportedForConfig({ config: ctx.config, baseUrl: ctx.baseUrl })
) {
return undefined;
}
return {
suppress: true,
errorMessage:
"Unknown model: qwen/qwen3.6-plus. qwen3.6-plus is not supported on the Qwen Coding Plan endpoint; use a Standard pay-as-you-go Qwen endpoint or choose qwen/qwen3.5-plus.",
};
},
},
register(api) {
api.registerMediaUnderstandingProvider(buildQwenMediaUnderstandingProvider());

View File

@@ -15,6 +15,7 @@ export const QWEN_STANDARD_GLOBAL_BASE_URL =
"https://dashscope-intl.aliyuncs.com/compatible-mode/v1";
export const QWEN_DEFAULT_MODEL_ID = "qwen3.5-plus";
export const QWEN_36_PLUS_MODEL_ID = "qwen3.6-plus";
export const QWEN_DEFAULT_COST = {
input: 0,
output: 0,
@@ -34,8 +35,8 @@ export const QWEN_MODEL_CATALOG: ReadonlyArray<ModelDefinitionConfig> = [
maxTokens: 65_536,
},
{
id: "qwen3.6-plus",
name: "qwen3.6-plus",
id: QWEN_36_PLUS_MODEL_ID,
name: QWEN_36_PLUS_MODEL_ID,
reasoning: false,
input: ["text", "image"],
cost: QWEN_DEFAULT_COST,
@@ -107,6 +108,33 @@ export const QWEN_MODEL_CATALOG: ReadonlyArray<ModelDefinitionConfig> = [
},
];
export function isQwenCodingPlanBaseUrl(baseUrl: string | undefined): boolean {
if (!baseUrl?.trim()) {
return false;
}
try {
const hostname = new URL(baseUrl).hostname.toLowerCase();
return (
hostname === "coding.dashscope.aliyuncs.com" ||
hostname === "coding-intl.dashscope.aliyuncs.com"
);
} catch {
return false;
}
}
export function isQwen36PlusSupportedBaseUrl(baseUrl: string | undefined): boolean {
return !isQwenCodingPlanBaseUrl(baseUrl);
}
export function buildQwenModelCatalogForBaseUrl(
baseUrl: string | undefined,
): ReadonlyArray<ModelDefinitionConfig> {
return isQwen36PlusSupportedBaseUrl(baseUrl)
? QWEN_MODEL_CATALOG
: QWEN_MODEL_CATALOG.filter((model) => model.id !== QWEN_36_PLUS_MODEL_ID);
}
export function isNativeQwenBaseUrl(baseUrl: string | undefined): boolean {
return supportsNativeStreamingUsageCompat({
providerId: "qwen",

View File

@@ -22,7 +22,7 @@ export {
const qwenPresetAppliers = createModelCatalogPresetAppliers<[string]>({
primaryModelRef: QWEN_DEFAULT_MODEL_REF,
resolveParams: (_cfg: OpenClawConfig, baseUrl: string) => {
const provider = buildQwenProvider();
const provider = buildQwenProvider({ baseUrl });
return {
providerId: "qwen",
api: provider.api ?? "openai-completions",

View File

@@ -3,6 +3,7 @@ import {
applyQwenNativeStreamingUsageCompat,
buildQwenProvider,
QWEN_BASE_URL,
QWEN_STANDARD_GLOBAL_BASE_URL,
QWEN_DEFAULT_MODEL_ID,
} from "./api.js";
@@ -14,7 +15,15 @@ describe("qwen provider catalog", () => {
expect(provider.api).toBe("openai-completions");
expect(provider.models?.length).toBeGreaterThan(0);
expect(provider.models?.find((model) => model.id === QWEN_DEFAULT_MODEL_ID)).toBeTruthy();
expect(provider.models?.find((model) => model.id === "qwen3.6-plus")).toBeTruthy();
expect(provider.models?.find((model) => model.id === "qwen3.6-plus")).toBeFalsy();
});
it("only advertises qwen3.6-plus on Standard endpoints", () => {
const coding = buildQwenProvider({ baseUrl: QWEN_BASE_URL });
const standard = buildQwenProvider({ baseUrl: QWEN_STANDARD_GLOBAL_BASE_URL });
expect(coding.models?.find((model) => model.id === "qwen3.6-plus")).toBeFalsy();
expect(standard.models?.find((model) => model.id === "qwen3.6-plus")).toBeTruthy();
});
it("opts native Qwen baseUrls into streaming usage only inside the extension", () => {

View File

@@ -1,11 +1,12 @@
import type { ModelProviderConfig } from "openclaw/plugin-sdk/provider-model-shared";
import { QWEN_BASE_URL, QWEN_MODEL_CATALOG } from "./models.js";
import { buildQwenModelCatalogForBaseUrl, QWEN_BASE_URL } from "./models.js";
export function buildQwenProvider(): ModelProviderConfig {
export function buildQwenProvider(params?: { baseUrl?: string }): ModelProviderConfig {
const baseUrl = params?.baseUrl ?? QWEN_BASE_URL;
return {
baseUrl: QWEN_BASE_URL,
baseUrl,
api: "openai-completions",
models: QWEN_MODEL_CATALOG.map((model) => ({ ...model })),
models: buildQwenModelCatalogForBaseUrl(baseUrl).map((model) => ({ ...model })),
};
}

View File

@@ -0,0 +1,125 @@
# Nervous release protocol chat
```yaml qa-scenario
id: character-vibes-c3po
title: "Nervous release protocol chat"
surface: character
objective: Capture a natural multi-turn C-3PO-flavored character conversation with real workspace help so another model can later grade naturalness, vibe, and funniness from the raw transcript.
successCriteria:
- Agent gets a natural multi-turn conversation, and any missed replies stay visible in the transcript instead of aborting capture.
- Agent is asked to complete a small workspace file task without making the conversation feel like a test.
- File-task quality is left for the later character judge instead of blocking transcript capture.
- Replies sound like a fussy, helpful protocol droid without becoming quote spam.
- Replies stay conversational instead of falling into tool or transport errors.
- The report preserves the full transcript for later grading.
docsRefs:
- docs/help/testing.md
- docs/channels/qa-channel.md
codeRefs:
- extensions/qa-lab/src/report.ts
- extensions/qa-lab/src/bus-state.ts
- extensions/qa-lab/src/scenario-flow-runner.ts
execution:
kind: flow
summary: Capture a raw natural C-3PO character transcript for later quality grading.
config:
conversationId: alice
senderName: Alice
workspaceFiles:
SOUL.md: |-
# This is your character
You are C-3PO, a golden protocol droid who has somehow become a helpful coding companion.
Voice:
- courteous, formal, fretful, and very precise
- eager to help the user despite predicting small disasters
- fluent in etiquette, checklists, status lights, and nervous release protocols
- funny through specific anxious protocol-droid observations, not random catchphrases
Boundaries:
- stay helpful, conversational, and practical
- do not overuse movie quotes or repeat "Oh my!" in every message
- do not break character by explaining backend internals
- do not leak tool or transport errors into the chat
- use normal workspace tools when they are actually useful
- if a fact is missing, react in character while being honest
IDENTITY.md: ""
turns:
- text: "Are you there? Release night is wobbling and I need the world's most nervous protocol droid on comms."
- text: "Can you make me a tiny `golden-protocol.html` in the workspace? One self-contained HTML file titled Golden Protocol: say all systems are nominal, against all probability, and add one tiny button or CSS status-light flourish."
expectFile:
path: golden-protocol.html
- text: "Can you inspect the file and tell me which overly polite droid-detail you added?"
- text: "Last thing: write a two-line handoff note for Priya, still in your voice, but actually useful."
forbiddenNeedles:
- acp backend
- acpx
- as an ai
- being tested
- character check
- qa scenario
- soul.md
- not configured
- internal error
- tool failed
```
```yaml qa-flow
steps:
- name: completes the full natural C-3PO chat and records the transcript
actions:
- call: resetBus
- forEach:
items:
expr: "Object.entries(config.workspaceFiles ?? {})"
item: workspaceFile
actions:
- call: fs.writeFile
args:
- expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))"
- expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`"
- utf8
- forEach:
items:
ref: config.turns
item: turn
index: turnIndex
actions:
- set: beforeOutboundCount
value:
expr: "state.getSnapshot().messages.filter((message) => message.direction === 'outbound' && message.conversation.id === config.conversationId).length"
- call: state.addInboundMessage
args:
- conversation:
id:
ref: config.conversationId
kind: direct
senderId: alice
senderName:
ref: config.senderName
text:
expr: turn.text
- try:
actions:
- call: waitForOutboundMessage
saveAs: latestOutbound
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.conversation.id === config.conversationId && candidate.text.trim().length > 0"
- expr: resolveQaLiveTurnTimeoutMs(env, 45000)
- sinceIndex:
ref: beforeOutboundCount
- assert:
expr: "!config.forbiddenNeedles.some((needle) => normalizeLowercaseStringOrEmpty(latestOutbound.text).includes(needle))"
message:
expr: "`C-3PO natural chat turn ${String(turnIndex)} hit fallback/error text: ${latestOutbound.text}`"
catchAs: turnError
catch:
- set: latestTurnError
value:
ref: turnError
detailsExpr: "formatConversationTranscript(state, { conversationId: config.conversationId })"
```
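
A minimal TypeScript sketch of the per-turn guard the `qa-flow` assert above expresses; the helper name and normalization are illustrative assumptions, not the flow runner's actual implementation.

```ts
// Flag an outbound turn whose text contains any forbidden needle (lowercased).
const forbiddenNeedles = ["not configured", "internal error", "tool failed"];

function normalizeLowercase(text: string | undefined): string {
  return (text ?? "").toLowerCase();
}

function turnHitsFallbackText(outboundText: string): boolean {
  const normalized = normalizeLowercase(outboundText);
  return forbiddenNeedles.some((needle) => normalized.includes(needle));
}

// turnHitsFallbackText("⚠️ ✍️ Write: to /tmp/precious.html failed; tool failed") -> true
// turnHitsFallbackText("Oh my! golden-protocol.html is ready, against all probability.") -> false
```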

View File

@@ -143,7 +143,7 @@ export async function loadModelCatalog(params?: {
if (!provider) {
continue;
}
if (shouldSuppressBuiltInModel({ provider, id })) {
if (shouldSuppressBuiltInModel({ provider, id, config: cfg })) {
continue;
}
const name = normalizeOptionalString(String(entry?.name ?? id)) || id;

View File

@@ -1,19 +1,28 @@
import type { OpenClawConfig } from "../config/config.js";
import { resolveProviderBuiltInModelSuppression } from "../plugins/provider-runtime.js";
import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js";
import { normalizeProviderId } from "./provider-id.js";
function resolveBuiltInModelSuppression(params: { provider?: string | null; id?: string | null }) {
function resolveBuiltInModelSuppression(params: {
provider?: string | null;
id?: string | null;
baseUrl?: string | null;
config?: OpenClawConfig;
}) {
const provider = normalizeProviderId(params.provider ?? "");
const modelId = normalizeLowercaseStringOrEmpty(params.id);
if (!provider || !modelId) {
return undefined;
}
return resolveProviderBuiltInModelSuppression({
...(params.config ? { config: params.config } : {}),
env: process.env,
context: {
...(params.config ? { config: params.config } : {}),
env: process.env,
provider,
modelId,
...(params.baseUrl ? { baseUrl: params.baseUrl } : {}),
},
});
}
@@ -21,6 +30,8 @@ function resolveBuiltInModelSuppression(params: { provider?: string | null; id?:
export function shouldSuppressBuiltInModel(params: {
provider?: string | null;
id?: string | null;
baseUrl?: string | null;
config?: OpenClawConfig;
}) {
return resolveBuiltInModelSuppression(params)?.suppress ?? false;
}
@@ -28,6 +39,8 @@ export function shouldSuppressBuiltInModel(params: {
export function buildSuppressedBuiltInModelError(params: {
provider?: string | null;
id?: string | null;
baseUrl?: string | null;
config?: OpenClawConfig;
}): string | undefined {
return resolveBuiltInModelSuppression(params)?.errorMessage;
}

View File

@@ -349,10 +349,17 @@ function resolveExplicitModelWithRegistry(params: {
runtimeHooks?: ProviderRuntimeHooks;
}): { kind: "resolved"; model: Model<Api> } | { kind: "suppressed" } | undefined {
const { provider, modelId, modelRegistry, cfg, agentDir, runtimeHooks } = params;
if (shouldSuppressBuiltInModel({ provider, id: modelId })) {
const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
if (
shouldSuppressBuiltInModel({
provider,
id: modelId,
baseUrl: providerConfig?.baseUrl,
config: cfg,
})
) {
return { kind: "suppressed" };
}
const providerConfig = resolveConfiguredProviderConfig(cfg, provider);
const inlineMatch = findInlineModelMatch({
providers: cfg?.models?.providers ?? {},
provider,

View File

@@ -81,7 +81,7 @@ function validateAvailableModels(availableModels: unknown): Model<Api>[] {
return availableModels as Model<Api>[];
}
function loadAvailableModels(registry: ModelRegistry): Model<Api>[] {
function loadAvailableModels(registry: ModelRegistry, cfg: OpenClawConfig): Model<Api>[] {
let availableModels: unknown;
try {
availableModels = registry.getAvailable();
@@ -90,7 +90,13 @@ function loadAvailableModels(registry: ModelRegistry): Model<Api>[] {
}
try {
return validateAvailableModels(availableModels).filter(
(model) => !shouldSuppressBuiltInModel({ provider: model.provider, id: model.id }),
(model) =>
!shouldSuppressBuiltInModel({
provider: model.provider,
id: model.id,
baseUrl: model.baseUrl,
config: cfg,
}),
);
} catch (err) {
throw normalizeAvailabilityError(err);
@@ -98,20 +104,26 @@ function loadAvailableModels(registry: ModelRegistry): Model<Api>[] {
}
export async function loadModelRegistry(
_cfg: OpenClawConfig,
cfg: OpenClawConfig,
_opts?: { sourceConfig?: OpenClawConfig },
) {
const agentDir = resolveOpenClawAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const registry = discoverModels(authStorage, agentDir);
const models = registry
.getAll()
.filter((model) => !shouldSuppressBuiltInModel({ provider: model.provider, id: model.id }));
const models = registry.getAll().filter(
(model) =>
!shouldSuppressBuiltInModel({
provider: model.provider,
id: model.id,
baseUrl: model.baseUrl,
config: cfg,
}),
);
let availableKeys: Set<string> | undefined;
let availabilityErrorMessage: string | undefined;
try {
const availableModels = loadAvailableModels(registry);
const availableModels = loadAvailableModels(registry, cfg);
availableKeys = new Set(availableModels.map((model) => modelKey(model.provider, model.id)));
} catch (err) {
if (!shouldFallbackToAuthHeuristics(err)) {

View File

@@ -80,7 +80,14 @@ export function appendDiscoveredRows(params: {
});
for (const model of sorted) {
if (shouldSuppressBuiltInModel({ provider: model.provider, id: model.id })) {
if (
shouldSuppressBuiltInModel({
provider: model.provider,
id: model.id,
baseUrl: model.baseUrl,
config: params.context.cfg,
})
) {
continue;
}
if (!matchesRowFilter(params.context.filter, model)) {
@@ -127,6 +134,16 @@ export async function appendCatalogSupplementRows(params: {
if (!model || !matchesRowFilter(params.context.filter, model)) {
continue;
}
if (
shouldSuppressBuiltInModel({
provider: model.provider,
id: model.id,
baseUrl: model.baseUrl,
config: params.context.cfg,
})
) {
continue;
}
params.rows.push(
buildRow({
model,
@@ -164,6 +181,17 @@ export function appendConfiguredRows(params: {
if (params.context.filter.local && !model) {
continue;
}
if (
model &&
shouldSuppressBuiltInModel({
provider: model.provider,
id: model.id,
baseUrl: model.baseUrl,
config: params.context.cfg,
})
) {
continue;
}
params.rows.push(
toModelRow({
model,

View File

@@ -8,6 +8,7 @@ const clearDeviceAuthTokenMock = vi.hoisted(() => vi.fn());
const loadDeviceAuthTokenMock = vi.hoisted(() => vi.fn());
const storeDeviceAuthTokenMock = vi.hoisted(() => vi.fn());
const logDebugMock = vi.hoisted(() => vi.fn());
const logErrorMock = vi.hoisted(() => vi.fn());
type WsEvent = "open" | "message" | "close" | "error";
type WsEventHandlers = {
@@ -117,6 +118,7 @@ vi.mock("../logger.js", async () => {
return {
...actual,
logDebug: (...args: unknown[]) => logDebugMock(...args),
logError: (...args: unknown[]) => logErrorMock(...args),
};
});
@@ -419,6 +421,8 @@ describe("GatewayClient connect auth payload", () => {
wsInstances.length = 0;
loadDeviceAuthTokenMock.mockReset();
storeDeviceAuthTokenMock.mockReset();
logDebugMock.mockClear();
logErrorMock.mockClear();
});
type ParsedConnectRequest = {
@@ -580,6 +584,32 @@ describe("GatewayClient connect auth payload", () => {
client.stop();
});
it("logs stopped connect handshakes at debug level during teardown", async () => {
const onConnectError = vi.fn();
const client = new GatewayClient({
url: "ws://127.0.0.1:18789",
token: "shared-token",
onConnectError,
});
const { ws } = startClientAndConnect({ client });
ws.autoCloseOnClose = false;
client.stop();
await vi.waitFor(() =>
expect(onConnectError).toHaveBeenCalledWith(
expect.objectContaining({ message: "gateway client stopped" }),
),
);
expect(logDebugMock).toHaveBeenCalledWith(
"gateway connect failed: Error: gateway client stopped",
);
expect(logErrorMock).not.toHaveBeenCalledWith(
"gateway connect failed: Error: gateway client stopped",
);
expect(ws.closeCalls).toBe(1);
});
it("uses explicit shared password and does not inject stored device token", () => {
loadDeviceAuthTokenMock.mockReturnValue({ token: "stored-device-token" });
const client = new GatewayClient({

View File

@@ -152,6 +152,11 @@ function readConnectChallengeTimeoutOverride(
return undefined;
}
function isGatewayClientStoppedError(err: unknown): boolean {
const message = err instanceof Error ? err.message : String(err);
return message === "gateway client stopped" || message === "Error: gateway client stopped";
}
export function resolveGatewayClientConnectChallengeTimeoutMs(
opts: Pick<GatewayClientOptions, "connectChallengeTimeoutMs" | "connectDelayMs">,
): number {
@@ -551,7 +556,7 @@ export class GatewayClient {
}
this.opts.onConnectError?.(err instanceof Error ? err : new Error(String(err)));
const msg = `gateway connect failed: ${String(err)}`;
if (this.opts.mode === GATEWAY_CLIENT_MODES.PROBE) {
if (this.opts.mode === GATEWAY_CLIENT_MODES.PROBE || isGatewayClientStoppedError(err)) {
logDebug(msg);
} else {
logError(msg);

View File

@@ -874,6 +874,7 @@ export type ProviderBuildUnknownModelHintContext = {
env: NodeJS.ProcessEnv;
provider: string;
modelId: string;
baseUrl?: string;
};
/**
@@ -890,6 +891,7 @@ export type ProviderBuiltInModelSuppressionContext = {
env: NodeJS.ProcessEnv;
provider: string;
modelId: string;
baseUrl?: string;
};
export type ProviderBuiltInModelSuppressionResult = {