diff --git a/infra/monitoring.ts b/infra/monitoring.ts
index c08d39f262..edc70fecd1 100644
--- a/infra/monitoring.ts
+++ b/infra/monitoring.ts
@@ -111,6 +111,46 @@ const providerHttpErrorsQuery = (product: "go" | "zen") => {
   }).json
 }
 
+/**
+ * Builds the Honeycomb query JSON used by the "Low Model TPS" triggers.
+ *
+ * Keeps `completions` events whose user agent contains "opencode", whose status
+ * is >= 200 and < 400, and that carry a `tps.output` value, broken down by
+ * `model`. With `timeRange: 900` it computes the event count (TOTAL) and the
+ * P50 of `tps.output` (TPS). The LOW_TPS formula yields TPS only when TOTAL is
+ * at least 100 and 999 otherwise; 999 is a sentinel well above the `< 20`
+ * threshold of the Low Model TPS triggers, so low-traffic models should not alert.
+ *
+ * @param product - "go" keeps events with `isGoTier` = "true"; "zen" keeps "false".
+ * @returns The `json` output of `honeycomb.getQuerySpecificationOutput`.
+ */
+const modelLowTpsQuery = (product: "go" | "zen") => {
+  const filters = [
+    { column: "model", op: "exists" },
+    { column: "event_type", op: "=", value: "completions" },
+    { column: "user_agent", op: "contains", value: "opencode" },
+    { column: "isGoTier", op: "=", value: product === "go" ? "true" : "false" },
+    { column: "status", op: ">=", value: "200" },
+    { column: "status", op: "<", value: "400" },
+    { column: "tps.output", op: "exists" },
+  ]
+
+  return honeycomb.getQuerySpecificationOutput({
+    breakdowns: ["model"],
+    calculations: [
+      { op: "COUNT", name: "TOTAL", filterCombination: "AND", filters },
+      {
+        op: "P50",
+        name: "TPS",
+        column: "tps.output",
+        filterCombination: "AND",
+        filters,
+      },
+    ],
+    formulas: [{ name: "LOW_TPS", expression: "IF(GTE($TOTAL, 100), $TPS, 999)" }],
+    timeRange: 900,
+  }).json
+}
+
 new honeycomb.Trigger("IncreasedModelHttpErrorsGo", {
   name: "Increased Model HTTP Errors [Go]",
   description,
@@ -149,6 +189,48 @@ new honeycomb.Trigger("IncreasedModelHttpErrorsZen", {
   ],
 })
 
+// Low-TPS triggers are created disabled (`disabled: true`) and pass the `type`
+// variable "model_low_tps" to the webhook recipient.
+new honeycomb.Trigger("LowModelTpsGo", {
+  disabled: true,
+  name: "Low Model TPS [Go]",
+  description,
+  queryJson: modelLowTpsQuery("go"),
+  alertType: "on_change",
+  frequency: 300,
+  thresholds: [{ op: "<", value: 20, exceededLimit: 1 }],
+  recipients: [
+    {
+      id: webhookRecipient.id,
+      notificationDetails: [
+        {
+          variables: [{ name: "type", value: "model_low_tps" }],
+        },
+      ],
+    },
+  ],
+})
+
+new honeycomb.Trigger("LowModelTpsZen", {
+  disabled: true,
+  name: "Low Model TPS [Zen]",
+  description,
+  queryJson: modelLowTpsQuery("zen"),
+  alertType: "on_change",
+  frequency: 300,
+  thresholds: [{ op: "<", value: 20, exceededLimit: 1 }],
+  recipients: [
+    {
+      id: webhookRecipient.id,
+      notificationDetails: [
+        {
+          variables: [{ name: "type", value: "model_low_tps" }],
+        },
+      ],
+    },
+  ],
+})
+
 new honeycomb.Trigger("IncreasedProviderHttpErrorsGo", {
   name: "Increased Provider HTTP Errors [Go]",
   description,
diff --git a/packages/console/app/src/routes/honeycomb/webhook.ts b/packages/console/app/src/routes/honeycomb/webhook.ts
index 367a93aeb0..ae76d86fe2 100644
--- a/packages/console/app/src/routes/honeycomb/webhook.ts
+++ b/packages/console/app/src/routes/honeycomb/webhook.ts
@@ -12,13 +12,21 @@ const basePayload = z.object({
   url: z.string(),
 })
 
-const groups = z.object({ group: z.object({ key: z.string(), value: z.string() }).array() }).array()
+// `result` is optional (nullish) and may be a number or a string; `group` is an
+// array of key/value string pairs.
+const groups = z
+  .object({ result: z.union([z.number(), z.string()]).nullish(), group: z.object({ key: z.string(), value: z.string() }).array() })
+  .array()
 
 const honeycombWebhookPayload = z.discriminatedUnion("type", [
   basePayload.extend({
     type: z.literal("model_http_errors"),
     groups,
   }),
+  basePayload.extend({
+    type: z.literal("model_low_tps"),
+    groups,
+  }),
   basePayload.extend({
     type: z.literal("provider_http_errors"),
     groups,
@@ -29,14 +37,28 @@ const honeycombWebhookPayload = z.discriminatedUnion("type", [
 ])
 
 const postDiscordMessage = async (payload: z.infer<typeof honeycombWebhookPayload>) => {
-  const group =
-    payload.type === "model_http_errors" ? "model" : payload.type === "provider_http_errors" ? "provider" : undefined
-  const names = payload.type === "custom" ? [] : payload.groups.flatMap((item) => item.group.map((g) => g.value))
+  // One "- <name>" bullet per group value. When the payload carries a usable
+  // numeric `result`, it is appended as TPS (model_low_tps) or as an error
+  // percentage (every other grouped alert type).
+  const names =
+    payload.type === "custom"
+      ? []
+      : payload.groups.flatMap((item) => {
+          // Number("") and Number("  ") are 0, so blank strings are treated as
+          // "no result" instead of being reported as a real 0.
+          const blank = typeof item.result === "string" && item.result.trim() === ""
+          const result = item.result == null || blank ? Number.NaN : Number(item.result)
+          const suffix = !Number.isFinite(result)
+            ? ""
+            : payload.type === "model_low_tps"
+              ? ` (${Math.round(result)} TPS)`
+              : ` (${Math.round(result * 100)}% errors)`
+          return item.group.map((g) => `- ${g.value}${suffix}`)
+        })
 
   const content = [
     `[**${payload.isTest ? "[TEST] " : ""}${payload.name ?? "Honeycomb alert"}**](${payload.url})`,
-    group && names.length > 0 ? `Affected ${group}s:` : undefined,
-    ...names.map((name) => `- ${name}`),
+    ...names,
     "",
     `<@&${DISCORD_ALERT_ROLE_ID}>`,
   ]