diff --git a/deployments/local-scripts/grafana-github-bridge/README.md b/deployments/local-scripts/grafana-github-bridge/README.md new file mode 100644 index 0000000000..83f1f57840 --- /dev/null +++ b/deployments/local-scripts/grafana-github-bridge/README.md @@ -0,0 +1,104 @@ +# grafana-github-bridge + +Cloudflare Worker that converts Grafana Alertmanager webhook POSTs into GitHub issues, triggering a Claude Code agent to diagnose and act on bridge alerts. + +## Flow + +```mermaid +flowchart TD + P["Prometheus :9615"] --> G["Grafana Alert Rules"] + G --> AM["Alertmanager"] + AM -->|webhook| W["Cloudflare Worker"] + AM -->|webhook| M["Matrix"] + + W -->|"warning + known"| T["Issue: label claude"] + W -->|"critical / unknown"| E["Issue: label claude-escalate"] + + T -->|Haiku| H["Fast Triage"] + E -->|Sonnet| S["Deep Investigation"] + + H --> V["Engineer Reviews"] + S --> V +``` + +## Tiered model strategy + +The worker routes alerts to different Claude models based on severity and category to optimize API costs: + +| Condition | Label | Model | Cost | +|-----------|-------|-------|------| +| `severity=critical` | `claude-escalate` | Sonnet | ~$0.01/alert | +| `category=other` (unknown) | `claude-escalate` | Sonnet | ~$0.01/alert | +| Everything else | `claude` | Haiku | ~$0.001/alert | + +Haiku handles ~90% of alerts (known categories, warning severity) at 1/10th the cost. Sonnet only runs for critical alerts or unrecognized patterns that need deeper investigation. 
+ +## Alert categories + +| Category | Metric Pattern | Suggested Action | +|----------|---------------|------------------| +| `relay-down` | `up{container="bridges-common-relay"}` | Check relay pod status and restart | +| `version-guard` | Loki: `"Aborting"` in relay logs | Redeploy relay with new runtime | +| `headers-mismatch` | `*_is_source_and_source_at_target_using_different_forks` | Re-sync headers from canonical fork | +| `finality-lag` | `*_Sync_best_source_at_target_block_number` | Check relay logs and source chain finality | +| `delivery-lag` | `*_MessageLane_*_lane_state_nonces` (generated > received) | Check message relay process | +| `confirmation-lag` | `*_lane_state_nonces` (received vs confirmed) | Check confirmation relay | +| `reward-lag` | `*_lane_state_nonces` (confirmed src vs confirmed tgt) | Check reward mechanism | +| `low-balance` | `at_*_relay_*Messages_balance` | Top up relay account | + +## Grafana configuration + +### Contact point + +```yaml +- orgId: 1 + name: GitHub parity-bridges-common + receivers: + - uid: github_parity_bridges_common + type: webhook + settings: + url: https://grafana-github-bridge.parity-bridges.workers.dev + disableResolveMessage: false +``` + +### Notification policy + +Route bridge alerts to GitHub **and** continue to Matrix: + +```yaml +- receiver: GitHub parity-bridges-common + matchers: + - alertname =~ ".*Bridge.*|.*bridge.*|.*headers mismatch" + continue: true +``` + +`continue: true` ensures the alert also falls through to the default receiver (Matrix). + +## Deploy + +```bash +cd deployments/local-scripts/grafana-github-bridge +npm install +npx wrangler secret put GITHUB_TOKEN # PAT with issues:write scope +npx wrangler secret put WEBHOOK_SECRET # optional, shared secret +npx wrangler deploy +``` + +Deployed at `https://grafana-github-bridge.parity-bridges.workers.dev`. 
+ +## Test + +```bash +# Local +npx wrangler dev +WORKER_URL=http://localhost:8787 node test.js + +# Production (NOT a dry run — creates real GitHub issues, one per firing alert) +WORKER_URL=https://grafana-github-bridge.parity-bridges.workers.dev node test.js +``` + +## Monitor + +- **Worker metrics**: Cloudflare dashboard → Workers → grafana-github-bridge +- **Logs**: `npx wrangler tail` +- **GitHub side**: search `label:alert label:claude` or `label:claude-escalate` in the repo issues diff --git a/deployments/local-scripts/grafana-github-bridge/package.json b/deployments/local-scripts/grafana-github-bridge/package.json new file mode 100644 index 0000000000..81190ae4b4 --- /dev/null +++ b/deployments/local-scripts/grafana-github-bridge/package.json @@ -0,0 +1,12 @@ +{ + "name": "grafana-github-bridge", + "private": true, + "scripts": { + "dev": "wrangler dev", + "deploy": "wrangler deploy", + "test": "node test.js" + }, + "devDependencies": { + "wrangler": "^3" + } +} diff --git a/deployments/local-scripts/grafana-github-bridge/src/index.js b/deployments/local-scripts/grafana-github-bridge/src/index.js new file mode 100644 index 0000000000..645357f56f --- /dev/null +++ b/deployments/local-scripts/grafana-github-bridge/src/index.js @@ -0,0 +1,279 @@ +/** + * Grafana → GitHub Issue bridge. + * + * Receives Grafana Alertmanager webhook POSTs and creates GitHub issues + * with the "alert" label, categorised by bridge alert type. 
+ * + * Environment variables (set as Worker secrets): + * GITHUB_TOKEN – GitHub PAT with `issues:write` scope + * WEBHOOK_SECRET – (optional) shared secret for request validation + */ + +const REPO = 'paritytech/parity-bridges-common'; + +// --------------------------------------------------------------------------- +// Alert classification +// --------------------------------------------------------------------------- + +const ALERT_CATEGORIES = [ + { + id: 'relay-down', + label: 'relay-down', + match: (t) => /node is down/i.test(t), + emoji: '🔴', + action: 'Check relay pod status and restart if needed.', + }, + { + id: 'version-guard', + label: 'version-guard', + match: (t) => /version guard|abort/i.test(t), + emoji: '⛔', + action: + 'A chain was upgraded — redeploy the relay with the new runtime.', + }, + { + id: 'headers-mismatch', + label: 'headers-mismatch', + match: (t) => /headers? mismatch|different.?forks/i.test(t), + emoji: '🔀', + action: + 'Source chain forked — the relay may need to re-sync headers from the canonical fork.', + }, + { + id: 'finality-lag', + label: 'finality-lag', + match: (t) => /finality.*lag|sync.*lag/i.test(t), + emoji: '⏳', + action: + 'Finality headers are not advancing — check relay logs and source chain finality.', + }, + { + id: 'delivery-lag', + label: 'delivery-lag', + match: (t) => /delivery.*lag/i.test(t), + emoji: '📦', + action: + 'Messages generated but not delivered — check message relay process.', + }, + { + id: 'confirmation-lag', + label: 'confirmation-lag', + match: (t) => /confirmation.*lag/i.test(t), + emoji: '✅', + action: + 'Messages delivered but not confirmed back to source — check confirmation relay.', + }, + { + id: 'reward-lag', + label: 'reward-lag', + match: (t) => /reward.*lag/i.test(t), + emoji: '💰', + action: + 'Confirmations not being rewarded — check reward mechanism and relay balance.', + }, + { + id: 'low-balance', + label: 'low-balance', + match: (t) => /balance/i.test(t), + emoji: '💸', + action: 
'Relay account balance is low — top up the account.', + }, +]; + +function classify(alertname) { + for (const cat of ALERT_CATEGORIES) { + if (cat.match(alertname)) return cat; + } + return { + id: 'other', + label: 'bridge-alert', + emoji: '⚠️', + action: null, + }; +} + +// Extract environment (prod vs testnet) from labels or title +function detectEnv(alert) { + const domain = alert.labels?.domain || ''; + const title = alert.labels?.alertname || ''; + if (domain === 'parity-testnet' || /rococo|westend/i.test(title)) + return 'testnet'; + if (domain === 'parity-chains' || /polkadot|kusama/i.test(title)) + return 'production'; + return 'unknown'; +} + +// Extract the bridge pair from the alert title, e.g. "Polkadot <> Kusama" +function detectBridgePair(alert) { + const title = alert.labels?.alertname || ''; + // Match patterns like "Polkadot -> KusamaBridgeHub" or "KusamaBridgeHub <> PolkadotBridgeHub" + const m = title.match( + /(\w+?)(?:BridgeHub)?\s*(?:->|<>|to)\s*(\w+?)(?:BridgeHub)?[\s_]/i, + ); + if (m) return `${m[1]} ↔ ${m[2]}`; + return alert.labels?.bridge || null; +} + +// --------------------------------------------------------------------------- +// Issue formatting +// --------------------------------------------------------------------------- + +function formatTitle(alert, category) { + const alertname = alert.labels?.alertname || 'Unknown alert'; + return `${category.emoji} [Alert] ${alertname}`; +} + +function formatBody(alert, payload, category, env, bridgePair) { + const labels = alert.labels || {}; + const annotations = alert.annotations || {}; + const values = alert.values || {}; + + const lines = [ + `## ${category.emoji} ${labels.alertname || 'Alert'}`, + '', + `| Field | Value |`, + `|-------|-------|`, + `| **Status** | \`${alert.status}\` |`, + `| **Severity** | \`${labels.severity || 'unknown'}\` |`, + `| **Category** | \`${category.id}\` |`, + `| **Environment** | \`${env}\` |`, + bridgePair ? 
`| **Bridge** | \`${bridgePair}\` |` : null, + `| **Started** | ${alert.startsAt || 'N/A'} |`, + '', + ]; + + if (annotations.summary) { + lines.push(`### Summary`, '', annotations.summary, ''); + } + if (annotations.description) { + lines.push(`### Description`, '', annotations.description, ''); + } + + if (category.action) { + lines.push(`### Suggested Action`, '', `> ${category.action}`, ''); + } + + if (Object.keys(values).length > 0) { + lines.push('### Metric Values', ''); + for (const [key, val] of Object.entries(values)) { + lines.push(`- **${key}:** \`${val}\``); + } + lines.push(''); + } + + // Links + const linkLines = []; + if (alert.generatorURL) linkLines.push(`- [Alert rule](${alert.generatorURL})`); + if (payload.externalURL) linkLines.push(`- [Grafana](${payload.externalURL})`); + if (annotations.__dashboardUid__) { + const base = payload.externalURL || 'https://grafana.teleport.parity.io'; + const dashUrl = `${base}/d/${annotations.__dashboardUid__}`; + linkLines.push(`- [Dashboard](${dashUrl})`); + } + if (linkLines.length) { + lines.push('### Links', '', ...linkLines, ''); + } + + // All labels + lines.push( + '<details><summary>All labels</summary>', + '', + '```json', + JSON.stringify(labels, null, 2), + '```', + '', + '</details>', + '', + '<details><summary>Raw alert payload</summary>', + '', + '```json', + JSON.stringify(alert, null, 2), + '```', + '', + '</details>', + ); + + return lines.filter((l) => l !== null).join('\n'); +} + +// --------------------------------------------------------------------------- +// Worker +// --------------------------------------------------------------------------- + +export default { + async fetch(request, env) { + if (request.method !== 'POST') { + return new Response('Method not allowed', { status: 405 }); + } + + if (env.WEBHOOK_SECRET) { + const auth = request.headers.get('Authorization'); + if (auth !== `Bearer ${env.WEBHOOK_SECRET}`) { + return new Response('Unauthorized', { status: 401 }); + } + } + + let payload; + try { + payload = await request.json(); + } catch { + return new Response('Invalid JSON', { status: 400 }); + } + + const alerts = payload.alerts || []; + const results = []; + + for (const alert of alerts) { + if (alert.status !== 'firing') continue; + + const alertname = alert.labels?.alertname || 'Unknown alert'; + const category = classify(alertname); + const env_name = detectEnv(alert); + const bridgePair = detectBridgePair(alert); + + const title = formatTitle(alert, category); + const body = formatBody(alert, payload, category, env_name, bridgePair); + + const severity = alert.labels?.severity || 'warning'; + const ghLabels = ['alert', category.label]; + if (env_name === 'testnet') ghLabels.push('testnet'); + if (env_name === 'production') ghLabels.push('production'); + + // Tiered model: critical/unknown → Sonnet (escalate), others → Haiku (triage) + if (severity === 'critical' || category.id === 'other') { + ghLabels.push('claude-escalate'); + } else { + ghLabels.push('claude'); + } + + const resp = await fetch( + `https://api.github.com/repos/${REPO}/issues`, + { + method: 'POST', + headers: { + Authorization: `Bearer ${env.GITHUB_TOKEN}`, + Accept: 'application/vnd.github+json', + 'User-Agent': 'grafana-github-bridge', + }, + body: JSON.stringify({ + title, + body, + labels: ghLabels, + assignees: [], + }), + }, + ); + + results.push({ + alertname, + category: 
category.id, + env: env_name, + status: resp.status, + issue: + resp.status === 201 ? (await resp.json()).html_url : null, + }); + } + + return Response.json({ processed: results.length, results }); + }, +}; diff --git a/deployments/local-scripts/grafana-github-bridge/test.js b/deployments/local-scripts/grafana-github-bridge/test.js new file mode 100644 index 0000000000..d743234954 --- /dev/null +++ b/deployments/local-scripts/grafana-github-bridge/test.js @@ -0,0 +1,117 @@ +/** + * Smoke test — sends fake Grafana alert payloads to the worker. + * + * Usage: + * WORKER_URL=http://localhost:8787 node test.js + * WORKER_URL=https://grafana-github-bridge.parity-bridges.workers.dev node test.js + */ + +const url = process.env.WORKER_URL || 'http://localhost:8787'; +const secret = process.env.WEBHOOK_SECRET; + +const payload = { + receiver: 'github-parity-bridges-common', + status: 'firing', + alerts: [ + { + status: 'firing', + labels: { + alertname: + 'Polkadot -> KusamaBridgeHub finality sync lags (00000001)', + severity: 'critical', + domain: 'parity-chains', + }, + annotations: { + summary: + 'Less than 5000 Polkadot headers (~1/2 era) have been synced to KusamaBridgeHub in last 25 hours. 
Relay is not running?', + __dashboardUid__: 'zqjpkXxnk', + __panelId__: '2', + }, + values: { + A: '312', + C: '312', + }, + startsAt: new Date().toISOString(), + generatorURL: + 'https://grafana.teleport.parity.io/alerting/list', + }, + { + status: 'firing', + labels: { + alertname: + 'KusamaBridgeHub <> PolkadotBridgeHub relay (00000001) node is down', + severity: 'critical', + domain: 'parity-chains', + container: 'bridges-common-relay', + }, + annotations: { + summary: + 'KusamaBridgeHub <> PolkadotBridgeHub relay (00000001) node is down', + __dashboardUid__: 'UFsgpJtVz', + __panelId__: '16', + }, + values: { A: '0' }, + startsAt: new Date().toISOString(), + generatorURL: + 'https://grafana.teleport.parity.io/alerting/list', + }, + { + status: 'firing', + labels: { + alertname: 'Relay balances at PolkadotBridgeHub', + severity: 'warning', + domain: 'parity-chains', + }, + annotations: { + summary: + 'Relay balance at PolkadotBridgeHub is getting low', + }, + values: { A: '1.23' }, + startsAt: new Date().toISOString(), + }, + { + status: 'firing', + labels: { + alertname: + 'Version guard has aborted RococoBridgeHub <> WestendBridgeHub relay (00000002)', + severity: 'critical', + domain: 'parity-testnet', + }, + annotations: { + summary: + 'The RococoBridgeHub <> WestendBridgeHub relay (00000002) has been aborted by version guard', + }, + startsAt: new Date().toISOString(), + }, + { + status: 'resolved', + labels: { + alertname: 'Should be skipped (resolved)', + }, + }, + ], + externalURL: 'https://grafana.teleport.parity.io', +}; + +const headers = { 'Content-Type': 'application/json' }; +if (secret) headers['Authorization'] = `Bearer ${secret}`; + +fetch(url, { + method: 'POST', + headers, + body: JSON.stringify(payload), +}) + .then(async (r) => { + console.log(`Status: ${r.status}`); + const data = await r.json(); + console.log(JSON.stringify(data, null, 2)); + console.log( + `\nProcessed ${data.processed} alerts (1 resolved skipped)`, + ); + for (const r of 
data.results) { + console.log( + ` [${r.category}] ${r.env} — ${r.alertname} → ${r.issue || `HTTP ${r.status}`}`, + ); + } + }) + .catch(console.error); diff --git a/deployments/local-scripts/grafana-github-bridge/wrangler.toml b/deployments/local-scripts/grafana-github-bridge/wrangler.toml new file mode 100644 index 0000000000..dca82bb218 --- /dev/null +++ b/deployments/local-scripts/grafana-github-bridge/wrangler.toml @@ -0,0 +1,7 @@ +name = "grafana-github-bridge" +main = "src/index.js" +compatibility_date = "2024-01-01" + +# Secrets (set via `wrangler secret put`): +# GITHUB_TOKEN – GitHub PAT with issues:write scope +# WEBHOOK_SECRET – optional shared secret for Grafana Authorization header