From 2759a7092e173ac80df6314c4f262f692c343dae Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 7 May 2026 18:26:24 +0400 Subject: [PATCH 1/4] init --- docs/configuration.md | 58 +++- examples/example-config.yaml | 29 ++ src/go.mod | 7 + src/go.sum | 18 ++ src/lib/config/config.go | 4 + src/lib/config/trayTypeConfig.go | 17 + src/lib/trays/providers/nomadProvider.go | 292 ++++++++++++++++++ .../trays/providers/trayProviderFactory.go | 4 + 8 files changed, 428 insertions(+), 1 deletion(-) create mode 100644 src/lib/trays/providers/nomadProvider.go diff --git a/docs/configuration.md b/docs/configuration.md index 04cacbc..8752e69 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -32,6 +32,12 @@ providers: project: my-gcp-project credentialsFile: my-gcp-creds.json + - name: nomad-scw + type: nomad + address: https://nomad.internal:4646 + token: + namespace: runners + trayTypes: - name: cattery-docker-local provider: docker-local @@ -59,6 +65,17 @@ trayTypes: - europe-west1-d machineType: e2-standard-4 instanceTemplate: global/instanceTemplates/cattery-default + + - name: cattery-nomad + provider: nomad-scw + githubOrg: my-org + runnerGroupId: 3 + maxTrays: 5 + shutdown: true + config: + jobId: scw-cattery-runner-tray + script: | + echo "extra setup for $TRAY_NAME" ``` ### Config sections @@ -98,7 +115,7 @@ Common fields for all providers: | Key | Type | Required | Description | |------|--------|----------|-----------------------------------------------------| | name | string | yes | Provider name to reference from trayTypes. | -| type | enum | yes | Provider type. Currently implemented: docker, google (GCE). | +| type | enum | yes | Provider type. Currently implemented: docker, google (GCE), nomad. | Provider-specific fields: @@ -113,6 +130,19 @@ Provider-specific fields: | project | string | yes | GCP project ID | | credentialsFile | string | no | Path to GCP service account JSON credentials. If omitted, uses Application Default Credentials. 
| +- nomad + + Cattery dispatches each tray as a child of a **parameterized parent job** that must already be registered in your Nomad cluster. The provider supplies `tray_name`, `bootstrap_token` and `cattery_url` as dispatch meta plus a generated bash payload that downloads and execs the cattery agent. Resources, driver and constraints come from the parent job spec — Nomad does not allow overriding them at dispatch time, so use distinct parameterized jobs for distinct resource shapes. + + | Key | Type | Required | Description | + |-----------|--------|----------|---------------------------------------------------------------------------------------------------| + | address | string | yes | Nomad agent HTTP(S) address, e.g. `https://nomad.internal:4646`. | + | token | string | no | Nomad ACL token. Should be scoped to `dispatch-job` on the parameterized parent job(s). | + | namespace | string | no | Nomad namespace to dispatch into. Defaults to `default`. | + | region | string | no | Nomad region. Defaults to the agent's region. | + | tlsCaFile | string | no | Path to a PEM CA bundle for verifying the Nomad agent's TLS certificate. | + | insecure | bool | no | Skip TLS verification. Dev-only. | + #### trayTypes Defines one or more tray "profiles" that the Tray Manager can maintain. @@ -147,6 +177,32 @@ Provider-specific config under trayType.config: | instanceTemplate | string | yes | Template to base instances on (e.g. `global/instanceTemplates/cattery-default`) | | namePrefix | string | no | Prefix for VM names | +- nomad config + + | Key | Type | Required | Description | + |--------|--------|----------|------------------------------------------------------------------------------------------------------| + | jobId | string | yes | ID of a parameterized parent job already registered in Nomad. Cattery dispatches one child per tray. | + | script | string | no | Inline bash, executed after the agent binary is downloaded and before the agent is exec'd. 
Use YAML's `\|` block scalar for multi-line. | + + **Parent-job contract.** The parameterized parent job must declare: + + ```hcl + parameterized { + payload = "required" + meta_required = ["tray_name", "bootstrap_token", "cattery_url"] + } + ``` + + Inside the alloc, the dispatched payload is the bash script the provider generated. Before exec'ing it, your parent job is responsible for exporting the meta values as `TRAY_NAME`, `BOOTSTRAP_TOKEN` and `CATTERY_URL` (e.g. via cloud-init writing `/etc/cattery/bootstrap.env` and sourcing it). The default payload assumes those env vars are present. + + **Lifecycle.** + + - Cattery dispatches the parent job, stores `dispatchedJobId` + `evalId` in the tray's provider data. + - Cattery blocks until the dispatch evaluation leaves `pending`. `complete` → success; `blocked` → returned as `ErrCapacityBlocked` (Nomad has no capacity for this alloc); `failed`/`canceled` → error. + - On tray cleanup, the dispatched child job is deregistered with `purge=true`. + + **Resource shapes.** Resources, driver, constraints and reschedule policy are baked into the parent job spec — they cannot be set per-dispatch. To run trays at different sizes, register multiple parameterized parent jobs and reference them by `jobId` from different trayTypes. + Notes: - Ensure runnerGroupId corresponds to an existing Runner Group in your GitHub org and that your GitHub App has permission to register runners. diff --git a/examples/example-config.yaml b/examples/example-config.yaml index a057ba9..05f7ebc 100644 --- a/examples/example-config.yaml +++ b/examples/example-config.yaml @@ -42,6 +42,16 @@ providers: project: my-gcp-project credentialsFile: path/to/credentials.json + - name: nomad-scw + type: nomad + address: https://nomad.internal:4646 + # ACL token scoped to dispatch-job on the parameterized parent job(s). 
+ token: + namespace: runners # optional + region: global # optional + tlsCaFile: path/to/nomad-ca.pem # optional + insecure: false # optional, skip TLS verification (dev only) + trayTypes: - name: cattery-tiny provider: docker-local @@ -68,3 +78,22 @@ trayTypes: provider: gce runnerGroupId: 3 # check in github org settings -> Runner groups shutdown: true + + - name: cattery-nomad + provider: nomad-scw + githubOrg: My-Github-Org + runnerGroupId: 3 + maxTrays: 5 + shutdown: true + config: + # ID of a parameterized parent job already registered in Nomad. Resources, + # driver and constraints come from that job spec — Nomad does not allow + # overriding them at dispatch time. Use distinct parent jobs for distinct + # resource shapes. + jobId: scw-cattery-runner-tray + # Optional inline bash, executed after the agent binary is downloaded + # and before the agent is exec'd. The provider's default bootstrap + # exports TRAY_NAME, BOOTSTRAP_TOKEN and CATTERY_URL from meta. + script: | + echo "extra setup for $TRAY_NAME" + # mkfs / mount scratch volume, install build tools, etc. 
diff --git a/src/go.mod b/src/go.mod index b921082..c0d8e14 100644 --- a/src/go.mod +++ b/src/go.mod @@ -10,6 +10,7 @@ require ( github.com/go-playground/validator/v10 v10.30.2 github.com/go-viper/mapstructure/v2 v2.5.0 github.com/google/go-github/v84 v84.0.0 + github.com/hashicorp/nomad/api v0.0.0-20260507064547-505b8f595ce4 github.com/prometheus/client_golang v1.23.2 github.com/sirupsen/logrus v1.9.4 github.com/spf13/cobra v1.10.2 @@ -39,11 +40,17 @@ require ( github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.15 // indirect github.com/googleapis/gax-go/v2 v2.22.0 // indirect + github.com/gorilla/websocket v1.5.3 // indirect + github.com/hashicorp/cronexpr v1.1.3 // indirect + github.com/hashicorp/errwrap v1.0.0 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect github.com/hashicorp/go-retryablehttp v0.7.8 // indirect + github.com/hashicorp/go-rootcerts v1.0.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.18.5 // indirect github.com/leodido/go-urn v1.4.0 // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.3.0 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect diff --git a/src/go.sum b/src/go.sum index dc804cc..414ecfe 100644 --- a/src/go.sum +++ b/src/go.sum @@ -19,6 +19,8 @@ github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/go-units v0.5.0 
h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -63,12 +65,24 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.15 h1:xolVQTEXusUcAA5Ugt github.com/googleapis/enterprise-certificate-proxy v0.3.15/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg= github.com/googleapis/gax-go/v2 v2.22.0 h1:PjIWBpgGIVKGoCXuiCoP64altEJCj3/Ei+kSU5vlZD4= github.com/googleapis/gax-go/v2 v2.22.0/go.mod h1:irWBbALSr0Sk3qlqb9SyJ1h68WjgeFuiOzI4Rqw5+aY= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/hashicorp/cronexpr v1.1.3 h1:rl5IkxXN2m681EfivTlccqIryzYJSXRGRNa0xeG7NA4= +github.com/hashicorp/cronexpr v1.1.3/go.mod h1:P4wA0KBl9C5q2hABiMO7cp6jcIg96CDh1Efb3g1PWA4= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48= github.com/hashicorp/go-retryablehttp 
v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw= +github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= +github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= +github.com/hashicorp/nomad/api v0.0.0-20260507064547-505b8f595ce4 h1:jRgobXGG/+ZsFRz8Iy0xB4OE7qBSw/8xR2kPF4AJz5s= +github.com/hashicorp/nomad/api v0.0.0-20260507064547-505b8f595ce4/go.mod h1:KkLNLU0Nyfh5jWsFoF/PsmMbKpRIAoIV4lmQoJWgKCk= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= @@ -85,6 +99,8 @@ github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxec github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/pelletier/go-toml/v2 v2.3.0 h1:k59bC/lIZREW0/iVaQR8nDHxVq8OVlIzYCOJf421CaM= @@ -104,6 +120,8 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.12.0 h1:/NQhBAkUb4+fH1jivKHWusDYFjMOOKU88eegjfxfHb4= github.com/sagikazarmark/locafero v0.12.0/go.mod 
h1:sZh36u/YSZ918v0Io+U9ogLYQJ9tLLBmM4eneO6WwsI= +github.com/shoenig/test v1.12.2 h1:ZVT8NeIUwGWpZcKaepPmFMoNQ3sVpxvqUh/MAqwFiJI= +github.com/shoenig/test v1.12.2/go.mod h1:UxJ6u/x2v/TNs/LoLxBNJRV9DiwBBKYxXSyczsBHFoI= github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= diff --git a/src/lib/config/config.go b/src/lib/config/config.go index fbdc4ed..408106d 100644 --- a/src/lib/config/config.go +++ b/src/lib/config/config.go @@ -126,6 +126,10 @@ func LoadConfig(configPath *string) (*CatteryConfig, error) { var dc DockerTrayConfig decodeError = mapstructure.Decode(trayType.Config, &dc) trayType.Config = dc + case "nomad": + var nc NomadTrayConfig + decodeError = mapstructure.Decode(trayType.Config, &nc) + trayType.Config = nc //case "scaleway": default: diff --git a/src/lib/config/trayTypeConfig.go b/src/lib/config/trayTypeConfig.go index af98604..a4c5c54 100644 --- a/src/lib/config/trayTypeConfig.go +++ b/src/lib/config/trayTypeConfig.go @@ -17,3 +17,20 @@ type DockerTrayConfig struct { Image string `yaml:"image"` NamePrefix string `yaml:"namePrefix"` } + +// NomadTrayConfig configures a Nomad-dispatched tray. +// +// JobId is the ID of a parameterized parent job already registered in Nomad. +// Resources, driver and constraints come from that job spec — Nomad does not +// allow overriding them at dispatch time. Use distinct parameterized jobs for +// distinct resource shapes. +// +// Script is an optional inline bash snippet inlined into the dispatched +// payload before the agent is exec'd. Use it for per-tray-type setup +// (mounting volumes, installing tools, etc.). Use YAML's `|` block scalar to +// embed multi-line scripts. 
+type NomadTrayConfig struct { + TrayConfig + JobId string `yaml:"jobId" validate:"required"` + Script string `yaml:"script"` +} diff --git a/src/lib/trays/providers/nomadProvider.go b/src/lib/trays/providers/nomadProvider.go new file mode 100644 index 0000000..eb6f579 --- /dev/null +++ b/src/lib/trays/providers/nomadProvider.go @@ -0,0 +1,292 @@ +package providers + +import ( + "cattery/lib/config" + "cattery/lib/trays" + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "strings" + + "github.com/hashicorp/nomad/api" + "github.com/sirupsen/logrus" +) + +// ErrCapacityBlocked indicates Nomad accepted the dispatch but cannot place +// the alloc due to insufficient capacity or unsatisfied constraints. The eval +// remains queued; a future fallback provider will use this sentinel to +// reroute to another provider. +var ErrCapacityBlocked = errors.New("nomad: dispatch blocked, no capacity") + +const ( + nomadProviderDataDispatchedJobID = "dispatchedJobId" + nomadProviderDataEvalID = "evalId" + nomadProviderDataNamespace = "namespace" +) + +// defaultBootstrapTemplate is the payload script the provider synthesizes per +// dispatch. The {{USER_SCRIPT}} marker is replaced with NomadTrayConfig.Script +// (empty string if not configured). The script assumes that meta values +// TRAY_NAME, BOOTSTRAP_TOKEN, CATTERY_URL are already exported in the +// environment (the parameterized parent job is responsible for sourcing them +// from /etc/cattery/bootstrap.env before exec'ing this script — see +// scw-cattery-runner-tray.nomad.hcl for the canonical setup). 
+const defaultBootstrapTemplate = `#!/bin/bash
+set -euo pipefail
+
+curl -fsSL "$CATTERY_URL/agent/binary" -o /usr/local/bin/cattery
+chmod +x /usr/local/bin/cattery
+
+{{USER_SCRIPT}}
+
+exec /usr/local/bin/cattery agent -i "$TRAY_NAME" -s "$CATTERY_URL"
+`
+
+type NomadProvider struct {
+	name           string
+	providerConfig config.ProviderConfig
+
+	// client is the Nomad API client created in NewNomadProvider.
+	client    *api.Client
+	// namespace mirrors the Namespace of the client config at construction
+	// time; it is passed explicitly on every request the provider makes.
+	namespace string
+
+	logger *logrus.Entry
+}
+
+// NewNomadProvider builds a NomadProvider from one entry of the providers
+// config section.
+//
+// It starts from api.DefaultConfig and overrides it with the non-empty values
+// of address (required), region, namespace, token, tlscafile and insecure read
+// through providerConfig.Get.
+//
+// It returns nil, after logging the reason, when address is empty or when
+// api.NewClient fails, so callers must nil-check the result.
+func NewNomadProvider(name string, providerConfig config.ProviderConfig) *NomadProvider {
+	logger := logrus.WithFields(logrus.Fields{
+		"name":         "NomadProvider",
+		"providerName": name,
+		"providerType": "nomad",
+	})
+
+	address := providerConfig.Get("address")
+	if address == "" {
+		logger.Error("nomad provider missing required 'address'")
+		return nil
+	}
+
+	cfg := api.DefaultConfig()
+	cfg.Address = address
+	if region := providerConfig.Get("region"); region != "" {
+		cfg.Region = region
+	}
+	if ns := providerConfig.Get("namespace"); ns != "" {
+		cfg.Namespace = ns
+	}
+	if token := providerConfig.Get("token"); token != "" {
+		cfg.SecretID = token
+	}
+	// NOTE(review): the key is looked up as lowercase "tlscafile" while the
+	// docs spell it tlsCaFile; presumably keys are normalized to lowercase
+	// when the config is loaded. Confirm against ProviderConfig.Get.
+	if caFile := providerConfig.Get("tlscafile"); caFile != "" {
+		if cfg.TLSConfig == nil {
+			cfg.TLSConfig = &api.TLSConfig{}
+		}
+		cfg.TLSConfig.CACert = caFile
+	}
+	// Only the literal "true" (any letter case) disables TLS verification;
+	// every other value leaves verification on.
+	if strings.EqualFold(providerConfig.Get("insecure"), "true") {
+		if cfg.TLSConfig == nil {
+			cfg.TLSConfig = &api.TLSConfig{}
+		}
+		cfg.TLSConfig.Insecure = true
+	}
+
+	client, err := api.NewClient(cfg)
+	if err != nil {
+		logger.Errorf("failed to create nomad client: %v", err)
+		return nil
+	}
+
+	return &NomadProvider{
+		name:           name,
+		providerConfig: providerConfig,
+		client:         client,
+		namespace:      cfg.Namespace,
+		logger:         logger,
+	}
+}
+
+// GetProviderName returns the provider name this instance was created with.
+func (n *NomadProvider) GetProviderName() string {
+	return n.name
+}
+
+// StartDeploy submits a parameterized-job dispatch to Nomad.
ProviderData is
+// populated incrementally: the namespace is recorded before the dispatch
+// request is sent, and dispatchedJobId and evalId are recorded once Nomad has
+// accepted it. WaitDeploy and CleanTray read those keys back later.
+//
+// The dispatch meta always carries tray_name, bootstrap_token and cattery_url.
+// Keys from the tray type's ExtraMetadata are added next to them, but an
+// ExtraMetadata key that collides with one of those three reserved keys is
+// skipped with a warning: the parent job uses them to bootstrap the agent, so
+// a tray-type override would start the agent under the wrong identity or URL.
+//
+// It returns an error when the tray config is not a NomadTrayConfig, when
+// jobId is empty, when the bootstrap token cannot be generated, or when the
+// dispatch request fails.
+func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error {
+	trayConfig, ok := tray.TrayConfig().(config.NomadTrayConfig)
+	if !ok {
+		return fmt.Errorf("unexpected tray config type for nomad provider, tray %s", tray.Id)
+	}
+	if trayConfig.JobId == "" {
+		return fmt.Errorf("nomad tray config missing jobId, tray %s", tray.Id)
+	}
+
+	bootstrapToken, err := generateBootstrapToken()
+	if err != nil {
+		return fmt.Errorf("failed to generate bootstrap token: %w", err)
+	}
+
+	payload := buildBootstrapPayload(trayConfig.Script)
+
+	meta := map[string]string{
+		"tray_name":       tray.Id,
+		"bootstrap_token": bootstrapToken,
+		"cattery_url":     config.Get().Server.AdvertiseUrl,
+	}
+	if tt := tray.TrayType(); tt != nil {
+		for k, v := range tt.ExtraMetadata {
+			// At this point meta holds only the reserved keys, so a hit means
+			// the tray type is trying to override one of them.
+			if _, reserved := meta[k]; reserved {
+				n.logger.Warnf("Ignoring extraMetadata key %q for tray %s: reserved for nomad dispatch meta", k, tray.Id)
+				continue
+			}
+			meta[k] = v
+		}
+	}
+
+	// Recorded before dispatching so cleanup targets the same namespace even
+	// if the provider is later reconfigured.
+	tray.ProviderData[nomadProviderDataNamespace] = n.namespace
+
+	resp, _, err := n.client.Jobs().Dispatch(
+		trayConfig.JobId,
+		meta,
+		payload,
+		"",
+		(&api.WriteOptions{Namespace: n.namespace}).WithContext(ctx),
+	)
+	if err != nil {
+		n.logger.Errorf("Failed to dispatch nomad job %s for tray %s: %v", trayConfig.JobId, tray.Id, err)
+		return err
+	}
+
+	tray.ProviderData[nomadProviderDataDispatchedJobID] = resp.DispatchedJobID
+	tray.ProviderData[nomadProviderDataEvalID] = resp.EvalID
+
+	n.logger.Infof("Dispatched nomad job %s for tray %s (dispatchedJobId=%s, evalId=%s)",
+		trayConfig.JobId, tray.Id, resp.DispatchedJobID, resp.EvalID)
+
+	return nil
+}
+
+// WaitDeploy blocks until the dispatch evaluation leaves the `pending` state.
+// Mapping:
+//
+//   - complete, everything placed: nil (Nomad scheduled the alloc; agent
+//     registration is the readiness signal from here)
+//   - complete with failed placements or a follow-up blocked eval:
+//     ErrCapacityBlocked
+//   - blocked:          ErrCapacityBlocked
+//   - failed/canceled:  plain error
+//   - ctx cancellation: ctx error (caller-imposed timeout)
+//
+// "complete" alone is not proof of placement: the scheduler marks an
+// evaluation complete once it has processed it, and when allocations could
+// not be placed it records the failures in FailedTGAllocs and references the
+// queued follow-up evaluation in BlockedEval. Both fields are therefore
+// checked before success is reported.
+//
+// The evaluation is looked up in the namespace StartDeploy recorded in
+// ProviderData, falling back to the provider's namespace when none is stored.
+// A tray without a stored eval id is treated as having nothing to wait for.
+func (n *NomadProvider) WaitDeploy(ctx context.Context, tray *trays.Tray) error {
+	evalID := tray.ProviderData[nomadProviderDataEvalID]
+	if evalID == "" {
+		n.logger.Tracef("No eval id stored for tray %s; skipping wait", tray.Id)
+		return nil
+	}
+
+	ns := tray.ProviderData[nomadProviderDataNamespace]
+	if ns == "" {
+		ns = n.namespace
+	}
+
+	var waitIndex uint64
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		q := (&api.QueryOptions{
+			Namespace: ns,
+			WaitIndex: waitIndex,
+		}).WithContext(ctx)
+
+		eval, meta, err := n.client.Evaluations().Info(evalID, q)
+		if err != nil {
+			// Prefer the context error so callers can tell a timeout from a
+			// Nomad failure.
+			if ctx.Err() != nil {
+				return ctx.Err()
+			}
+			return fmt.Errorf("failed to query eval %s: %w", evalID, err)
+		}
+		if eval == nil {
+			return fmt.Errorf("nomad returned no evaluation for eval %s", evalID)
+		}
+
+		switch eval.Status {
+		case "complete":
+			if eval.BlockedEval != "" || len(eval.FailedTGAllocs) > 0 {
+				n.logger.Warnf("Nomad eval %s completed without placing tray %s (blockedEval=%q): %s",
+					evalID, tray.Id, eval.BlockedEval, eval.StatusDescription)
+				return fmt.Errorf("%w: %s", ErrCapacityBlocked, formatBlockedReason(eval))
+			}
+			return nil
+		case "blocked":
+			n.logger.Warnf("Nomad eval %s blocked for tray %s: %s", evalID, tray.Id, eval.StatusDescription)
+			return fmt.Errorf("%w: %s", ErrCapacityBlocked, formatBlockedReason(eval))
+		case "failed", "canceled":
+			return fmt.Errorf("nomad eval %s ended with status %s: %s", evalID, eval.Status, eval.StatusDescription)
+		case "pending", "":
+			// keep waiting; advance WaitIndex so the next call blocks server-side
+			if meta != nil && meta.LastIndex > waitIndex {
+				waitIndex = meta.LastIndex
+			}
+		default:
+			return fmt.Errorf("nomad eval %s returned unexpected status %q", evalID, eval.Status)
+		}
+	}
+}
+
+// CleanTray deregisters the dispatched child job. Safe to call on a tray that
+// StartDeploy never finished — missing DispatchedJobID is treated as a no-op.
+func (n *NomadProvider) CleanTray(ctx context.Context, tray *trays.Tray) error {
+	dispatchedJobID := tray.ProviderData[nomadProviderDataDispatchedJobID]
+	if dispatchedJobID == "" {
+		n.logger.Warnf("CleanTray called without dispatchedJobId for tray %s; nothing to deregister", tray.Id)
+		return nil
+	}
+
+	// Prefer the namespace recorded at dispatch time; fall back to the
+	// provider's current namespace when none was stored.
+	ns := tray.ProviderData[nomadProviderDataNamespace]
+	if ns == "" {
+		ns = n.namespace
+	}
+
+	// The second argument is purge=true: the child job is removed outright
+	// rather than only stopped.
+	_, _, err := n.client.Jobs().Deregister(
+		dispatchedJobID,
+		true,
+		(&api.WriteOptions{Namespace: ns}).WithContext(ctx),
+	)
+	if err != nil {
+		if isNomad404(err) {
+			n.logger.Tracef("Dispatched job %s already gone; nothing to do", dispatchedJobID)
+			return nil
+		}
+		// Wrap with the job and tray so the caller's log line identifies
+		// what failed to clean up; %w keeps the original error matchable.
+		return fmt.Errorf("failed to deregister nomad job %s for tray %s: %w", dispatchedJobID, tray.Id, err)
+	}
+	return nil
+}
+
+// generateBootstrapToken returns 32 bytes read from crypto/rand, hex-encoded
+// into a 64-character string, or the error from the random source.
+func generateBootstrapToken() (string, error) {
+	b := make([]byte, 32)
+	if _, err := rand.Read(b); err != nil {
+		return "", err
+	}
+	return hex.EncodeToString(b), nil
+}
+
+func buildBootstrapPayload(userScript string) []byte {
+	return []byte(strings.Replace(defaultBootstrapTemplate, "{{USER_SCRIPT}}", userScript, 1))
+}
+
+// formatBlockedReason summarizes the first FailedTGAllocs entry into something
+// loggable. Nomad's AllocationMetric carries node counters
+// (NodesExhausted/ConstraintFiltered/etc.) that almost always answer "why
+// didn't this place" without parsing the entire structure.
+//
+// Despite the wording above, every task group with a non-nil metric is
+// reported, joined with "; ". Entries come out in map iteration order, so the
+// order is not stable between calls. When there are no entries, or every
+// entry is nil, the eval's StatusDescription is returned instead so the
+// caller never gets an empty reason while a description exists.
+func formatBlockedReason(eval *api.Evaluation) string {
+	if len(eval.FailedTGAllocs) == 0 {
+		return eval.StatusDescription
+	}
+	parts := make([]string, 0, len(eval.FailedTGAllocs))
+	for tg, m := range eval.FailedTGAllocs {
+		if m == nil {
+			continue
+		}
+		parts = append(parts, fmt.Sprintf(
+			"%s: nodesEvaluated=%d nodesFiltered=%d nodesExhausted=%d classFiltered=%d constraintFiltered=%d",
+			tg, m.NodesEvaluated, m.NodesFiltered, m.NodesExhausted,
+			len(m.ClassFiltered), len(m.ConstraintFiltered),
+		))
+	}
+	if len(parts) == 0 {
+		return eval.StatusDescription
+	}
+	return strings.Join(parts, "; ")
+}
+
+// isNomad404 returns true if err is a Nomad "job not found" response.
Nomad's +// api package returns plain errors with the HTTP status embedded in the +// message; there is no typed 404 to match against. +func isNomad404(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "not found") || strings.Contains(msg, "404") +} diff --git a/src/lib/trays/providers/trayProviderFactory.go b/src/lib/trays/providers/trayProviderFactory.go index 7e4cfa0..e099d1a 100644 --- a/src/lib/trays/providers/trayProviderFactory.go +++ b/src/lib/trays/providers/trayProviderFactory.go @@ -68,6 +68,10 @@ func GetProvider(providerName string) (TrayProvider, error) { if p := NewGceProvider(providerName, provider); p != nil { result = p } + case "nomad": + if p := NewNomadProvider(providerName, provider); p != nil { + result = p + } default: return nil, errors.New("unknown provider type: " + provider["type"]) } From ac98ff954479d6aa65a55494b3c96f8e38fc6cf0 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 7 May 2026 18:39:12 +0400 Subject: [PATCH 2/4] runner folder --- docs/configuration.md | 22 +++++++-- examples/example-config.yaml | 12 +++-- src/lib/config/trayTypeConfig.go | 13 ++++- src/lib/trays/providers/nomadProvider.go | 62 ++++++++++++++++-------- 4 files changed, 79 insertions(+), 30 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 8752e69..5277d2e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -74,6 +74,7 @@ trayTypes: shutdown: true config: jobId: scw-cattery-runner-tray + runnerFolder: /cattery script: | echo "extra setup for $TRAY_NAME" ``` @@ -137,7 +138,7 @@ Provider-specific fields: | Key | Type | Required | Description | |-----------|--------|----------|---------------------------------------------------------------------------------------------------| | address | string | yes | Nomad agent HTTP(S) address, e.g. `https://nomad.internal:4646`. | - | token | string | no | Nomad ACL token. 
Should be scoped to `dispatch-job` on the parameterized parent job(s). | + | token | string | no | Nomad ACL token. Needs `dispatch-job` (StartDeploy), `read-job` (WaitDeploy reads the evaluation), and `deregister-job`/`purge-job` (CleanTray purges the dispatched child) on the parent job's namespace. See [Nomad ACL policies](https://developer.hashicorp.com/nomad/docs/secure/acl/policies) for the exact capability names in your Nomad version. | | namespace | string | no | Nomad namespace to dispatch into. Defaults to `default`. | | region | string | no | Nomad region. Defaults to the agent's region. | | tlsCaFile | string | no | Path to a PEM CA bundle for verifying the Nomad agent's TLS certificate. | @@ -179,10 +180,19 @@ Provider-specific config under trayType.config: - nomad config - | Key | Type | Required | Description | - |--------|--------|----------|------------------------------------------------------------------------------------------------------| - | jobId | string | yes | ID of a parameterized parent job already registered in Nomad. Cattery dispatches one child per tray. | - | script | string | no | Inline bash, executed after the agent binary is downloaded and before the agent is exec'd. Use YAML's `\|` block scalar for multi-line. | + | Key | Type | Required | Description | + |--------------|--------|----------|------------------------------------------------------------------------------------------------------| + | jobId | string | yes | ID of a parameterized parent job already registered in Nomad. Cattery dispatches one child per tray. | + | runnerFolder | string | no | Path inside the guest where the GitHub Actions runner distribution lives. Passed as `--runner-folder` to `cattery agent`. Defaults to `/cattery`. | + | script | string | no | Inline bash, executed after the agent binary is downloaded and before the agent is exec'd. Use YAML's `\|` block scalar for multi-line. 
| + + **Bootstrap composition.** The provider builds the dispatched payload from three pieces: + + 1. A fixed prelude that downloads the cattery agent binary from `$CATTERY_URL/agent/binary` to `/usr/local/bin/cattery`. + 2. The optional `script` field, executed as a pre-agent hook. + 3. An `exec /usr/local/bin/cattery agent -i "$TRAY_NAME" -s "$CATTERY_URL" --runner-folder RUNNER_FOLDER`, where `RUNNER_FOLDER` is the trayType's `runnerFolder` and defaults to `/cattery`. + + To take over the agent invocation entirely (e.g. when the image starts the agent itself via systemd), put your own `exec ...` at the end of `script` — the default exec emitted afterwards becomes unreachable. **Parent-job contract.** The parameterized parent job must declare: @@ -203,6 +213,8 @@ Provider-specific config under trayType.config: **Resource shapes.** Resources, driver, constraints and reschedule policy are baked into the parent job spec — they cannot be set per-dispatch. To run trays at different sizes, register multiple parameterized parent jobs and reference them by `jobId` from different trayTypes. + **`extraMetadata` and Nomad meta.** Any keys in the trayType's `extraMetadata` are forwarded as Nomad dispatch meta alongside `tray_name` / `bootstrap_token` / `cattery_url`. Nomad rejects dispatch meta keys that are not declared in the parent job's `meta_required` or `meta_optional`, so any keys you add via `extraMetadata` must also be declared `meta_optional` in the parameterized parent job. + Notes: - Ensure runnerGroupId corresponds to an existing Runner Group in your GitHub org and that your GitHub App has permission to register runners. diff --git a/examples/example-config.yaml b/examples/example-config.yaml index 05f7ebc..0ccdfe8 100644 --- a/examples/example-config.yaml +++ b/examples/example-config.yaml @@ -45,7 +45,9 @@ providers: - name: nomad-scw type: nomad address: https://nomad.internal:4646 - # ACL token scoped to dispatch-job on the parameterized parent job(s). + # ACL token.
Needs dispatch-job (StartDeploy), read-job (WaitDeploy reads + # the evaluation), and deregister-job/purge-job (CleanTray purges the + # dispatched child) on the parent job's namespace. token: namespace: runners # optional region: global # optional @@ -91,9 +93,13 @@ trayTypes: # overriding them at dispatch time. Use distinct parent jobs for distinct # resource shapes. jobId: scw-cattery-runner-tray + # Path inside the guest where the GitHub Actions runner distribution + # lives. Passed as --runner-folder to `cattery agent`. Defaults to + # /cattery if omitted. + runnerFolder: /cattery # Optional inline bash, executed after the agent binary is downloaded - # and before the agent is exec'd. The provider's default bootstrap - # exports TRAY_NAME, BOOTSTRAP_TOKEN and CATTERY_URL from meta. + # and before the agent is exec'd. The parent job is expected to have + # exported TRAY_NAME, BOOTSTRAP_TOKEN and CATTERY_URL from meta. script: | echo "extra setup for $TRAY_NAME" # mkfs / mount scratch volume, install build tools, etc. diff --git a/src/lib/config/trayTypeConfig.go b/src/lib/config/trayTypeConfig.go index a4c5c54..56ac388 100644 --- a/src/lib/config/trayTypeConfig.go +++ b/src/lib/config/trayTypeConfig.go @@ -29,8 +29,17 @@ type DockerTrayConfig struct { // payload before the agent is exec'd. Use it for per-tray-type setup // (mounting volumes, installing tools, etc.). Use YAML's `|` block scalar to // embed multi-line scripts. +// +// RunnerFolder is the path inside the guest where the GitHub Actions runner +// distribution lives. The provider's default bootstrap passes it as the +// `--runner-folder` flag to `cattery agent` (which is required by the agent). +// Defaults to /cattery if empty. To take over the agent invocation entirely +// (e.g. when the image starts the agent itself via systemd), put your own +// `exec ...` at the end of Script — the default exec emitted afterwards +// becomes unreachable. 
type NomadTrayConfig struct { TrayConfig - JobId string `yaml:"jobId" validate:"required"` - Script string `yaml:"script"` + JobId string `yaml:"jobId"` + Script string `yaml:"script"` + RunnerFolder string `yaml:"runnerFolder"` } diff --git a/src/lib/trays/providers/nomadProvider.go b/src/lib/trays/providers/nomadProvider.go index eb6f579..db07d40 100644 --- a/src/lib/trays/providers/nomadProvider.go +++ b/src/lib/trays/providers/nomadProvider.go @@ -26,23 +26,27 @@ const ( nomadProviderDataNamespace = "namespace" ) -// defaultBootstrapTemplate is the payload script the provider synthesizes per -// dispatch. The {{USER_SCRIPT}} marker is replaced with NomadTrayConfig.Script -// (empty string if not configured). The script assumes that meta values -// TRAY_NAME, BOOTSTRAP_TOKEN, CATTERY_URL are already exported in the -// environment (the parameterized parent job is responsible for sourcing them -// from /etc/cattery/bootstrap.env before exec'ing this script — see -// scw-cattery-runner-tray.nomad.hcl for the canonical setup). -const defaultBootstrapTemplate = `#!/bin/bash -set -euo pipefail - -curl -fsSL "$CATTERY_URL/agent/binary" -o /usr/local/bin/cattery -chmod +x /usr/local/bin/cattery - -{{USER_SCRIPT}} - -exec /usr/local/bin/cattery agent -i "$TRAY_NAME" -s "$CATTERY_URL" -` +// defaultRunnerFolder is used when NomadTrayConfig.RunnerFolder is empty. +// It is the path inside the guest where the GitHub Actions runner +// distribution is expected to live and is passed as `--runner-folder` to +// `cattery agent` (which is required by the agent CLI). To take over the +// agent invocation (e.g. when the image starts the agent itself), put your +// own `exec ...` at the end of NomadTrayConfig.Script — the default exec +// emitted afterwards becomes unreachable. +const defaultRunnerFolder = "/cattery" + +// The provider synthesizes the dispatched payload from three pieces: +// +// 1. A fixed prelude that downloads the cattery agent binary. +// 2. 
The user's optional Script, executed as a pre-agent hook. +// 3. An exec of `cattery agent ... --runner-folder <runnerFolder>`, using +// defaultRunnerFolder when RunnerFolder is empty. +// +// The script assumes meta values TRAY_NAME, BOOTSTRAP_TOKEN, CATTERY_URL are +// exported in the environment. The parameterized parent job is responsible +// for sourcing them from /etc/cattery/bootstrap.env (or equivalent) before +// exec'ing this script — see scw-cattery-runner-tray.nomad.hcl for the +// canonical setup. type NomadProvider struct { name string @@ -128,7 +132,7 @@ func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error return fmt.Errorf("failed to generate bootstrap token: %w", err) } - payload := buildBootstrapPayload(trayConfig.Script) + payload := buildBootstrapPayload(trayConfig.Script, trayConfig.RunnerFolder) meta := map[string]string{ "tray_name": tray.Id, @@ -254,8 +258,26 @@ func generateBootstrapToken() (string, error) { return hex.EncodeToString(b), nil } -func buildBootstrapPayload(userScript string) []byte { - return []byte(strings.Replace(defaultBootstrapTemplate, "{{USER_SCRIPT}}", userScript, 1)) +// buildBootstrapPayload composes the dispatched bash payload. runnerFolder +// defaults to defaultRunnerFolder when empty. 
+func buildBootstrapPayload(userScript, runnerFolder string) []byte { + if runnerFolder == "" { + runnerFolder = defaultRunnerFolder + } + var sb strings.Builder + sb.WriteString("#!/bin/bash\n") + sb.WriteString("set -euo pipefail\n\n") + sb.WriteString(`curl -fsSL "$CATTERY_URL/agent/binary" -o /usr/local/bin/cattery` + "\n") + sb.WriteString("chmod +x /usr/local/bin/cattery\n\n") + if userScript != "" { + sb.WriteString(userScript) + if !strings.HasSuffix(userScript, "\n") { + sb.WriteString("\n") + } + sb.WriteString("\n") + } + fmt.Fprintf(&sb, "exec /usr/local/bin/cattery agent -i \"$TRAY_NAME\" -s \"$CATTERY_URL\" --runner-folder %q\n", runnerFolder) + return []byte(sb.String()) } // formatBlockedReason summarizes the first FailedTGAllocs entry into something From 25109fbac2526fc6a87dfd799da8cc8ed2aa6890 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 7 May 2026 22:43:36 +0400 Subject: [PATCH 3/4] fixes --- docs/configuration.md | 43 ++++++-- examples/example-config.yaml | 5 +- src/lib/trayManager/trayManager.go | 6 ++ src/lib/trays/providers/nomadProvider.go | 120 +++++++++++++++++++---- 4 files changed, 143 insertions(+), 31 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 5277d2e..ecdcacc 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -138,7 +138,7 @@ Provider-specific fields: | Key | Type | Required | Description | |-----------|--------|----------|---------------------------------------------------------------------------------------------------| | address | string | yes | Nomad agent HTTP(S) address, e.g. `https://nomad.internal:4646`. | - | token | string | no | Nomad ACL token. Needs `dispatch-job` (StartDeploy), `read-job` (WaitDeploy reads the evaluation), and `deregister-job`/`purge-job` (CleanTray purges the dispatched child) on the parent job's namespace. 
See [Nomad ACL policies](https://developer.hashicorp.com/nomad/docs/secure/acl/policies) for the exact capability names in your Nomad version. | + | token | string | no | Nomad ACL token. Needs `dispatch-job` (StartDeploy), `read-job` (WaitDeploy reads the evaluation), `list-jobs` (CleanTray's leaked-child recovery scan), and `deregister-job`/`purge-job` (CleanTray purges the dispatched child) on the parent job's namespace. See [Nomad ACL policies](https://developer.hashicorp.com/nomad/docs/secure/acl/policies) for the exact capability names in your Nomad version. | | namespace | string | no | Nomad namespace to dispatch into. Defaults to `default`. | | region | string | no | Nomad region. Defaults to the agent's region. | | tlsCaFile | string | no | Path to a PEM CA bundle for verifying the Nomad agent's TLS certificate. | @@ -194,26 +194,51 @@ Provider-specific config under trayType.config: To take over the agent invocation entirely (e.g. when the image starts the agent itself via systemd), put your own `exec ...` at the end of `script` — the default exec emitted afterwards becomes unreachable. - **Parent-job contract.** The parameterized parent job must declare: + **Parent-job contract.** The parameterized parent job must declare the `parameterized` stanza at the job level *and* materialize the dispatched payload at the task level via `dispatch_payload`. Without `dispatch_payload`, Nomad accepts the dispatch but never writes the payload bytes anywhere the task can read. ```hcl - parameterized { - payload = "required" - meta_required = ["tray_name", "bootstrap_token", "cattery_url"] + job "my-runner-tray" { + type = "batch" + + parameterized { + payload = "required" + meta_required = ["tray_name", "bootstrap_token", "cattery_url"] + } + + group "g" { + task "t" { + // Nomad writes the dispatched payload to ${NOMAD_TASK_DIR}/bootstrap.sh + // before the task starts. + dispatch_payload { + file = "bootstrap.sh" + } + + // ... driver, config, resources ... 
+ } + } } ``` - Inside the alloc, the dispatched payload is the bash script the provider generated. Before exec'ing it, your parent job is responsible for exporting the meta values as `TRAY_NAME`, `BOOTSTRAP_TOKEN` and `CATTERY_URL` (e.g. via cloud-init writing `/etc/cattery/bootstrap.env` and sourcing it). The default payload assumes those env vars are present. + The dispatched bytes land at `${NOMAD_TASK_DIR}/bootstrap.sh`. Your task is responsible for executing that file *with the dispatch meta values exported as env vars* — the script generated by cattery references `$CATTERY_URL`, `$TRAY_NAME` and `$BOOTSTRAP_TOKEN`. Two common ways to wire that up: + + - For raw_exec / exec drivers running directly on the host: source a small env file and exec the payload, e.g. + ``` + set -a; . /etc/cattery/bootstrap.env; set +a + bash "$NOMAD_TASK_DIR/bootstrap.sh" + ``` + - For VM-style drivers (qemu, firecracker, custom `nomad-runner-vm` wrappers): render a cloud-init userdata template that uses `write_files` to drop the meta values into an env file (e.g. `/etc/cattery/bootstrap.env`) and a `runcmd` that sources it before exec'ing the dispatched payload. The wrapper bakes the rendered userdata into the guest's cidata seed iso. + + Either approach must produce an environment where `TRAY_NAME`, `BOOTSTRAP_TOKEN` and `CATTERY_URL` are exported when the payload script runs. **Lifecycle.** - - Cattery dispatches the parent job, stores `dispatchedJobId` + `evalId` in the tray's provider data. + - Cattery dispatches the parent job with `idPrefixTemplate = tray.Id` and `IdempotencyToken = tray.Id`, and stores `dispatchedJobId` + `evalId` + `parentJobId` in the tray's provider data. The provider stages `parentJobId` and `namespace` in memory before the dispatch call; the trayManager persists provider data once `StartDeploy` returns (success path) or right before cleanup (error path). 
This recovers the case where Dispatch creates the child but the HTTP response is lost — it does *not* recover a process crash mid-dispatch (parentJobId never reaches the database in that window). - Cattery blocks until the dispatch evaluation leaves `pending`. `complete` → success; `blocked` → returned as `ErrCapacityBlocked` (Nomad has no capacity for this alloc); `failed`/`canceled` → error. - - On tray cleanup, the dispatched child job is deregistered with `purge=true`. + - On tray cleanup, the dispatched child job is deregistered with `purge=true`. If `dispatchedJobId` is missing (e.g., the dispatch response was lost in transit), cattery lists the parent's dispatched children with the prefix `<parentJobId>/dispatch-` and deregisters any whose ID starts with `<parentJobId>/dispatch-<trayId>-` (the shape Nomad assigns when `idPrefixTemplate = tray.Id`). **Resource shapes.** Resources, driver, constraints and reschedule policy are baked into the parent job spec — they cannot be set per-dispatch. To run trays at different sizes, register multiple parameterized parent jobs and reference them by `jobId` from different trayTypes. - **`extraMetadata` and Nomad meta.** Any keys in the trayType's `extraMetadata` are forwarded as Nomad dispatch meta alongside `tray_name` / `bootstrap_token` / `cattery_url`. Nomad rejects dispatch meta keys that are not declared in the parent job's `meta_required` or `meta_optional`, so any keys you add via `extraMetadata` must also be declared `meta_optional` in the parameterized parent job. + **`extraMetadata` and Nomad meta.** Any keys in the trayType's `extraMetadata` are forwarded as Nomad dispatch meta alongside `tray_name` / `bootstrap_token` / `cattery_url`. The provider-owned keys are written *last* and cannot be clobbered by `extraMetadata`. Nomad rejects dispatch meta keys that are not declared in the parent job's `meta_required` or `meta_optional`, so any keys you add via `extraMetadata` must also be declared `meta_optional` in the parameterized parent job. 
Notes: diff --git a/examples/example-config.yaml b/examples/example-config.yaml index 0ccdfe8..f58f905 100644 --- a/examples/example-config.yaml +++ b/examples/example-config.yaml @@ -46,8 +46,9 @@ providers: type: nomad address: https://nomad.internal:4646 # ACL token. Needs dispatch-job (StartDeploy), read-job (WaitDeploy reads - # the evaluation), and deregister-job/purge-job (CleanTray purges the - # dispatched child) on the parent job's namespace. + # the evaluation), list-jobs (CleanTray's leaked-child recovery scan), + # and deregister-job/purge-job (CleanTray purges the dispatched child) + # on the parent job's namespace. token: namespace: runners # optional region: global # optional diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 249551e..7ba2095 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -111,6 +111,12 @@ func (tm *TrayManager) CreateTray(ctx context.Context, trayType *config.TrayType if err := provider.StartDeploy(ctx, tray); err != nil { log.Errorf("Failed start deploy for tray %s: %v", tray.Id, err) metrics.TrayProviderErrors(tray.GitHubOrgName, tray.ProviderName, tray.TrayTypeName, "create") + // Persist any provider data the failed StartDeploy populated (e.g., + // nomad's parentJobId for leaked-child recovery) before DeleteTray + // reloads the row and dispatches CleanTray on it. 
+ if _, pErr := tm.trayRepository.SetProviderData(ctx, tray.Id, tray.ProviderData); pErr != nil { + log.Errorf("Failed to persist provider data after start deploy error for tray %s: %v", tray.Id, pErr) + } if _, dErr := tm.DeleteTray(ctx, tray.Id); dErr != nil { log.Errorf("Failed to delete tray %s after start deploy error: %v", tray.Id, dErr) } diff --git a/src/lib/trays/providers/nomadProvider.go b/src/lib/trays/providers/nomadProvider.go index db07d40..8a1c71f 100644 --- a/src/lib/trays/providers/nomadProvider.go +++ b/src/lib/trays/providers/nomadProvider.go @@ -24,6 +24,7 @@ const ( nomadProviderDataDispatchedJobID = "dispatchedJobId" nomadProviderDataEvalID = "evalId" nomadProviderDataNamespace = "namespace" + nomadProviderDataParentJobID = "parentJobId" ) // defaultRunnerFolder is used when NomadTrayConfig.RunnerFolder is empty. @@ -45,8 +46,8 @@ const defaultRunnerFolder = "/cattery" // The script assumes meta values TRAY_NAME, BOOTSTRAP_TOKEN, CATTERY_URL are // exported in the environment. The parameterized parent job is responsible // for sourcing them from /etc/cattery/bootstrap.env (or equivalent) before -// exec'ing this script — see scw-cattery-runner-tray.nomad.hcl for the -// canonical setup. +// exec'ing this script — see the "Parent-job contract" section in +// docs/configuration.md for the wiring patterns. type NomadProvider struct { name string @@ -117,7 +118,16 @@ func (n *NomadProvider) GetProviderName() string { // StartDeploy submits a parameterized-job dispatch to Nomad. ProviderData is // populated *before* the call returns so a partial failure (or a process // restart between StartDeploy and CleanTray) still leaves enough context for -// CleanTray to attempt cleanup. +// CleanTray to attempt cleanup — even when the dispatch HTTP response was +// lost in transit, the persisted parentJobId + tray.Id let CleanTray scan +// for leaked children via Nomad's job ID prefix and purge them. 
+// +// To make this safe under retry, the dispatch sets: +// +// - idPrefixTemplate = tray.Id, so the child job's ID always contains +// tray.Id and can be located by prefix scan. +// - IdempotencyToken = tray.Id, so a retried Dispatch with the same token +// does not create a duplicate child. func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error { trayConfig, ok := tray.TrayConfig().(config.NomadTrayConfig) if !ok { @@ -134,25 +144,32 @@ func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error payload := buildBootstrapPayload(trayConfig.Script, trayConfig.RunnerFolder) - meta := map[string]string{ - "tray_name": tray.Id, - "bootstrap_token": bootstrapToken, - "cattery_url": config.Get().Server.AdvertiseUrl, - } + // Provider-owned keys are written *last* so that user-supplied + // extraMetadata cannot accidentally clobber the bootstrap contract. + meta := map[string]string{} if tt := tray.TrayType(); tt != nil { for k, v := range tt.ExtraMetadata { meta[k] = v } } + meta["tray_name"] = tray.Id + meta["bootstrap_token"] = bootstrapToken + meta["cattery_url"] = config.Get().Server.AdvertiseUrl + // Persisted *before* the dispatch call so cleanup can recover from a + // lost response. tray.ProviderData[nomadProviderDataNamespace] = n.namespace + tray.ProviderData[nomadProviderDataParentJobID] = trayConfig.JobId resp, _, err := n.client.Jobs().Dispatch( trayConfig.JobId, meta, payload, - "", - (&api.WriteOptions{Namespace: n.namespace}).WithContext(ctx), + tray.Id, + (&api.WriteOptions{ + Namespace: n.namespace, + IdempotencyToken: tray.Id, + }).WithContext(ctx), ) if err != nil { n.logger.Errorf("Failed to dispatch nomad job %s for tray %s: %v", trayConfig.JobId, tray.Id, err) @@ -221,28 +238,45 @@ func (n *NomadProvider) WaitDeploy(ctx context.Context, tray *trays.Tray) error } } -// CleanTray deregisters the dispatched child job. 
Safe to call on a tray that -// StartDeploy never finished — missing DispatchedJobID is treated as a no-op. +// CleanTray deregisters the dispatched child job. Safe to call on a tray +// that StartDeploy never finished: +// +// - If dispatchedJobId is stored, deregister it directly (fast path). +// - Otherwise, if parentJobId is stored, scan the parent's dispatched +// children for any whose ID contains tray.Id and deregister them. +// This recovers the leaked-child scenario where Dispatch succeeded on +// the server but the response was lost (network error / timeout) and +// dispatchedJobId never made it into ProviderData. +// - If neither is stored, nothing to do. func (n *NomadProvider) CleanTray(ctx context.Context, tray *trays.Tray) error { - dispatchedJobID := tray.ProviderData[nomadProviderDataDispatchedJobID] - if dispatchedJobID == "" { - n.logger.Warnf("CleanTray called without dispatchedJobId for tray %s; nothing to deregister", tray.Id) - return nil - } - ns := tray.ProviderData[nomadProviderDataNamespace] if ns == "" { ns = n.namespace } + dispatchedJobID := tray.ProviderData[nomadProviderDataDispatchedJobID] + if dispatchedJobID != "" { + return n.deregister(ctx, ns, dispatchedJobID) + } + + parentJobID := tray.ProviderData[nomadProviderDataParentJobID] + if parentJobID == "" { + n.logger.Warnf("CleanTray called without dispatchedJobId or parentJobId for tray %s; nothing to do", tray.Id) + return nil + } + + return n.cleanupLeakedDispatch(ctx, ns, parentJobID, tray.Id) +} + +func (n *NomadProvider) deregister(ctx context.Context, ns, jobID string) error { _, _, err := n.client.Jobs().Deregister( - dispatchedJobID, + jobID, true, (&api.WriteOptions{Namespace: ns}).WithContext(ctx), ) if err != nil { if isNomad404(err) { - n.logger.Tracef("Dispatched job %s already gone; nothing to do", dispatchedJobID) + n.logger.Tracef("Dispatched job %s already gone; nothing to do", jobID) return nil } return err @@ -250,6 +284,52 @@ func (n *NomadProvider) 
CleanTray(ctx context.Context, tray *trays.Tray) error { return nil } +// cleanupLeakedDispatch finds child jobs of parentJobID that were dispatched +// for trayID and deregisters each. Used when StartDeploy could not persist +// the returned DispatchedJobID. The provider always dispatches with +// idPrefixTemplate = tray.Id, so a leaked child's ID has the shape +// "<parentJobID>/dispatch-<trayID>-<timestamp>-<uuid>" — matched by prefix. +func (n *NomadProvider) cleanupLeakedDispatch(ctx context.Context, ns, parentJobID, trayID string) error { + expectedPrefix := parentJobID + "/dispatch-" + trayID + "-" + + q := (&api.QueryOptions{ + Namespace: ns, + Prefix: parentJobID + "/dispatch-", + }).WithContext(ctx) + + stubs, _, err := n.client.Jobs().List(q) + if err != nil { + if isNomad404(err) { + return nil + } + return fmt.Errorf("failed to list nomad jobs for cleanup recovery: %w", err) + } + + matched := 0 + var firstErr error + for _, stub := range stubs { + if stub.ParentID != parentJobID { + continue + } + if !strings.HasPrefix(stub.ID, expectedPrefix) { + continue + } + matched++ + if err := n.deregister(ctx, ns, stub.ID); err != nil { + n.logger.Errorf("Failed to deregister leaked dispatched job %s for tray %s: %v", stub.ID, trayID, err) + if firstErr == nil { + firstErr = err + } + continue + } + n.logger.Infof("Deregistered leaked dispatched job %s for tray %s", stub.ID, trayID) + } + if matched == 0 { + n.logger.Tracef("No leaked dispatched children found for tray %s under parent %s", trayID, parentJobID) + } + return firstErr +} + func generateBootstrapToken() (string, error) { b := make([]byte, 32) if _, err := rand.Read(b); err != nil { From 85128787dafd440b847962e919614f2e9f14a7c1 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Fri, 8 May 2026 15:40:24 +0400 Subject: [PATCH 4/4] comments --- src/lib/trays/providers/nomadProvider.go | 53 +++++++++++++++--------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/lib/trays/providers/nomadProvider.go 
b/src/lib/trays/providers/nomadProvider.go index 8a1c71f..4a01e7d 100644 --- a/src/lib/trays/providers/nomadProvider.go +++ b/src/lib/trays/providers/nomadProvider.go @@ -50,8 +50,7 @@ const defaultRunnerFolder = "/cattery" // docs/configuration.md for the wiring patterns. type NomadProvider struct { - name string - providerConfig config.ProviderConfig + name string client *api.Client namespace string @@ -103,11 +102,10 @@ func NewNomadProvider(name string, providerConfig config.ProviderConfig) *NomadP } return &NomadProvider{ - name: name, - providerConfig: providerConfig, - client: client, - namespace: cfg.Namespace, - logger: logger, + name: name, + client: client, + namespace: cfg.Namespace, + logger: logger, } } @@ -115,17 +113,25 @@ func (n *NomadProvider) GetProviderName() string { return n.name } -// StartDeploy submits a parameterized-job dispatch to Nomad. ProviderData is -// populated *before* the call returns so a partial failure (or a process -// restart between StartDeploy and CleanTray) still leaves enough context for -// CleanTray to attempt cleanup — even when the dispatch HTTP response was -// lost in transit, the persisted parentJobId + tray.Id let CleanTray scan -// for leaked children via Nomad's job ID prefix and purge them. +// StartDeploy submits a parameterized-job dispatch to Nomad. +// +// ProviderData ordering matters for cleanup recovery: +// +// - parentJobId and namespace are staged on tray.ProviderData *before* the +// Dispatch call, so that when trayManager persists ProviderData (after +// StartDeploy returns, on either the success or error path) those keys +// are durable. CleanTray uses them to scan for leaked children when +// dispatchedJobId is missing — recovering the case where Dispatch +// created the child but the HTTP response was lost. This does NOT +// recover a process crash *during* the in-flight Dispatch, since +// ProviderData hasn't been persisted yet at that point. 
+// - dispatchedJobId and evalId are written from the Dispatch response. // // To make this safe under retry, the dispatch sets: // -// - idPrefixTemplate = tray.Id, so the child job's ID always contains -// tray.Id and can be located by prefix scan. +// - idPrefixTemplate = tray.Id, so the child job's ID has the shape +// "<parentJobId>/dispatch-<trayId>-<timestamp>-<uuid>" and can be located +// by prefix scan. // - IdempotencyToken = tray.Id, so a retried Dispatch with the same token // does not create a duplicate child. func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error { @@ -137,6 +143,10 @@ func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error return fmt.Errorf("nomad tray config missing jobId, tray %s", tray.Id) } + // bootstrapToken is forwarded as a meta value and surfaces inside the + // guest as $BOOTSTRAP_TOKEN. It is not validated by the cattery server + // today; the field is plumbed through so a future change can adopt + // per-dispatch token validation without touching the parent-job contract. bootstrapToken, err := generateBootstrapToken() if err != nil { return fmt.Errorf("failed to generate bootstrap token: %w", err) @@ -156,8 +166,10 @@ func (n *NomadProvider) StartDeploy(ctx context.Context, tray *trays.Tray) error meta["bootstrap_token"] = bootstrapToken meta["cattery_url"] = config.Get().Server.AdvertiseUrl - // Persisted *before* the dispatch call so cleanup can recover from a - // lost response. + // Staged on tray.ProviderData before the Dispatch call so that when + // trayManager persists ProviderData after StartDeploy returns, cleanup + // can recover a leaked child via the parent-job prefix scan. See the + // StartDeploy doc comment above for the recovery model and its limits. 
tray.ProviderData[nomadProviderDataNamespace] = n.namespace tray.ProviderData[nomadProviderDataParentJobID] = trayConfig.JobId @@ -243,9 +255,10 @@ func (n *NomadProvider) WaitDeploy(ctx context.Context, tray *trays.Tray) error // // - If dispatchedJobId is stored, deregister it directly (fast path). // - Otherwise, if parentJobId is stored, scan the parent's dispatched -// children for any whose ID contains tray.Id and deregister them. -// This recovers the leaked-child scenario where Dispatch succeeded on -// the server but the response was lost (network error / timeout) and +// children for any whose ID matches the prefix +// "<parentJobId>/dispatch-<trayId>-" and deregister them. This +// recovers the leaked-child scenario where Dispatch succeeded on the +// server but the response was lost (network error / timeout) and // dispatchedJobId never made it into ProviderData. // - If neither is stored, nothing to do. func (n *NomadProvider) CleanTray(ctx context.Context, tray *trays.Tray) error {