diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fa1bdfa..b0ae41c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -114,7 +114,22 @@ jobs: run: go mod download -C src - name: Build - run: go build -C src -o ../bin/cattery_linux_x86_${{ env.VERSION }} -ldflags="-X cattery/lib/version.Version=${{ env.VERSION }}" + env: + CGO_ENABLED: 0 + GOOS: linux + GOARCH: amd64 + run: | + go build -C src \ + -trimpath \ + -ldflags="-s -w -X cattery/lib/version.Version=${{ env.VERSION }}" \ + -o ../bin/cattery_linux_x86_${{ env.VERSION }} + + - name: Verify binary is statically linked + run: | + file bin/cattery_linux_x86_${{ env.VERSION }} + # ldd prints "not a dynamic executable" on fully-static binaries; + # exit 0 either way and just surface the result. + ldd bin/cattery_linux_x86_${{ env.VERSION }} || true - run: ls -la bin/ diff --git a/src/lib/scaleSetClient/scaleSetClient.go b/src/lib/scaleSetClient/scaleSetClient.go index ae06474..e7832b7 100644 --- a/src/lib/scaleSetClient/scaleSetClient.go +++ b/src/lib/scaleSetClient/scaleSetClient.go @@ -40,7 +40,10 @@ func NewScaleSetClient(org *config.GitHubOrganization, trayType *config.TrayType InstallationID: org.InstallationId, PrivateKey: string(privateKey), }, - }, scaleset.WithLogger(newSlogLogger(logger))) + }, + scaleset.WithLogger(newSlogLogger(logger)), + scaleset.WithRetryableHTTPClint(newRetryableClient(logger)), + ) if err != nil { return nil, fmt.Errorf("failed to create scale set client: %w", err) } @@ -88,7 +91,10 @@ func (sc *ScaleSetClient) CreateSession(ctx context.Context) error { const retryDelay = 30 * time.Second for attempt := range maxRetries { - session, err := sc.client.MessageSessionClient(ctx, sc.scaleSet.ID, hostname, scaleset.WithLogger(newSlogLogger(sc.logger))) + session, err := sc.client.MessageSessionClient(ctx, sc.scaleSet.ID, hostname, + scaleset.WithLogger(newSlogLogger(sc.logger)), + 
scaleset.WithRetryableHTTPClint(newRetryableClient(sc.logger)), + ) if err == nil { sc.session = session sc.logger.Info("Message session created") diff --git a/src/lib/scaleSetClient/slog_bridge.go b/src/lib/scaleSetClient/slog_bridge.go index b6de823..83fac42 100644 --- a/src/lib/scaleSetClient/slog_bridge.go +++ b/src/lib/scaleSetClient/slog_bridge.go @@ -4,9 +4,29 @@ import ( "context" "log/slog" + "github.com/hashicorp/go-retryablehttp" log "github.com/sirupsen/logrus" ) +// newRetryableClient builds a retryablehttp.Client whose Logger is our slog +// bridge so that retry/debug HTTP lines flow through logrus instead of the +// stdlib stderr default. +// +// Why this exists: actions/scaleset@v0.3.0/common_client.go:123 only sets the +// retry client's Logger if it is nil — but retryablehttp.NewClient() always +// populates Logger with a default `log.New(os.Stderr, "", log.LstdFlags)`. +// Result: scaleset.WithLogger(...) is silently dropped for the inner retry +// client, and you see lines like `2026/05/09 23:04:30 [DEBUG] GET ...` +// bypassing logrus formatting. +// +// Pre-setting Logger here makes scaleset's nil guard a no-op and our bridge +// stays in place. *slog.Logger satisfies retryablehttp.LeveledLogger. +func newRetryableClient(entry *log.Entry) *retryablehttp.Client { + rc := retryablehttp.NewClient() + rc.Logger = newSlogLogger(entry) + return rc +} + // logrusHandler bridges slog into logrus so that third-party libraries using // slog (e.g. actions/scaleset / go-retryablehttp) respect cattery's log level // and emit records in the same format as the rest of the application. 
diff --git a/src/lib/trays/providers/nomadProvider.go b/src/lib/trays/providers/nomadProvider.go index 4a01e7d..975379b 100644 --- a/src/lib/trays/providers/nomadProvider.go +++ b/src/lib/trays/providers/nomadProvider.go @@ -360,7 +360,7 @@ func buildBootstrapPayload(userScript, runnerFolder string) []byte { var sb strings.Builder sb.WriteString("#!/bin/bash\n") sb.WriteString("set -euo pipefail\n\n") - sb.WriteString(`curl -fsSL "$CATTERY_URL/agent/binary" -o /usr/local/bin/cattery` + "\n") + sb.WriteString(`curl -fsSL "$CATTERY_URL/agent/download" -o /usr/local/bin/cattery` + "\n") sb.WriteString("chmod +x /usr/local/bin/cattery\n\n") if userScript != "" { sb.WriteString(userScript) diff --git a/src/lib/trays/providers/nomadProvider_api_test.go b/src/lib/trays/providers/nomadProvider_api_test.go new file mode 100644 index 0000000..9176371 --- /dev/null +++ b/src/lib/trays/providers/nomadProvider_api_test.go @@ -0,0 +1,455 @@ +package providers + +import ( + "cattery/lib/config" + "cattery/lib/trays" + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" + + "github.com/hashicorp/nomad/api" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeNomad is a minimal HTTP fixture for the Nomad endpoints the provider +// uses. Each handler is overridable per-test; an unset handler or unmatched request answers 501 (an unset onDeregister answers 200) so that an +// unexpected call shows up as a test failure rather than silently passing. 
+type fakeNomad struct { + t *testing.T + + onDispatch func(jobID string, req *api.JobDispatchRequest, q url.Values) (*api.JobDispatchResponse, int) + onEvalInfo func(evalID string, q url.Values) (*api.Evaluation, int) + onJobsList func(q url.Values) ([]*api.JobListStub, int) + onDeregister func(jobID string, q url.Values) int + + dispatchCount int + deregCalls []string +} + +func (f *fakeNomad) ServeHTTP(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + q := r.URL.Query() + + switch { + case r.Method == http.MethodGet && r.URL.Path == "/v1/jobs": + if f.onJobsList == nil { + http.Error(w, "no onJobsList handler", http.StatusNotImplemented) + return + } + stubs, code := f.onJobsList(q) + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(stubs) + + case r.Method == http.MethodPut && strings.HasPrefix(r.URL.Path, "/v1/job/") && strings.HasSuffix(r.URL.Path, "/dispatch"): + jobID := strings.TrimSuffix(strings.TrimPrefix(r.URL.Path, "/v1/job/"), "/dispatch") + var req api.JobDispatchRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "bad body: "+err.Error(), http.StatusBadRequest) + return + } + if f.onDispatch == nil { + http.Error(w, "no onDispatch handler", http.StatusNotImplemented) + return + } + f.dispatchCount++ + resp, code := f.onDispatch(jobID, &req, q) + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(resp) + + case r.Method == http.MethodDelete && strings.HasPrefix(r.URL.Path, "/v1/job/"): + jobID := strings.TrimPrefix(r.URL.Path, "/v1/job/") + f.deregCalls = append(f.deregCalls, jobID) + if f.onDeregister == nil { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{}`)) + return + } + code := f.onDeregister(jobID, q) + w.WriteHeader(code) + _, _ = w.Write([]byte(`{}`)) + + case r.Method == http.MethodGet && strings.HasPrefix(r.URL.Path, "/v1/evaluation/"): + // Path is /v1/evaluation/{id}/allocations or /v1/evaluation/{id} + rest := strings.TrimPrefix(r.URL.Path, 
"/v1/evaluation/") + if strings.Contains(rest, "/") { + http.Error(w, "unsupported eval subpath: "+r.URL.Path, http.StatusNotImplemented) + return + } + if f.onEvalInfo == nil { + http.Error(w, "no onEvalInfo handler", http.StatusNotImplemented) + return + } + eval, code := f.onEvalInfo(rest, q) + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(eval) + + default: + http.Error(w, "unexpected request: "+r.Method+" "+r.URL.String(), http.StatusNotImplemented) + } +} + +// startFake spins up the fake Nomad and a NomadProvider pointing at it. The +// namespace is added to the provider config only when non-empty; pass "" to leave it unset. +func startFake(t *testing.T, namespace string) (*fakeNomad, *NomadProvider) { + t.Helper() + fake := &fakeNomad{t: t} + srv := httptest.NewServer(fake) + t.Cleanup(srv.Close) + + pc := config.ProviderConfig{ + "name": "test-nomad", + "type": "nomad", + "address": srv.URL, + } + if namespace != "" { + pc["namespace"] = namespace + } + p := NewNomadProvider("test-nomad", pc) + require.NotNil(t, p, "provider construction must succeed") + return fake, p +} + +// setupTrayConfig installs a CatteryConfig with the given NomadTrayConfig +// and an advertise URL the provider will plumb into the dispatched meta. 
+func setupTrayConfig(t *testing.T, trayTypeName string, nc config.NomadTrayConfig, advertiseURL string) { + t.Helper() + cfg := &config.CatteryConfig{ + Server: config.ServerConfig{ + ListenAddress: ":0", + AdvertiseUrl: advertiseURL, + }, + TrayTypes: []*config.TrayType{ + { + Name: trayTypeName, + Provider: "test-nomad", + GitHubOrg: "org", + RunnerGroupId: 1, + Config: nc, + }, + }, + } + config.SetForTest(t, cfg) +} + +func newTestTray(trayTypeName, id string) *trays.Tray { + return &trays.Tray{ + Id: id, + TrayTypeName: trayTypeName, + ProviderData: map[string]string{}, + } +} + +// --------------------------------------------------------------------------- +// StartDeploy +// --------------------------------------------------------------------------- + +func TestStartDeploy_Success(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{ + JobId: "cattery-runner-tray", + RunnerFolder: "/cattery", + Script: "echo hi", + }, "https://cattery.test") + + fake, p := startFake(t, "ci") + + var seenReq *api.JobDispatchRequest + var seenJobID string + var seenNS string + fake.onDispatch = func(jobID string, req *api.JobDispatchRequest, q url.Values) (*api.JobDispatchResponse, int) { + seenJobID = jobID + seenReq = req + seenNS = q.Get("namespace") + return &api.JobDispatchResponse{ + DispatchedJobID: "cattery-runner-tray/dispatch-trayid-001-abcd", + EvalID: "eval-1", + }, http.StatusOK + } + + tray := newTestTray("tt", "trayid-001") + err := p.StartDeploy(context.Background(), tray) + require.NoError(t, err) + + // Response is plumbed onto ProviderData. + assert.Equal(t, "cattery-runner-tray/dispatch-trayid-001-abcd", tray.ProviderData[nomadProviderDataDispatchedJobID]) + assert.Equal(t, "eval-1", tray.ProviderData[nomadProviderDataEvalID]) + // Pre-dispatch staging is durable on the tray. 
+ assert.Equal(t, "ci", tray.ProviderData[nomadProviderDataNamespace]) + assert.Equal(t, "cattery-runner-tray", tray.ProviderData[nomadProviderDataParentJobID]) + + // Request shape. + assert.Equal(t, "cattery-runner-tray", seenJobID) + assert.Equal(t, "ci", seenNS, "namespace should be passed as a query param") + require.NotNil(t, seenReq) + assert.Equal(t, "trayid-001", seenReq.IdPrefixTemplate, "idPrefixTemplate must equal tray.Id for prefix-scan recovery") + + // Meta is the bootstrap contract. + assert.Equal(t, "trayid-001", seenReq.Meta["tray_name"]) + assert.NotEmpty(t, seenReq.Meta["bootstrap_token"]) + assert.Equal(t, "https://cattery.test", seenReq.Meta["cattery_url"]) + + // Payload spliced our user script in and emitted the cattery exec. + payload := string(seenReq.Payload) + assert.Contains(t, payload, "echo hi") + assert.Contains(t, payload, `--runner-folder "/cattery"`) + assert.Contains(t, payload, `curl -fsSL "$CATTERY_URL/agent/download"`) +} + +func TestStartDeploy_MissingJobId(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{}, "https://cattery.test") + fake, p := startFake(t, "ci") + + tray := newTestTray("tt", "trayid-002") + err := p.StartDeploy(context.Background(), tray) + require.Error(t, err) + assert.Contains(t, err.Error(), "missing jobId") + // No HTTP call should have happened. + assert.Equal(t, 0, fake.dispatchCount) + // And no staged keys, since we bailed before staging. + assert.Empty(t, tray.ProviderData[nomadProviderDataParentJobID]) +} + +func TestStartDeploy_DispatchError_StillStagesRecoveryKeys(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{ + JobId: "cattery-runner-tray", + }, "https://cattery.test") + fake, p := startFake(t, "ci") + + fake.onDispatch = func(jobID string, req *api.JobDispatchRequest, q url.Values) (*api.JobDispatchResponse, int) { + // Simulate a server-side failure. 
The provider's error path must + // still leave parentJobId + namespace staged on ProviderData so + // CleanTray can recover. + return &api.JobDispatchResponse{}, http.StatusInternalServerError + } + + tray := newTestTray("tt", "trayid-003") + err := p.StartDeploy(context.Background(), tray) + require.Error(t, err) + + assert.Equal(t, "ci", tray.ProviderData[nomadProviderDataNamespace], + "namespace must be staged before dispatch so CleanTray can recover a leaked child") + assert.Equal(t, "cattery-runner-tray", tray.ProviderData[nomadProviderDataParentJobID], + "parentJobId must be staged before dispatch") + assert.Empty(t, tray.ProviderData[nomadProviderDataDispatchedJobID], + "dispatchedJobId must NOT be set when dispatch failed") +} + +func TestStartDeploy_WrongTrayConfigType(t *testing.T) { + // Install a Docker config under the same tray type name. The provider + // must reject it instead of dispatching garbage. + cfg := &config.CatteryConfig{ + Server: config.ServerConfig{ListenAddress: ":0", AdvertiseUrl: "http://x"}, + TrayTypes: []*config.TrayType{ + { + Name: "tt", + Provider: "test-nomad", + GitHubOrg: "org", + RunnerGroupId: 1, + Config: config.DockerTrayConfig{Image: "wrong"}, + }, + }, + } + config.SetForTest(t, cfg) + fake, p := startFake(t, "ci") + + tray := newTestTray("tt", "trayid-004") + err := p.StartDeploy(context.Background(), tray) + require.Error(t, err) + assert.Contains(t, err.Error(), "unexpected tray config type") + assert.Equal(t, 0, fake.dispatchCount) +} + +func TestStartDeploy_ExtraMetadataCannotClobberContractKeys(t *testing.T) { + cfg := &config.CatteryConfig{ + Server: config.ServerConfig{ListenAddress: ":0", AdvertiseUrl: "https://cattery.test"}, + TrayTypes: []*config.TrayType{ + { + Name: "tt", + Provider: "test-nomad", + GitHubOrg: "org", + RunnerGroupId: 1, + Config: config.NomadTrayConfig{JobId: "cattery-runner-tray"}, + ExtraMetadata: config.TrayExtraMetadata{ + "tray_name": "ATTACKER", + "bootstrap_token": "ATTACKER", + 
"cattery_url": "https://evil.example", + "foo": "bar", + }, + }, + }, + } + config.SetForTest(t, cfg) + fake, p := startFake(t, "ci") + + var seenMeta map[string]string + fake.onDispatch = func(jobID string, req *api.JobDispatchRequest, q url.Values) (*api.JobDispatchResponse, int) { + seenMeta = req.Meta + return &api.JobDispatchResponse{DispatchedJobID: "x", EvalID: "y"}, http.StatusOK + } + + tray := newTestTray("tt", "trayid-005") + err := p.StartDeploy(context.Background(), tray) + require.NoError(t, err) + + assert.Equal(t, "trayid-005", seenMeta["tray_name"], "provider must overwrite operator-supplied tray_name") + assert.NotEqual(t, "ATTACKER", seenMeta["bootstrap_token"], "bootstrap_token must be the freshly generated one") + assert.Equal(t, "https://cattery.test", seenMeta["cattery_url"], "cattery_url must be the server's advertised URL") + assert.Equal(t, "bar", seenMeta["foo"], "non-contract extraMetadata keys must pass through") +} + +// --------------------------------------------------------------------------- +// WaitDeploy +// --------------------------------------------------------------------------- + +func TestWaitDeploy_Complete(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onEvalInfo = func(evalID string, q url.Values) (*api.Evaluation, int) { + assert.Equal(t, "eval-1", evalID) + return &api.Evaluation{ID: "eval-1", Status: "complete"}, http.StatusOK + } + + tray := newTestTray("tt", "t") + tray.ProviderData[nomadProviderDataEvalID] = "eval-1" + assert.NoError(t, p.WaitDeploy(context.Background(), tray)) +} + +func TestWaitDeploy_Blocked_ReturnsCapacitySentinel(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onEvalInfo = func(evalID string, q url.Values) (*api.Evaluation, int) { + return &api.Evaluation{ + ID: "eval-1", + Status: "blocked", + FailedTGAllocs: 
map[string]*api.AllocationMetric{ + "vm": {NodesEvaluated: 0, NodesExhausted: 0, ConstraintFiltered: map[string]int{"meta.runner_host=cattery": 3}}, + }, + }, http.StatusOK + } + + tray := newTestTray("tt", "t") + tray.ProviderData[nomadProviderDataEvalID] = "eval-1" + err := p.WaitDeploy(context.Background(), tray) + require.Error(t, err) + assert.True(t, errors.Is(err, ErrCapacityBlocked), "blocked eval must produce ErrCapacityBlocked") + assert.Contains(t, err.Error(), "constraintFiltered=1", "wrapped reason must contain the formatted metric") +} + +func TestWaitDeploy_Failed(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onEvalInfo = func(evalID string, q url.Values) (*api.Evaluation, int) { + return &api.Evaluation{ID: "eval-1", Status: "failed", StatusDescription: "bad"}, http.StatusOK + } + + tray := newTestTray("tt", "t") + tray.ProviderData[nomadProviderDataEvalID] = "eval-1" + err := p.WaitDeploy(context.Background(), tray) + require.Error(t, err) + assert.False(t, errors.Is(err, ErrCapacityBlocked), "failed != blocked") + assert.Contains(t, err.Error(), "failed") +} + +func TestWaitDeploy_NoEvalIDIsNoOp(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + // No handler installed. If the provider hits the API, the default + // 501 handler will surface the bug as an error. 
+ tray := newTestTray("tt", "t") // no eval id staged + assert.NoError(t, p.WaitDeploy(context.Background(), tray)) + assert.Equal(t, 0, fake.dispatchCount) +} + +// --------------------------------------------------------------------------- +// CleanTray +// --------------------------------------------------------------------------- + +func TestCleanTray_FastPath_DeregistersDispatchedJob(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + tray := newTestTray("tt", "t") + tray.ProviderData[nomadProviderDataDispatchedJobID] = "j/dispatch-t-001" + tray.ProviderData[nomadProviderDataNamespace] = "ci" + + require.NoError(t, p.CleanTray(context.Background(), tray)) + require.Len(t, fake.deregCalls, 1) + assert.Equal(t, "j/dispatch-t-001", fake.deregCalls[0]) +} + +func TestCleanTray_FastPath_404IsSwallowed(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onDeregister = func(jobID string, q url.Values) int { + // Nomad signals "already gone" via 404; provider must not surface it. + return http.StatusNotFound + } + + tray := newTestTray("tt", "t") + tray.ProviderData[nomadProviderDataDispatchedJobID] = "j/dispatch-t-001" + + assert.NoError(t, p.CleanTray(context.Background(), tray)) +} + +func TestCleanTray_LeakedChildScan_FindsAndDeregistersByPrefix(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onJobsList = func(q url.Values) ([]*api.JobListStub, int) { + // Provider asks for the parent's dispatch prefix. + assert.Equal(t, "parent-job/dispatch-", q.Get("prefix")) + return []*api.JobListStub{ + // Match: correct parent + correct tray prefix. + {ID: "parent-job/dispatch-tray-leak-001-1700000000-aaaa", ParentID: "parent-job"}, + // Same parent, different tray — must be ignored. 
+ {ID: "parent-job/dispatch-some-other-tray-1700000000-bbbb", ParentID: "parent-job"}, + // Right-shaped ID but different ParentID — ignored. + {ID: "parent-job/dispatch-tray-leak-001-x", ParentID: "different-parent"}, + }, http.StatusOK + } + + tray := newTestTray("tt", "tray-leak-001") + // dispatchedJobId intentionally absent — this is the leaked-child case. + tray.ProviderData[nomadProviderDataParentJobID] = "parent-job" + tray.ProviderData[nomadProviderDataNamespace] = "ci" + + require.NoError(t, p.CleanTray(context.Background(), tray)) + require.Len(t, fake.deregCalls, 1, "exactly the matching leaked child must be deregistered") + assert.Equal(t, "parent-job/dispatch-tray-leak-001-1700000000-aaaa", fake.deregCalls[0]) +} + +func TestCleanTray_LeakedChildScan_NoMatchIsNoOp(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + fake.onJobsList = func(q url.Values) ([]*api.JobListStub, int) { + return []*api.JobListStub{}, http.StatusOK + } + + tray := newTestTray("tt", "tray-leak-002") + tray.ProviderData[nomadProviderDataParentJobID] = "parent-job" + tray.ProviderData[nomadProviderDataNamespace] = "ci" + + require.NoError(t, p.CleanTray(context.Background(), tray)) + assert.Empty(t, fake.deregCalls) +} + +func TestCleanTray_NoIdsRecorded_IsNoOp(t *testing.T) { + setupTrayConfig(t, "tt", config.NomadTrayConfig{JobId: "j"}, "http://x") + fake, p := startFake(t, "ci") + + tray := newTestTray("tt", "t") // empty ProviderData + + require.NoError(t, p.CleanTray(context.Background(), tray)) + assert.Empty(t, fake.deregCalls) +} diff --git a/src/lib/trays/providers/nomadProvider_test.go b/src/lib/trays/providers/nomadProvider_test.go new file mode 100644 index 0000000..89754c5 --- /dev/null +++ b/src/lib/trays/providers/nomadProvider_test.go @@ -0,0 +1,236 @@ +package providers + +import ( + "cattery/lib/config" + "encoding/hex" + "errors" + "fmt" + "strings" + "testing" + + 
"github.com/hashicorp/nomad/api" + "github.com/stretchr/testify/assert" +) + +func TestGenerateBootstrapToken(t *testing.T) { + t.Run("returns 64 hex chars", func(t *testing.T) { + tok, err := generateBootstrapToken() + assert.NoError(t, err) + assert.Len(t, tok, 64) + _, err = hex.DecodeString(tok) + assert.NoError(t, err, "token must be valid hex") + }) + + t.Run("two calls produce distinct tokens", func(t *testing.T) { + a, _ := generateBootstrapToken() + b, _ := generateBootstrapToken() + assert.NotEqual(t, a, b) + }) +} + +func TestBuildBootstrapPayload(t *testing.T) { + t.Run("empty runnerFolder defaults to /cattery", func(t *testing.T) { + out := string(buildBootstrapPayload("", "")) + assert.Contains(t, out, `--runner-folder "/cattery"`) + }) + + t.Run("custom runnerFolder is quoted with %q semantics", func(t *testing.T) { + out := string(buildBootstrapPayload("", "/opt/runner")) + assert.Contains(t, out, `--runner-folder "/opt/runner"`) + }) + + t.Run("prelude downloads cattery agent and chmods it", func(t *testing.T) { + out := string(buildBootstrapPayload("", "")) + assert.Contains(t, out, `curl -fsSL "$CATTERY_URL/agent/download" -o /usr/local/bin/cattery`) + assert.Contains(t, out, "chmod +x /usr/local/bin/cattery") + }) + + t.Run("starts with shebang and strict mode", func(t *testing.T) { + out := string(buildBootstrapPayload("", "")) + assert.True(t, strings.HasPrefix(out, "#!/bin/bash\nset -euo pipefail\n"), + "output must start with shebang and `set -euo pipefail`, got: %q", out[:min(64, len(out))]) + }) + + t.Run("user script is spliced between prelude and exec", func(t *testing.T) { + out := string(buildBootstrapPayload("echo MARKER", "/cattery")) + + preludeIdx := strings.Index(out, "chmod +x /usr/local/bin/cattery") + userIdx := strings.Index(out, "echo MARKER") + execIdx := strings.Index(out, "exec /usr/local/bin/cattery agent") + + assert.NotEqual(t, -1, preludeIdx) + assert.NotEqual(t, -1, userIdx) + assert.NotEqual(t, -1, execIdx) + 
assert.Less(t, preludeIdx, userIdx, "prelude must come before user script") + assert.Less(t, userIdx, execIdx, "user script must come before exec") + }) + + t.Run("user script without trailing newline is normalized", func(t *testing.T) { + // no trailing newline + out := string(buildBootstrapPayload("echo a", "")) + // The user line and the exec line must be on separate lines. + assert.NotContains(t, out, "echo aexec") + assert.Contains(t, out, "echo a\n") + }) + + t.Run("user script with trailing newline doesn't gain a duplicate", func(t *testing.T) { + out := string(buildBootstrapPayload("echo a\n", "")) + // One blank line between user script and exec is fine; three+ would be a bug. + assert.NotContains(t, out, "echo a\n\n\n\n") + }) + + t.Run("multi-line user script preserved verbatim", func(t *testing.T) { + userScript := "set -e\necho one\necho two" + out := string(buildBootstrapPayload(userScript, "")) + assert.Contains(t, out, userScript) + }) + + t.Run("empty user script produces no extra block", func(t *testing.T) { + out := string(buildBootstrapPayload("", "")) + // chmod line followed by blank line then exec, no leftover user-script artefacts + assert.NotContains(t, out, "echo") + }) + + t.Run("exec line uses TRAY_NAME and CATTERY_URL env vars", func(t *testing.T) { + out := string(buildBootstrapPayload("", "")) + assert.Contains(t, out, `exec /usr/local/bin/cattery agent -i "$TRAY_NAME" -s "$CATTERY_URL"`) + }) +} + +func TestFormatBlockedReason(t *testing.T) { + t.Run("empty FailedTGAllocs falls back to StatusDescription", func(t *testing.T) { + eval := &api.Evaluation{StatusDescription: "no nodes available"} + assert.Equal(t, "no nodes available", formatBlockedReason(eval)) + }) + + t.Run("nil metric in map is skipped", func(t *testing.T) { + eval := &api.Evaluation{ + FailedTGAllocs: map[string]*api.AllocationMetric{"vm": nil}, + } + // Skipping the nil leaves an empty join — acceptable, don't panic. 
+ assert.NotPanics(t, func() { formatBlockedReason(eval) }) + }) + + t.Run("single group renders all counters", func(t *testing.T) { + eval := &api.Evaluation{ + FailedTGAllocs: map[string]*api.AllocationMetric{ + "vm": { + NodesEvaluated: 5, + NodesFiltered: 2, + NodesExhausted: 1, + ClassFiltered: map[string]int{"runner-host": 1}, + ConstraintFiltered: map[string]int{"meta.runner_host=cattery": 2}, + }, + }, + } + out := formatBlockedReason(eval) + assert.Contains(t, out, "vm: ") + assert.Contains(t, out, "nodesEvaluated=5") + assert.Contains(t, out, "nodesFiltered=2") + assert.Contains(t, out, "nodesExhausted=1") + assert.Contains(t, out, "classFiltered=1") + assert.Contains(t, out, "constraintFiltered=1") + }) + + t.Run("multiple groups joined by semicolons", func(t *testing.T) { + eval := &api.Evaluation{ + FailedTGAllocs: map[string]*api.AllocationMetric{ + "vm-a": {NodesEvaluated: 1}, + "vm-b": {NodesEvaluated: 2}, + }, + } + out := formatBlockedReason(eval) + assert.Contains(t, out, "; ") + // Don't assert ordering — map iteration is non-deterministic. 
+ assert.Contains(t, out, "vm-a:") + assert.Contains(t, out, "vm-b:") + }) +} + +func TestIsNomad404(t *testing.T) { + cases := []struct { + name string + err error + want bool + }{ + {"nil error", nil, false}, + {"contains lowercase 'not found'", errors.New("job not found"), true}, + {"contains uppercase 'Not Found' (case-insensitive)", errors.New("Job Not Found"), true}, + {"contains '404'", errors.New("server returned 404"), true}, + {"unrelated error", errors.New("connection refused"), false}, + {"wrapped 404 error", fmt.Errorf("dispatch failed: %w", errors.New("404 page not found")), true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, isNomad404(tc.err)) + }) + } +} + +func TestNewNomadProvider(t *testing.T) { + t.Run("missing address returns nil", func(t *testing.T) { + p := NewNomadProvider("toaster", config.ProviderConfig{ + "name": "toaster", + "type": "nomad", + }) + assert.Nil(t, p) + }) + + t.Run("minimal valid config builds a provider", func(t *testing.T) { + p := NewNomadProvider("toaster", config.ProviderConfig{ + "name": "toaster", + "type": "nomad", + "address": "https://example.invalid:4646", + }) + if assert.NotNil(t, p) { + assert.Equal(t, "toaster", p.GetProviderName()) + assert.Empty(t, p.namespace, "namespace stays empty when not configured") + } + }) + + t.Run("namespace is captured on the provider", func(t *testing.T) { + p := NewNomadProvider("toaster", config.ProviderConfig{ + "name": "toaster", + "type": "nomad", + "address": "https://example.invalid:4646", + "namespace": "ci", + }) + if assert.NotNil(t, p) { + assert.Equal(t, "ci", p.namespace) + } + }) + + t.Run("insecure=true is parsed case-insensitively", func(t *testing.T) { + // Just verifying the constructor accepts the value without erroring. + // The TLS config lives inside an unexported nomad client field, so + // we don't introspect it — that would need a real TLS handshake, + // which this test does not attempt (it only asserts a non-nil provider). 
+ for _, v := range []string{"true", "TRUE", "True"} { + p := NewNomadProvider("toaster", config.ProviderConfig{ + "name": "toaster", + "type": "nomad", + "address": "https://example.invalid:4646", + "insecure": v, + }) + assert.NotNil(t, p, "insecure=%q should be accepted", v) + } + }) +} + +func TestGetProviderName(t *testing.T) { + p := NewNomadProvider("my-nomad", config.ProviderConfig{ + "name": "my-nomad", + "type": "nomad", + "address": "https://example.invalid:4646", + }) + if assert.NotNil(t, p) { + assert.Equal(t, "my-nomad", p.GetProviderName()) + } +} + +func min(a, b int) int { + if a < b { + return a + } + return b +}