diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 62121769..ccb27af5 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -153,7 +153,11 @@ func (m *Manager) Run() { wg := new(sync.WaitGroup) processingCh := make(chan struct{}, flags.LintersLimit) + recordNames := collectRecordingRuleNames(m.Modules) + for _, module := range m.Modules { + module.SetRecordingRuleNames(recordNames) + processingCh <- struct{}{} wg.Add(1) @@ -195,6 +199,69 @@ func getLintersForModule(cfg *pkg.LintersSettings, errList *errors.LintRuleError } } +func collectRecordingRuleNames(modules []*module.Module) map[string]struct{} { + names := make(map[string]struct{}) + + for _, m := range modules { + for _, obj := range m.GetStorage() { + if obj.Unstructured.GetKind() != "PrometheusRule" { + continue + } + + ispec, ok := obj.Unstructured.Object["spec"] + if !ok { + continue + } + + spec, ok := ispec.(map[string]any) + if !ok { + continue + } + + igroups, ok := spec["groups"] + if !ok { + continue + } + + groups, ok := igroups.([]any) + if !ok { + continue + } + + for _, ig := range groups { + group, ok := ig.(map[string]any) + if !ok { + continue + } + + irules, ok := group["rules"] + if !ok { + continue + } + + rules, ok := irules.([]any) + if !ok { + continue + } + + for _, ir := range rules { + rule, ok := ir.(map[string]any) + if !ok { + continue + } + + record, ok := rule["record"].(string) + if ok && record != "" { + names[record] = struct{}{} + } + } + } + } + } + + return names +} + func (m *Manager) PrintResult() { errs := m.errors.GetErrors() diff --git a/internal/module/module.go b/internal/module/module.go index 188dd86e..83aabc08 100644 --- a/internal/module/module.go +++ b/internal/module/module.go @@ -146,6 +146,14 @@ func (m *Module) GetModuleConfig() *pkg.LintersSettings { return m.linterConfig } +func (m *Module) SetRecordingRuleNames(names map[string]struct{}) { + if m == nil || m.linterConfig == nil { + return + } + + m.linterConfig.Templates.SourceLabelSettings.RecordingRuleNames = names +} + // remapLinterSettings converts configuration settings from the config package format // to the pkg package format, mapping both rule-level configurations and exclusion rules // across all linter domains (Container, Image, NoCyrillic, OpenAPI, Templates, RBAC, Hooks, Module). @@ -342,6 +350,7 @@ func mapTemplatesRules(linterSettings *pkg.LintersSettings, configSettings *conf rules.ClusterDomainRule.SetLevel(globalRules.ClusterDomainRule.Impact, fallbackImpact) rules.RegistryRule.SetLevel(globalRules.RegistryRule.Impact, fallbackImpact) rules.EnabledModulesRule.SetLevel(globalRules.EnabledModulesRule.Impact, fallbackImpact) + rules.SourceLabelRule.SetLevel(globalRules.SourceLabelRule.Impact, fallbackImpact) } // mapOpenAPIRules configures OpenAPI linter rules @@ -459,6 +468,7 @@ func mapTemplatesExclusionsAndSettings(linterSettings *pkg.LintersSettings, conf // Additional settings linterSettings.Templates.PrometheusRuleSettings.Disable = configSettings.Templates.PrometheusRules.Disable linterSettings.Templates.GrafanaDashboardsSettings.Disable = configSettings.Templates.GrafanaDashboards.Disable + linterSettings.Templates.SourceLabelSettings.AllowedMetrics = configSettings.Templates.SourceLabel.AllowedMetrics } // mapRBACExclusions maps RBAC linter exclusion rules diff --git a/internal/values/global-openapi/values.yaml b/internal/values/global-openapi/values.yaml index c3e810c3..68f7e430 100644 --- a/internal/values/global-openapi/values.yaml +++ b/internal/values/global-openapi/values.yaml @@ -276,10 +276,10 @@ properties: items: type: string x-examples: - - ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus"] + - ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus-crd"] - ["cert-manager", "prometheus", "priority-class"] x-dmt-default: - ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus"] + ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus-crd"] discovery: additionalProperties: true type: object diff --git a/pkg/config.go b/pkg/config.go index bf828295..097c7300 100644 --- a/pkg/config.go +++ b/pkg/config.go @@ -129,6 +129,12 @@ type TemplatesLinterConfig struct { ExcludeRules TemplatesExcludeRules PrometheusRuleSettings PrometheusRuleSettings GrafanaDashboardsSettings GrafanaDashboardsSettings + SourceLabelSettings SourceLabelSettings +} + +type SourceLabelSettings struct { + AllowedMetrics []string + RecordingRuleNames map[string]struct{} } type TemplatesLinterRules struct { VPARule RuleConfig @@ -141,6 +147,7 @@ type TemplatesLinterRules struct { ClusterDomainRule RuleConfig RegistryRule RuleConfig EnabledModulesRule RuleConfig + SourceLabelRule RuleConfig } type PrometheusRuleSettings struct { diff --git a/pkg/config/global/global.go b/pkg/config/global/global.go index d6c969a9..a9fda869 100644 --- a/pkg/config/global/global.go +++ b/pkg/config/global/global.go @@ -139,6 +139,7 @@ type TemplatesLinterRules struct { ClusterDomainRule RuleConfig `mapstructure:"cluster-domain"` RegistryRule RuleConfig `mapstructure:"registry"` EnabledModulesRule RuleConfig `mapstructure:"enabled-modules"` + SourceLabelRule RuleConfig `mapstructure:"source-label"` } func (c LinterConfig) IsWarn() bool { diff --git a/pkg/config/linters_settings.go b/pkg/config/linters_settings.go index bd60f17d..dfa1792f 100644 --- a/pkg/config/linters_settings.go +++ b/pkg/config/linters_settings.go @@ -189,11 +189,16 @@ type TemplatesSettings struct { ExcludeRules TemplatesExcludeRules `mapstructure:"exclude-rules"` GrafanaDashboards GrafanaDashboardsExcludeList `mapstructure:"grafana-dashboards"` PrometheusRules PrometheusRulesExcludeList `mapstructure:"prometheus-rules"` + SourceLabel SourceLabelSettings `mapstructure:"source-label"` Rules TemplatesLinterRules `mapstructure:"rules"` Impact string `mapstructure:"impact"` } +type SourceLabelSettings struct { + AllowedMetrics []string `mapstructure:"allowed-metrics"` +} + type TemplatesLinterRules struct { VPARule RuleConfig `mapstructure:"vpa"` PDBRule RuleConfig `mapstructure:"pdb"` @@ -205,6 +210,7 @@ type TemplatesLinterRules struct { ClusterDomainRule RuleConfig `mapstructure:"cluster-domain"` RegistryRule RuleConfig `mapstructure:"registry"` EnabledModulesRule RuleConfig `mapstructure:"enabled-modules"` + SourceLabelRule RuleConfig `mapstructure:"source-label"` } type TemplatesExcludeRules struct { diff --git a/pkg/linters/templates/README.md b/pkg/linters/templates/README.md index f1e4c27e..9be6049e 100644 --- a/pkg/linters/templates/README.md +++ b/pkg/linters/templates/README.md @@ -17,6 +17,7 @@ Proper template validation prevents runtime issues, ensures applications are pro | [ingress-rules](#ingress-rules) | Validates Ingress configuration snippets | ✅ | enabled | | [prometheus-rules](#prometheus-rules) | Validates Prometheus rules with promtool and proper templates | ✅ | enabled | | [grafana-dashboards](#grafana-dashboards) | Validates Grafana dashboard templates | ✅ | enabled | +| [source-label](#source-label) | Requires `source="deckhouse"` selector on metrics in Prometheus rules and Grafana dashboards | ✅ | enabled | | [cluster-domain](#cluster-domain) | Validates cluster domain configuration is dynamic | ❌ | enabled | | [registry](#registry) | Validates registry secret configuration | ❌ | enabled | | [werf](#werf) | Validates image names in `werf.yaml` do not contain underscores | ❌ | enabled | @@ -1284,6 +1285,85 @@ linters-settings: --- +### source-label + +**Purpose:** Ensures every Deckhouse-owned metric referenced in PromQL expressions (in PrometheusRule objects and Grafana dashboards) is selected with an explicit `source="deckhouse"` label matcher, isolating our metrics from foreign metrics that share the same name. + +**Description:** + +The rule parses the PromQL expressions of alerting/recording rules and dashboard panel/template queries, and for every vector selector over a Deckhouse metric it requires a `source="deckhouse"` matcher (or a `source=$...` Grafana/templating variable that resolves to it). + +**Why it matters:** + +User can ran their own exporter that exposed a metric with the **same name** as one of ours. Because both time series shared the metric name, the metric was effectively duplicated, the rule expression returned ambiguous data, and it failed to evaluate. Pinning `source="deckhouse"` separates our metrics from foreign ones with colliding names, so a query only matches our own time series and evaluates reliably. + +**What it checks:** + +1. `PrometheusRule` objects — the `expr` of every alerting and recording rule +2. Grafana dashboards under `monitoring/grafana-dashboards` — `expr` of panel targets and `query`/`definition` of query template variables (only Prometheus datasources) +3. Each Deckhouse metric selector contains `source="deckhouse"` (or `source=$`) + +**Exemptions (no source selector required):** + +- Metrics produced by recording rules within the module (collected automatically at runtime) +- Metrics matching the per-module `allowed-metrics` globs (for intentionally foreign metrics, e.g. third-party exporters) +- Prometheus synthetic metrics: `ALERTS`, `ALERTS_FOR_STATE` + +**Examples:** + +❌ **Incorrect** - Metric without source selector: + +```yaml +# templates/monitoring.yaml -> PrometheusRule +- alert: TestAlertMissingSource + expr: up{job="node-exporter"} == 0 + for: 5m + labels: + severity_level: "5" +``` + +**Error:** +``` +Error: metric 'up' in rule 'TestAlertMissingSource' (group 'e2e.source-label') must have source="deckhouse" selector +``` + +✅ **Correct** - Metric with source selector: + +```yaml +- alert: TestAlertWithSource + expr: up{job="node-exporter", source="deckhouse"} == 0 + for: 5m + labels: + severity_level: "5" +``` + +✅ **Correct** - Grafana panel query using a source variable: + +```json +{ + "title": "Requests", + "targets": [ + { "expr": "rate(my_module_requests_total{source=\"deckhouse\"}[$__rate_interval])" } + ] +} +``` + +**Configuration:** + +The rule does not use `exclude-rules`. Instead, use `allowed-metrics` to list metric names (glob patterns with `*` and `?` are supported) that are allowed to appear without a source selector: + +```yaml +# .dmt.yaml +linters-settings: + templates: + source-label: + allowed-metrics: + - "rabbitmq_*" # All metrics from a third-party exporter + - "node_exporter_build_info" +``` + +--- + ### cluster-domain **Purpose:** Prevents hardcoding of the cluster domain (`cluster.local`) in templates. Ensures cluster domain is configurable to support custom cluster configurations and multi-cluster deployments. @@ -1663,6 +1743,11 @@ linters-settings: prometheus-rules: disable: true + + # Allow specific (foreign) metrics without a source="deckhouse" selector + source-label: + allowed-metrics: + - "rabbitmq_*" ``` ### Per-Rule Impact Levels @@ -1694,6 +1779,8 @@ linters-settings: impact: error enabled-modules: impact: warning + source-label: + impact: error ``` ### Rule-Level Exclusions diff --git a/pkg/linters/templates/rules/source_label.go b/pkg/linters/templates/rules/source_label.go new file mode 100644 index 00000000..237709f7 --- /dev/null +++ b/pkg/linters/templates/rules/source_label.go @@ -0,0 +1,422 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rules + +import ( + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/promql/parser" + "github.com/tidwall/gjson" + "sigs.k8s.io/yaml" + + "github.com/deckhouse/dmt/internal/fsutils" + "github.com/deckhouse/dmt/internal/storage" + "github.com/deckhouse/dmt/pkg" + "github.com/deckhouse/dmt/pkg/errors" +) + +const ( + SourceLabelRuleName = "source-label" +) + +// prometheusSyntheticMetrics contains Prometheus built-in metrics that are generated +// internally by the engine and never receive scrape-time labels like source="deckhouse". +// This set is intentionally separate from allowedMetrics (per-module config) and +// recordingRuleNames (computed at runtime) because these metrics are universally +// synthetic regardless of module or deployment. +var prometheusSyntheticMetrics = map[string]struct{}{ + "ALERTS": {}, + "ALERTS_FOR_STATE": {}, +} + +// SourceLabelRule enforces that every Deckhouse-owned metric referenced in +// PromQL expressions (PrometheusRule objects and Grafana dashboards) is selected +// with an explicit source="deckhouse" label matcher. +// +// The following metrics are exempt from the check: +// - metrics produced by recording rules within the module (recordingRuleNames), +// since those are computed by us and already scoped; +// - metrics matching the per-module allowed-metrics globs (allowedMetrics), +// used for intentionally foreign metrics such as third-party exporters; +// - Prometheus synthetic metrics (ALERTS, ALERTS_FOR_STATE) that never carry +// scrape-time labels. +type SourceLabelRule struct { + pkg.RuleMeta + recordingRuleNames map[string]struct{} + allowedMetrics []*regexp.Regexp +} + +// globToRegexp converts a simple glob pattern (supporting * and ?) to a regexp. +// Plain strings without wildcards are compiled as ^exact_name$, behaving like exact match. +func globToRegexp(pattern string) (*regexp.Regexp, error) { + var b strings.Builder + b.WriteString("^") + + for _, ch := range pattern { + switch ch { + case '*': + b.WriteString(".*") + case '?': + b.WriteString(".") + default: + b.WriteString(regexp.QuoteMeta(string(ch))) + } + } + + b.WriteString("$") + + return regexp.Compile(b.String()) +} + +// NewSourceLabelRule builds the rule from the templates linter config. The +// allowed-metrics patterns (globs supporting * and ?) are compiled into regexps, +// and the runtime-collected recording rule names are stored so that metrics they +// produce are not required to carry a source selector. +func NewSourceLabelRule(cfg *pkg.TemplatesLinterConfig) *SourceLabelRule { + var allowedMetrics []*regexp.Regexp + + recordNames := make(map[string]struct{}) + + if cfg != nil { + for _, m := range cfg.SourceLabelSettings.AllowedMetrics { + if re, err := globToRegexp(m); err == nil { + allowedMetrics = append(allowedMetrics, re) + } + } + + if cfg.SourceLabelSettings.RecordingRuleNames != nil { + recordNames = cfg.SourceLabelSettings.RecordingRuleNames + } + } + + return &SourceLabelRule{ + RuleMeta: pkg.RuleMeta{ + Name: SourceLabelRuleName, + }, + recordingRuleNames: recordNames, + allowedMetrics: allowedMetrics, + } +} + +// SourceLabelCheck inspects a single PrometheusRule object and verifies that the +// PromQL expressions of all its alerting and recording rules select Deckhouse +// metrics with a source="deckhouse" matcher. Non-PrometheusRule objects are +// ignored. +func (r *SourceLabelRule) SourceLabelCheck(m pkg.Module, object storage.StoreObject, errorList *errors.LintRuleErrorsList) { + errorList = errorList.WithFilePath(m.GetPath()).WithRule(r.GetName()) + + if object.Unstructured.GetKind() != "PrometheusRule" { + return + } + + ispec, ok := object.Unstructured.Object["spec"] + if !ok { + return + } + + spec, ok := ispec.(map[string]any) + if !ok { + return + } + + specBytes, err := yaml.Marshal(spec) + if err != nil { + return + } + + type rule struct { + Record string `yaml:"record,omitempty"` + Alert string `yaml:"alert,omitempty"` + Expr string `yaml:"expr"` + } + + type ruleGroup struct { + Name string `yaml:"name"` + Rules []rule `yaml:"rules"` + } + + type ruleGroups struct { + Groups []ruleGroup `yaml:"groups"` + } + + var groups ruleGroups + if err := yaml.Unmarshal(specBytes, &groups); err != nil { + return + } + + for _, group := range groups.Groups { + for _, rl := range group.Rules { + if rl.Expr == "" { + continue + } + + ruleName := rl.Alert + if ruleName == "" { + ruleName = rl.Record + } + + r.checkExpr(rl.Expr, fmt.Sprintf("rule '%s' (group '%s')", ruleName, group.Name), object.GetPath(), errorList) + } + } +} + +// checkExpr parses a PromQL expression and reports every vector selector over a +// Deckhouse metric that lacks a source="deckhouse" matcher. Exempt metrics +// (recording rule outputs, allowed-metrics, synthetic metrics and placeholder +// names produced by Grafana variable sanitization) are skipped. +func (r *SourceLabelRule) checkExpr(expr, context, filePath string, errorList *errors.LintRuleErrorsList) { + ast, err := parser.ParseExpr(expr) + if err != nil { + return + } + + parser.Inspect(ast, func(node parser.Node, _ []parser.Node) error { + vs, ok := node.(*parser.VectorSelector) + if !ok { + return nil + } + + metricName := vs.Name + if metricName == "" { + for _, m := range vs.LabelMatchers { + if m.Name == labels.MetricName && m.Type == labels.MatchEqual { + metricName = m.Value + break + } + } + } + + if metricName == "" || strings.Contains(metricName, "__placeholder__") { + return nil + } + + if _, ok := r.recordingRuleNames[metricName]; ok { + return nil + } + + if r.isAllowedMetric(metricName) { + return nil + } + + if _, ok := prometheusSyntheticMetrics[metricName]; ok { + return nil + } + + hasSourceLabel := false + + for _, m := range vs.LabelMatchers { + if m.Name == "source" && m.Type == labels.MatchEqual && + (m.Value == "deckhouse" || strings.HasPrefix(m.Value, "$")) { + hasSourceLabel = true + break + } + } + + if !hasSourceLabel { + errorList.WithFilePath(filePath). + Errorf("metric '%s' in %s must have source=\"deckhouse\" selector", + metricName, context) + } + + return nil + }) +} + +func (r *SourceLabelRule) isAllowedMetric(metricName string) bool { + for _, re := range r.allowedMetrics { + if re.MatchString(metricName) { + return true + } + } + + return false +} + +var ( + grafanaBuiltinVarRe = regexp.MustCompile(`\$__\w+`) + grafanaVarBracesRe = regexp.MustCompile(`\$\{(\w+)(?::[^}]*)?\}`) + grafanaVarSimpleRe = regexp.MustCompile(`\$([a-zA-Z_]\w*)`) +) + +// sanitizeGrafanaExpr makes a Grafana panel/template expression parseable as +// plain PromQL by replacing Grafana variables with neutral placeholders: +// built-in variables (e.g. $__rate_interval) become a dummy duration, while +// other variables become "__placeholder__" so they are ignored by checkExpr. +// The "source" variable is preserved so a source=$source matcher still counts +// as an explicit source selector. +func sanitizeGrafanaExpr(expr string) string { + result := grafanaBuiltinVarRe.ReplaceAllString(expr, "5m") + result = grafanaVarBracesRe.ReplaceAllStringFunc(result, func(match string) string { + sub := grafanaVarBracesRe.FindStringSubmatch(match) + if len(sub) > 1 && sub[1] == "source" { + return match + } + + return "__placeholder__" + }) + result = grafanaVarSimpleRe.ReplaceAllStringFunc(result, func(match string) string { + name := match[1:] + if name == "source" { + return match + } + + return "__placeholder__" + }) + + return result +} + +// SourceLabelCheckDashboards walks every Grafana dashboard file under +// monitoring/grafana-dashboards and verifies that the PromQL queries in their +// panels and template variables select Deckhouse metrics with a +// source="deckhouse" matcher. +func (r *SourceLabelRule) SourceLabelCheckDashboards(m pkg.Module, errorList *errors.LintRuleErrorsList) { + errorList = errorList.WithRule(r.GetName()) + + searchPath := filepath.Join(m.GetPath(), "monitoring", "grafana-dashboards") + entries := fsutils.GetFiles(searchPath, true, fsutils.FilterFileByExtensions(".json", ".tpl")) + + for _, entry := range entries { + r.checkDashboardFile(entry, errorList) + } +} + +func (r *SourceLabelRule) checkDashboardFile(filePath string, errorList *errors.LintRuleErrorsList) { + content, err := os.ReadFile(filePath) + if err != nil { + errorList.WithFilePath(filePath).Errorf("failed to read dashboard file: %s", err) + return + } + + dashboard := gjson.ParseBytes(content) + if !dashboard.IsObject() { + return + } + + panels := r.extractDashboardPanels(&dashboard) + for _, panel := range panels { + r.checkPanel(&panel, filePath, errorList) + } + + r.checkTemplateVariables(&dashboard, filePath, errorList) +} + +func isPrometheusDataSource(obj *gjson.Result) bool { + ds := obj.Get("datasource") + if !ds.Exists() { + return true + } + + if ds.Type == gjson.String { + return strings.Contains(strings.ToLower(ds.String()), "prometheus") + } + + dsType := ds.Get("type").String() + + return dsType == "" || dsType == "prometheus" +} + +func (r *SourceLabelRule) checkPanel(panel *gjson.Result, filePath string, errorList *errors.LintRuleErrorsList) { + if !isPrometheusDataSource(panel) { + return + } + + panelTitle := panel.Get("title").String() + if panelTitle == "" { + panelTitle = "unnamed" + } + + targets := panel.Get("targets").Array() + for _, target := range targets { + if !isPrometheusDataSource(&target) { + continue + } + + expr := target.Get("expr").String() + if expr == "" { + continue + } + + sanitized := sanitizeGrafanaExpr(expr) + r.checkExpr(sanitized, fmt.Sprintf("panel '%s'", panelTitle), filePath, errorList) + } +} + +func (r *SourceLabelRule) checkTemplateVariables(dashboard *gjson.Result, filePath string, errorList *errors.LintRuleErrorsList) { + templating := dashboard.Get("templating.list") + if !templating.Exists() || !templating.IsArray() { + return + } + + for _, tmpl := range templating.Array() { + if tmpl.Get("type").String() != "query" { + continue + } + + if !isPrometheusDataSource(&tmpl) { + continue + } + + query := tmpl.Get("definition").String() + if query == "" { + query = tmpl.Get("query").String() + } + + if query == "" { + continue + } + + sanitized := sanitizeGrafanaExpr(query) + tmplName := tmpl.Get("name").String() + r.checkExpr(sanitized, fmt.Sprintf("template variable '%s'", tmplName), filePath, errorList) + } +} + +func (r *SourceLabelRule) extractDashboardPanels(dashboard *gjson.Result) []gjson.Result { + rows := dashboard.Get("rows").Array() + directPanels := dashboard.Get("panels").Array() + panels := make([]gjson.Result, 0, len(rows)+len(directPanels)) + + for _, row := range rows { + rowPanels := row.Get("panels").Array() + panels = append(panels, rowPanels...) + } + + panels = append(panels, collectPanelsRecursive(directPanels)...) + + return panels +} + +func collectPanelsRecursive(items []gjson.Result) []gjson.Result { + result := make([]gjson.Result, 0, len(items)) + + for _, item := range items { + if item.Get("type").String() == "row" { + nested := item.Get("panels").Array() + result = append(result, collectPanelsRecursive(nested)...) + } else { + result = append(result, item) + } + } + + return result +} diff --git a/pkg/linters/templates/rules/source_label_test.go b/pkg/linters/templates/rules/source_label_test.go new file mode 100644 index 00000000..7cbcf166 --- /dev/null +++ b/pkg/linters/templates/rules/source_label_test.go @@ -0,0 +1,403 @@ +/* +Copyright 2025 Flant JSC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rules + +import ( + "regexp" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/tidwall/gjson" + + "github.com/deckhouse/dmt/pkg/errors" +) + +func TestCheckExprWithSourceLabel(t *testing.T) { + tests := []struct { + name string + expr string + recordNames map[string]struct{} + allowedMetrics []string + expectedErrors int + }{ + { + name: "metric with source=deckhouse has no errors", + expr: `kube_pod_info{source="deckhouse"}`, + expectedErrors: 0, + }, + { + name: "metric without source selector produces error", + expr: `kube_pod_info`, + expectedErrors: 1, + }, + { + name: "recording rule name is allowed without source", + expr: `my_recording_rule`, + recordNames: map[string]struct{}{"my_recording_rule": {}}, + expectedErrors: 0, + }, + { + name: "binary expr - one metric with source, one without", + expr: `a{source="deckhouse"} * on() b`, + expectedErrors: 1, + }, + { + name: "exact allowed metric does not produce error", + expr: `allowed_metric`, + allowedMetrics: []string{"allowed_metric"}, + expectedErrors: 0, + }, + { + name: "glob pattern matches metric prefix", + expr: `coredns_panics_total + coredns_dns_requests_total`, + allowedMetrics: []string{"coredns_*"}, + expectedErrors: 0, + }, + { + name: "glob pattern does not match unrelated metric", + expr: `kube_pod_info`, + allowedMetrics: []string{"coredns_*"}, + expectedErrors: 1, + }, + { + name: "glob pattern with question mark", + expr: `metric_v1_total + metric_v2_total`, + allowedMetrics: []string{"metric_v?_total"}, + expectedErrors: 0, + }, + { + name: "mixed exact and glob", + expr: `exact_metric + coredns_cache_hits_total + unknown_metric`, + allowedMetrics: []string{"exact_metric", "coredns_*"}, + expectedErrors: 1, + }, + { + name: "ALERTS synthetic metric is allowed without source", + expr: `ALERTS{alertname="SomeAlert"}`, + expectedErrors: 0, + }, + { + name: "ALERTS_FOR_STATE synthetic metric is allowed without source", + expr: `ALERTS_FOR_STATE{alertname="SomeAlert"}`, + expectedErrors: 0, + }, + { + name: "ALERTS in complex expr does not produce error", + expr: `ALERTS{alertname="KubeQuotaExceeded"} == 1 and on(namespace) kube_pod_info{source="deckhouse"}`, + expectedErrors: 0, + }, + { + name: "ALERTS without source but regular metric also without source", + expr: `ALERTS{alertname="X"} * on() some_metric`, + expectedErrors: 1, + }, + { + name: "placeholder metric name is skipped", + expr: `__placeholder__{some="label"}`, + expectedErrors: 0, + }, + { + name: "placeholder embedded in metric name is skipped", + expr: `__placeholder___requests_total{some="label"}`, + expectedErrors: 0, + }, + { + name: "selector without metric name is skipped", + expr: `{job="foo"}`, + expectedErrors: 0, + }, + { + name: "exact __name__ matcher requires source label", + expr: `{__name__="kube_pod_info"}`, + expectedErrors: 1, + }, + { + name: "exact __name__ matcher with source is ok", + expr: `{__name__="kube_pod_info", source="deckhouse"}`, + expectedErrors: 0, + }, + { + name: "regex __name__ matcher is skipped", + expr: `{__name__=~"kube_.*"}`, + expectedErrors: 0, + }, + { + name: "invalid PromQL is silently skipped", + expr: `label_values(kube_pod_info, namespace)`, + expectedErrors: 0, + }, + { + name: "expression with $source variable is accepted", + expr: `m{source="$source"}`, + expectedErrors: 0, + }, + { + name: "expression with ${source} variable is accepted", + expr: `m{source="${source}"}`, + expectedErrors: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + recordNames := tt.recordNames + if recordNames == nil { + recordNames = make(map[string]struct{}) + } + + compiled := make([]*regexp.Regexp, 0, len(tt.allowedMetrics)) + + for _, m := range tt.allowedMetrics { + re, err := globToRegexp(m) + assert.NoError(t, err) + + compiled = append(compiled, re) + } + + rule := &SourceLabelRule{ + recordingRuleNames: recordNames, + allowedMetrics: compiled, + } + + errorList := errors.NewLintRuleErrorsList() + rule.checkExpr(tt.expr, "rule 'test-rule' (group 'test-group')", "/test/file.yaml", errorList) + + assert.Len(t, errorList.GetErrors(), tt.expectedErrors) + }) + } +} + +func TestGlobToRegexp(t *testing.T) { + tests := []struct { + pattern string + match []string + noMatch []string + }{ + { + pattern: "coredns_*", + match: []string{"coredns_panics_total", "coredns_dns_requests_total", "coredns_"}, + noMatch: []string{"xcoredns_foo", "kube_pod_info", "coredns"}, + }, + { + pattern: "metric_v?_total", + match: []string{"metric_v1_total", "metric_vX_total"}, + noMatch: []string{"metric_v12_total", "metric_v_total"}, + }, + { + pattern: "exact_metric", + match: []string{"exact_metric"}, + noMatch: []string{"exact_metric_extra", "xexact_metric", "exact_metricx"}, + }, + { + pattern: "ingress_nginx_*_responses_*", + match: []string{"ingress_nginx_detail_responses_total", "ingress_nginx_overall_responses_5xx"}, + noMatch: []string{"ingress_nginx_connections", "nginx_responses_total"}, + }, + { + pattern: "*.total", + match: []string{"foo.total", "bar_baz.total"}, + noMatch: []string{"foo_total", "total"}, + }, + } + + for _, tt := range tests { + t.Run(tt.pattern, func(t *testing.T) { + re, err := globToRegexp(tt.pattern) + assert.NoError(t, err) + + for _, s := range tt.match { + assert.True(t, re.MatchString(s), "expected %q to match pattern %q", s, tt.pattern) + } + + for _, s := range tt.noMatch { + assert.False(t, re.MatchString(s), "expected %q NOT to match pattern %q", s, tt.pattern) + } + }) + } +} + +func TestIsPrometheusDataSource(t *testing.T) { + tests := []struct { + name string + json string + expected bool + }{ + { + name: "no datasource field defaults to prometheus", + json: `{"title": "panel"}`, + expected: true, + }, + { + name: "datasource type prometheus", + json: `{"datasource": {"type": "prometheus", "uid": "$ds"}}`, + expected: true, + }, + { + name: "datasource type empty defaults to prometheus", + json: `{"datasource": {"uid": "$ds"}}`, + expected: true, + }, + { + name: "datasource is plain string Graphite", + json: `{"datasource": "Graphite"}`, + expected: false, + }, + { + name: "datasource is plain string with prometheus in name", + json: `{"datasource": "$ds_prometheus"}`, + expected: true, + }, + { + name: "datasource is plain string Prometheus capitalized", + json: `{"datasource": "Prometheus"}`, + expected: true, + }, + { + name: "datasource type loki", + json: `{"datasource": {"type": "loki", "uid": "$ds_loki"}}`, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := gjson.Parse(tt.json) + assert.Equal(t, tt.expected, isPrometheusDataSource(&result)) + }) + } +} + +func TestCollectPanelsRecursive(t *testing.T) { + tests := []struct { + name string + json string + expectedTitles []string + }{ + { + name: "flat panels", + json: `[ + {"type": "graph", "title": "A"}, + {"type": "stat", "title": "B"} + ]`, + expectedTitles: []string{"A", "B"}, + }, + { + name: "one level of row nesting", + json: `[ + {"type": "row", "panels": [ + {"type": "graph", "title": "A"} + ]}, + {"type": "stat", "title": "B"} + ]`, + expectedTitles: []string{"A", "B"}, + }, + { + name: "two levels of row nesting", + json: `[ + {"type": "row", "panels": [ + {"type": "row", "panels": [ + {"type": "graph", "title": "deep"} + ]} + ]} + ]`, + expectedTitles: []string{"deep"}, + }, + { + name: "three levels of row nesting", + json: `[ + {"type": "row", "panels": [ + {"type": "row", "panels": [ + {"type": "row", "panels": [ + {"type": "graph", "title": "very-deep"} + ]} + ]} + ]} + ]`, + expectedTitles: []string{"very-deep"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + items := gjson.Parse(tt.json).Array() + panels := collectPanelsRecursive(items) + + titles := make([]string, 0, len(panels)) + for _, p := range panels { + titles = append(titles, p.Get("title").String()) + } + + assert.Equal(t, tt.expectedTitles, titles) + }) + } +} + +func TestSanitizeGrafanaExpr(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "replaces $__rate_interval with 5m", + input: `rate(m{source="deckhouse"}[$__rate_interval])`, + expected: `rate(m{source="deckhouse"}[5m])`, + }, + { + name: "replaces $namespace with __placeholder__", + input: `m{namespace="$namespace"}`, + expected: `m{namespace="__placeholder__"}`, + }, + { + name: "replaces ${var:regex} with __placeholder__", + input: `m{var="${var:regex}"}`, + expected: `m{var="__placeholder__"}`, + }, + { + name: "does not replace $source", + input: `m{source="$source"}`, + expected: `m{source="$source"}`, + }, + { + name: "replaces $__range with 5m", + input: `increase(m[$__range])`, + expected: `increase(m[5m])`, + }, + { + name: "does not replace ${source}", + input: `m{source="${source}"}`, + expected: `m{source="${source}"}`, + }, + { + name: "does not replace ${source:json}", + input: `m{source="${source:json}"}`, + expected: `m{source="${source:json}"}`, + }, + { + name: "replaces ${other_var} with __placeholder__", + input: `m{ns="${namespace}"}`, + expected: `m{ns="__placeholder__"}`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := sanitizeGrafanaExpr(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/pkg/linters/templates/templates.go b/pkg/linters/templates/templates.go index 3ed5261b..7a56b1a3 100644 --- a/pkg/linters/templates/templates.go +++ b/pkg/linters/templates/templates.go @@ -63,10 +63,12 @@ func (l *Templates) Run(m *module.Module) { // monitoring prometheusRule := rules.NewPrometheusRule(l.cfg) grafanaRule := rules.NewGrafanaRule(l.cfg) + sourceLabelRule := rules.NewSourceLabelRule(l.cfg) if err := dirExists(m.GetPath(), "monitoring"); err == nil { grafanaRule.ValidateGrafanaDashboards(m, errorList.WithMaxLevel(l.cfg.Rules.GrafanaRule.GetLevel())) prometheusRule.ValidatePrometheusRules(m, errorList.WithMaxLevel(l.cfg.Rules.PrometheusRule.GetLevel())) + sourceLabelRule.SourceLabelCheckDashboards(m, errorList.WithMaxLevel(l.cfg.Rules.SourceLabelRule.GetLevel())) } else if !os.IsNotExist(err) { errorList.Errorf("reading the 'monitoring' folder failed: %s", err) } @@ -80,6 +82,7 @@ func (l *Templates) Run(m *module.Module) { servicePortRule.ObjectServiceTargetPort(object, errorList.WithMaxLevel(l.cfg.Rules.ServicePortRule.GetLevel())) prometheusRule.PromtoolRuleCheck(m, object, errorList.WithMaxLevel(l.cfg.Rules.PrometheusRule.GetLevel())) ingressRule.CheckSnippetsRule(object, errorList.WithMaxLevel(l.cfg.Rules.IngressRule.GetLevel())) + sourceLabelRule.SourceLabelCheck(m, object, errorList.WithMaxLevel(l.cfg.Rules.SourceLabelRule.GetLevel())) } // Cluster domain rule diff --git a/test/e2e/testdata/templates/source-label-allowed/expected.yaml b/test/e2e/testdata/templates/source-label-allowed/expected.yaml new file mode 100644 index 00000000..17268ce0 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/expected.yaml @@ -0,0 +1,5 @@ +description: > + A PrometheusRule with a metric listed in allowed-metrics should not produce + source-label errors. +module: module +expect: [] diff --git a/test/e2e/testdata/templates/source-label-allowed/module/.dmtlint.yaml b/test/e2e/testdata/templates/source-label-allowed/module/.dmtlint.yaml new file mode 100644 index 00000000..f83b2aaf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/module/.dmtlint.yaml @@ -0,0 +1,5 @@ +linters-settings: + templates: + source-label: + allowed-metrics: + - "rabbitmq_*" diff --git a/test/e2e/testdata/templates/source-label-allowed/module/module.yaml b/test/e2e/testdata/templates/source-label-allowed/module/module.yaml new file mode 100644 index 00000000..f6eed6c2 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/module/module.yaml @@ -0,0 +1,2 @@ +name: e2e-source-allowed +namespace: e2e-source-allowed diff --git a/test/e2e/testdata/templates/source-label-allowed/module/openapi/config-values.yaml b/test/e2e/testdata/templates/source-label-allowed/module/openapi/config-values.yaml new file mode 100644 index 00000000..03b0d8bf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/module/openapi/config-values.yaml @@ -0,0 +1,2 @@ +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-allowed/module/openapi/values.yaml b/test/e2e/testdata/templates/source-label-allowed/module/openapi/values.yaml new file mode 100644 index 00000000..47180da5 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/module/openapi/values.yaml @@ -0,0 +1,4 @@ +x-extend: + schema: config-values.yaml +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-allowed/module/templates/prometheus-rule.yaml b/test/e2e/testdata/templates/source-label-allowed/module/templates/prometheus-rule.yaml new file mode 100644 index 00000000..4852064a --- /dev/null +++ b/test/e2e/testdata/templates/source-label-allowed/module/templates/prometheus-rule.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: e2e-source-allowed-rules + namespace: e2e-source-allowed +spec: + groups: + - name: e2e.source-allowed + rules: + - alert: TestAllowedMetric + expr: rabbitmq_node_mem_used{job="rabbitmq"} > 1000000 + for: 5m + labels: + severity_level: "5" diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/expected.yaml b/test/e2e/testdata/templates/source-label-dashboard-allowed/expected.yaml new file mode 100644 index 00000000..b0ca21a3 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/expected.yaml @@ -0,0 +1,5 @@ +description: > + A Grafana dashboard with a metric listed in allowed-metrics should not produce + source-label errors. +module: module +expect: [] diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/module/.dmtlint.yaml b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/.dmtlint.yaml new file mode 100644 index 00000000..f83b2aaf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/.dmtlint.yaml @@ -0,0 +1,5 @@ +linters-settings: + templates: + source-label: + allowed-metrics: + - "rabbitmq_*" diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/module/module.yaml b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/module.yaml new file mode 100644 index 00000000..adec94e2 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/module.yaml @@ -0,0 +1,2 @@ +name: e2e-source-dash-ok +namespace: e2e-source-dash-ok diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/module/monitoring/grafana-dashboards/test-dashboard.json b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/monitoring/grafana-dashboards/test-dashboard.json new file mode 100644 index 00000000..2343fb11 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/monitoring/grafana-dashboards/test-dashboard.json @@ -0,0 +1,24 @@ +{ + "title": "E2E Allowed Dashboard", + "panels": [ + { + "type": "graph", + "title": "RabbitMQ Memory", + "targets": [ + { + "expr": "rabbitmq_node_mem_used{job=\"rabbitmq\"}" + } + ], + "datasource": {"type": "prometheus", "uid": "${ds_prometheus}"} + } + ], + "templating": { + "list": [ + { + "name": "ds_prometheus", + "type": "datasource", + "query": "prometheus" + } + ] + } +} diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/config-values.yaml b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/config-values.yaml new file mode 100644 index 00000000..03b0d8bf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/config-values.yaml @@ -0,0 +1,2 @@ +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/values.yaml b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/values.yaml new file mode 100644 index 00000000..47180da5 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-allowed/module/openapi/values.yaml @@ -0,0 +1,4 @@ +x-extend: + schema: config-values.yaml +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-dashboard-missing/expected.yaml b/test/e2e/testdata/templates/source-label-dashboard-missing/expected.yaml new file mode 100644 index 00000000..4b173cb5 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-missing/expected.yaml @@ -0,0 +1,9 @@ +description: > + A Grafana dashboard with a metric missing source="deckhouse" selector must be + flagged by the source-label rule. +module: module +expect: + - linter: templates + rule: source-label + level: error + textContains: "metric 'container_cpu_usage_seconds_total' in panel 'CPU Usage' must have source=\"deckhouse\" selector" diff --git a/test/e2e/testdata/templates/source-label-dashboard-missing/module/module.yaml b/test/e2e/testdata/templates/source-label-dashboard-missing/module/module.yaml new file mode 100644 index 00000000..7aaabb0f --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-missing/module/module.yaml @@ -0,0 +1,2 @@ +name: e2e-source-dashboard +namespace: e2e-source-dashboard diff --git a/test/e2e/testdata/templates/source-label-dashboard-missing/module/monitoring/grafana-dashboards/test-dashboard.json b/test/e2e/testdata/templates/source-label-dashboard-missing/module/monitoring/grafana-dashboards/test-dashboard.json new file mode 100644 index 00000000..170de4a9 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-missing/module/monitoring/grafana-dashboards/test-dashboard.json @@ -0,0 +1,24 @@ +{ + "title": "E2E Source Label Dashboard", + "panels": [ + { + "type": "graph", + "title": "CPU Usage", + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{namespace=\"default\"}[5m])" + } + ], + "datasource": {"type": "prometheus", "uid": "${ds_prometheus}"} + } + ], + "templating": { + "list": [ + { + "name": "ds_prometheus", + "type": "datasource", + "query": "prometheus" + } + ] + } +} diff --git a/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/config-values.yaml b/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/config-values.yaml new file mode 100644 index 00000000..03b0d8bf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/config-values.yaml @@ -0,0 +1,2 @@ +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/values.yaml b/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/values.yaml new file mode 100644 index 00000000..47180da5 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-dashboard-missing/module/openapi/values.yaml @@ -0,0 +1,4 @@ +x-extend: + schema: config-values.yaml +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-missing/expected.yaml b/test/e2e/testdata/templates/source-label-missing/expected.yaml new file mode 100644 index 00000000..9078b95c --- /dev/null +++ b/test/e2e/testdata/templates/source-label-missing/expected.yaml @@ -0,0 +1,9 @@ +description: > + A PrometheusRule with a metric missing source="deckhouse" selector must be + flagged by the source-label rule. +module: module +expect: + - linter: templates + rule: source-label + level: error + textContains: "metric 'up' in rule 'TestAlertMissingSource' (group 'e2e.source-label') must have source=\"deckhouse\" selector" diff --git a/test/e2e/testdata/templates/source-label-missing/module/module.yaml b/test/e2e/testdata/templates/source-label-missing/module/module.yaml new file mode 100644 index 00000000..3045202d --- /dev/null +++ b/test/e2e/testdata/templates/source-label-missing/module/module.yaml @@ -0,0 +1,2 @@ +name: e2e-source-label +namespace: e2e-source-label diff --git a/test/e2e/testdata/templates/source-label-missing/module/openapi/config-values.yaml b/test/e2e/testdata/templates/source-label-missing/module/openapi/config-values.yaml new file mode 100644 index 00000000..03b0d8bf --- /dev/null +++ b/test/e2e/testdata/templates/source-label-missing/module/openapi/config-values.yaml @@ -0,0 +1,2 @@ +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-missing/module/openapi/values.yaml b/test/e2e/testdata/templates/source-label-missing/module/openapi/values.yaml new file mode 100644 index 00000000..47180da5 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-missing/module/openapi/values.yaml @@ -0,0 +1,4 @@ +x-extend: + schema: config-values.yaml +type: object +properties: {} diff --git a/test/e2e/testdata/templates/source-label-missing/module/templates/prometheus-rule.yaml b/test/e2e/testdata/templates/source-label-missing/module/templates/prometheus-rule.yaml new file mode 100644 index 00000000..671581e7 --- /dev/null +++ b/test/e2e/testdata/templates/source-label-missing/module/templates/prometheus-rule.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: e2e-source-label-rules + namespace: e2e-source-label +spec: + groups: + - name: e2e.source-label + rules: + - alert: TestAlertMissingSource + expr: up{job="node-exporter"} == 0 + for: 5m + labels: + severity_level: "5"