Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions internal/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,11 @@ func (m *Manager) Run() {
wg := new(sync.WaitGroup)
processingCh := make(chan struct{}, flags.LintersLimit)

recordNames := collectRecordingRuleNames(m.Modules)

for _, module := range m.Modules {
module.SetRecordingRuleNames(recordNames)

processingCh <- struct{}{}

wg.Add(1)
Expand Down Expand Up @@ -195,6 +199,69 @@ func getLintersForModule(cfg *pkg.LintersSettings, errList *errors.LintRuleError
}
}

// collectRecordingRuleNames gathers the names of the recording rules declared
// by PrometheusRule objects stored in the given modules.
//
// For every stored object whose kind is "PrometheusRule" it walks
// spec.groups[].rules[] and records each non-empty string "record" value.
// Objects, groups and rules whose shape does not match are skipped silently.
// The returned set is keyed by rule name and is never nil.
func collectRecordingRuleNames(modules []*module.Module) map[string]struct{} {
	recorded := make(map[string]struct{})

	for _, mod := range modules {
		for _, object := range mod.GetStorage() {
			if object.Unstructured.GetKind() != "PrometheusRule" {
				continue
			}

			// A type assertion on a missing key operates on a nil interface and
			// reports false, so absent and malformed fields share one check.
			spec, specOK := object.Unstructured.Object["spec"].(map[string]any)
			if !specOK {
				continue
			}

			groupList, groupsOK := spec["groups"].([]any)
			if !groupsOK {
				continue
			}

			for _, rawGroup := range groupList {
				group, groupOK := rawGroup.(map[string]any)
				if !groupOK {
					continue
				}

				ruleList, rulesOK := group["rules"].([]any)
				if !rulesOK {
					continue
				}

				for _, rawRule := range ruleList {
					rule, ruleOK := rawRule.(map[string]any)
					if !ruleOK {
						continue
					}

					// Alerting rules carry no "record" key and are ignored here.
					if name, isString := rule["record"].(string); isString && name != "" {
						recorded[name] = struct{}{}
					}
				}
			}
		}
	}

	return recorded
}

func (m *Manager) PrintResult() {
errs := m.errors.GetErrors()

Expand Down
10 changes: 10 additions & 0 deletions internal/module/module.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,14 @@ func (m *Module) GetModuleConfig() *pkg.LintersSettings {
return m.linterConfig
}

// SetRecordingRuleNames stores the given set of recording rule names in the
// module's Templates source-label settings.
//
// It does nothing when the receiver or its linter configuration is nil. The
// map is stored as-is, not copied.
func (m *Module) SetRecordingRuleNames(names map[string]struct{}) {
	if m != nil && m.linterConfig != nil {
		m.linterConfig.Templates.SourceLabelSettings.RecordingRuleNames = names
	}
}

// remapLinterSettings converts configuration settings from the config package format
// to the pkg package format, mapping both rule-level configurations and exclusion rules
// across all linter domains (Container, Image, NoCyrillic, OpenAPI, Templates, RBAC, Hooks, Module).
Expand Down Expand Up @@ -342,6 +350,7 @@ func mapTemplatesRules(linterSettings *pkg.LintersSettings, configSettings *conf
rules.ClusterDomainRule.SetLevel(globalRules.ClusterDomainRule.Impact, fallbackImpact)
rules.RegistryRule.SetLevel(globalRules.RegistryRule.Impact, fallbackImpact)
rules.EnabledModulesRule.SetLevel(globalRules.EnabledModulesRule.Impact, fallbackImpact)
rules.SourceLabelRule.SetLevel(globalRules.SourceLabelRule.Impact, fallbackImpact)
}

// mapOpenAPIRules configures OpenAPI linter rules
Expand Down Expand Up @@ -459,6 +468,7 @@ func mapTemplatesExclusionsAndSettings(linterSettings *pkg.LintersSettings, conf
// Additional settings
linterSettings.Templates.PrometheusRuleSettings.Disable = configSettings.Templates.PrometheusRules.Disable
linterSettings.Templates.GrafanaDashboardsSettings.Disable = configSettings.Templates.GrafanaDashboards.Disable
linterSettings.Templates.SourceLabelSettings.AllowedMetrics = configSettings.Templates.SourceLabel.AllowedMetrics
}

// mapRBACExclusions maps RBAC linter exclusion rules
Expand Down
4 changes: 2 additions & 2 deletions internal/values/global-openapi/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,10 @@ properties:
items:
type: string
x-examples:
- ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus"]
- ["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus-crd"]
- ["cert-manager", "prometheus", "priority-class"]
x-dmt-default:
["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus"]
["cert-manager", "vertical-pod-autoscaler", "vertical-pod-autoscaler-crd", "prometheus", "priority-class", "prometheus-crd", "operator-prometheus", "operator-prometheus-crd"]
Comment thread
ldmonster marked this conversation as resolved.
discovery:
additionalProperties: true
type: object
Expand Down
7 changes: 7 additions & 0 deletions pkg/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ type TemplatesLinterConfig struct {
ExcludeRules TemplatesExcludeRules
PrometheusRuleSettings PrometheusRuleSettings
GrafanaDashboardsSettings GrafanaDashboardsSettings
SourceLabelSettings SourceLabelSettings
}

// SourceLabelSettings holds the additional settings of the templates
// source-label rule.
type SourceLabelSettings struct {
// AllowedMetrics is copied from the `allowed-metrics` list of the module's
// `source-label` configuration section.
AllowedMetrics []string
// RecordingRuleNames is a set of recording rule names. It is not read from
// the configuration; it is assigned through Module.SetRecordingRuleNames.
RecordingRuleNames map[string]struct{}
}
type TemplatesLinterRules struct {
VPARule RuleConfig
Expand All @@ -141,6 +147,7 @@ type TemplatesLinterRules struct {
ClusterDomainRule RuleConfig
RegistryRule RuleConfig
EnabledModulesRule RuleConfig
SourceLabelRule RuleConfig
}

type PrometheusRuleSettings struct {
Expand Down
1 change: 1 addition & 0 deletions pkg/config/global/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ type TemplatesLinterRules struct {
ClusterDomainRule RuleConfig `mapstructure:"cluster-domain"`
RegistryRule RuleConfig `mapstructure:"registry"`
EnabledModulesRule RuleConfig `mapstructure:"enabled-modules"`
SourceLabelRule RuleConfig `mapstructure:"source-label"`
}

func (c LinterConfig) IsWarn() bool {
Expand Down
6 changes: 6 additions & 0 deletions pkg/config/linters_settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,16 @@ type TemplatesSettings struct {
ExcludeRules TemplatesExcludeRules `mapstructure:"exclude-rules"`
GrafanaDashboards GrafanaDashboardsExcludeList `mapstructure:"grafana-dashboards"`
PrometheusRules PrometheusRulesExcludeList `mapstructure:"prometheus-rules"`
SourceLabel SourceLabelSettings `mapstructure:"source-label"`
Rules TemplatesLinterRules `mapstructure:"rules"`

Impact string `mapstructure:"impact"`
}

// SourceLabelSettings is the `source-label` section of the templates linter
// settings.
type SourceLabelSettings struct {
// AllowedMetrics is decoded from the `allowed-metrics` key. Per the templates
// linter README, entries are metric names or glob patterns (`*`, `?`) that
// may appear without a source selector.
AllowedMetrics []string `mapstructure:"allowed-metrics"`
}

type TemplatesLinterRules struct {
VPARule RuleConfig `mapstructure:"vpa"`
PDBRule RuleConfig `mapstructure:"pdb"`
Expand All @@ -205,6 +210,7 @@ type TemplatesLinterRules struct {
ClusterDomainRule RuleConfig `mapstructure:"cluster-domain"`
RegistryRule RuleConfig `mapstructure:"registry"`
EnabledModulesRule RuleConfig `mapstructure:"enabled-modules"`
SourceLabelRule RuleConfig `mapstructure:"source-label"`
}

type TemplatesExcludeRules struct {
Expand Down
87 changes: 87 additions & 0 deletions pkg/linters/templates/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Proper template validation prevents runtime issues, ensures applications are pro
| [ingress-rules](#ingress-rules) | Validates Ingress configuration snippets | ✅ | enabled |
| [prometheus-rules](#prometheus-rules) | Validates Prometheus rules with promtool and proper templates | ✅ | enabled |
| [grafana-dashboards](#grafana-dashboards) | Validates Grafana dashboard templates | ✅ | enabled |
| [source-label](#source-label) | Requires `source="deckhouse"` selector on metrics in Prometheus rules and Grafana dashboards | ✅ | enabled |
| [cluster-domain](#cluster-domain) | Validates cluster domain configuration is dynamic | ❌ | enabled |
| [registry](#registry) | Validates registry secret configuration | ❌ | enabled |
| [werf](#werf) | Validates image names in `werf.yaml` do not contain underscores | ❌ | enabled |
Expand Down Expand Up @@ -1284,6 +1285,85 @@ linters-settings:

---

### source-label

**Purpose:** Ensures every Deckhouse-owned metric referenced in PromQL expressions (in PrometheusRule objects and Grafana dashboards) is selected with an explicit `source="deckhouse"` label matcher, isolating our metrics from foreign metrics that share the same name.

**Description:**

The rule parses the PromQL expressions of alerting/recording rules and dashboard panel/template queries, and for every vector selector over a Deckhouse metric it requires a `source="deckhouse"` matcher (or a `source=$...` Grafana/templating variable that resolves to it).

**Why it matters:**

Users can run their own exporter that exposes a metric with the **same name** as one of ours. Because both time series share the metric name, the metric is effectively duplicated, the rule expression returns ambiguous data, and it fails to evaluate. Pinning `source="deckhouse"` separates our metrics from foreign ones with colliding names, so a query only matches our own time series and evaluates reliably.

**What it checks:**

1. `PrometheusRule` objects — the `expr` of every alerting and recording rule
2. Grafana dashboards under `monitoring/grafana-dashboards` — `expr` of panel targets and `query`/`definition` of query template variables (only Prometheus datasources)
3. Each Deckhouse metric selector contains `source="deckhouse"` (or `source=$<var>`)

**Exemptions (no source selector required):**

- Metrics produced by recording rules (the `record` names are collected automatically at runtime from the `PrometheusRule` objects of all linted modules)
- Metrics matching the per-module `allowed-metrics` globs (for intentionally foreign metrics, e.g. third-party exporters)
- Prometheus synthetic metrics: `ALERTS`, `ALERTS_FOR_STATE`

**Examples:**

❌ **Incorrect** - Metric without source selector:

```yaml
# templates/monitoring.yaml -> PrometheusRule
- alert: TestAlertMissingSource
expr: up{job="node-exporter"} == 0
for: 5m
labels:
severity_level: "5"
```

**Error:**
```
Error: metric 'up' in rule 'TestAlertMissingSource' (group 'e2e.source-label') must have source="deckhouse" selector
```

✅ **Correct** - Metric with source selector:

```yaml
- alert: TestAlertWithSource
expr: up{job="node-exporter", source="deckhouse"} == 0
for: 5m
labels:
severity_level: "5"
```

✅ **Correct** - Grafana panel query with a source selector:

```json
{
"title": "Requests",
"targets": [
{ "expr": "rate(my_module_requests_total{source=\"deckhouse\"}[$__rate_interval])" }
]
}
```

**Configuration:**

The rule does not use `exclude-rules`. Instead, use `allowed-metrics` to list metric names (glob patterns with `*` and `?` are supported) that are allowed to appear without a source selector:

```yaml
# .dmt.yaml
linters-settings:
templates:
source-label:
allowed-metrics:
- "rabbitmq_*" # All metrics from a third-party exporter
- "node_exporter_build_info"
```

---

### cluster-domain

**Purpose:** Prevents hardcoding of the cluster domain (`cluster.local`) in templates. Ensures cluster domain is configurable to support custom cluster configurations and multi-cluster deployments.
Expand Down Expand Up @@ -1663,6 +1743,11 @@ linters-settings:

prometheus-rules:
disable: true

# Allow specific (foreign) metrics without a source="deckhouse" selector
source-label:
allowed-metrics:
- "rabbitmq_*"
```

### Per-Rule Impact Levels
Expand Down Expand Up @@ -1694,6 +1779,8 @@ linters-settings:
impact: error
enabled-modules:
impact: warning
source-label:
impact: error
```

### Rule-Level Exclusions
Expand Down
Loading
Loading