From a6242f6a4696d1c06304db988a123d22b457267c Mon Sep 17 00:00:00 2001 From: Doug Holt Date: Wed, 27 May 2026 22:04:56 -0600 Subject: [PATCH] Refresh DCGM exporter --- .../files/k8s-cluster/dcgm-custom-metrics.csv | 89 +++++++++++-------- docs/deepops/update-deepops.md | 2 +- docs/slurm-cluster/slurm-monitor.md | 4 +- roles/nvidia-dcgm-exporter/defaults/main.yml | 2 +- .../files/dcgm-custom-metrics.csv | 89 +++++++++++-------- workloads/services/k8s/dcgm-exporter.yml | 6 +- 6 files changed, 116 insertions(+), 76 deletions(-) diff --git a/config.example/files/k8s-cluster/dcgm-custom-metrics.csv b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv index 66875527f..2eed7a651 100644 --- a/config.example/files/k8s-cluster/dcgm-custom-metrics.csv +++ b/config.example/files/k8s-cluster/dcgm-custom-metrics.csv @@ -1,55 +1,63 @@ -# Format,, -# If line starts with a '#' it is considered a comment,, +# Format +# If line starts with a '#' it is considered a comment # DCGM FIELD, Prometheus metric type, help message -# Clocks,, +# Clocks DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). -# Temperature,, +# Temperature DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). -# Power,, +# Power DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). -# PCIE,, -DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. -DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. +# PCIE +# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. -# Utilization (the sample period varies depending on the product),, +# Utilization (the sample period varies depending on the product) DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). -# Errors and violations,, -DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. -# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). -# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). -# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). -# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). -# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). -# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in ns). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in ns). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in ns). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in ns). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in ns). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in ns). -# Memory usage,, +# DCGM Exporter fields + +# DCGM_EXP_CLOCK_EVENTS_COUNT, counter, reported clock events +# DCGM_EXP_XID_ERRORS_COUNT, counter, reported XIDs during last window +# DCGM_EXP_GPU_HEALTH_STATUS, counter, DCGM reported health status +# DCGM_EXP_P2P_STATUS, counter, P2P NvLink status + +# Memory usage DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). +DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). -# ECC,, +# ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. -# Retired pages,, +# Retired pages # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. -# NVLink,, +# NVLink # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. @@ -57,23 +65,34 @@ DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. -# VGPU License status,, -# DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status -# Remapped rows,, +# Remapped rows DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed -# DCP metrics,, -DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). -DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). -DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). -DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). -DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). -DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). -DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). -DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). -DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. -DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device +# Datacenter Profiling (DCP) metrics +# NOTE: supported on Nvidia datacenter Volta GPUs and newer +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. diff --git a/docs/deepops/update-deepops.md b/docs/deepops/update-deepops.md index a1f7af047..63d0a75d5 100644 --- a/docs/deepops/update-deepops.md +++ b/docs/deepops/update-deepops.md @@ -524,7 +524,7 @@ systemctl restart docker.node-exporter For the NVIDIA DCGM Exporter, we do pin a particular version of the container. To update to a newer version, edit your DeepOps configuration to specify a new container tag: ``` -nvidia_dcgm_container_version: "2.1.8-2.4.0-rc.2-ubuntu20.04" +nvidia_dcgm_container_version: "4.5.3-4.8.2-distroless" ``` Then re-run the playbook: diff --git a/docs/slurm-cluster/slurm-monitor.md b/docs/slurm-cluster/slurm-monitor.md index 88847a74f..9c9a717af 100644 --- a/docs/slurm-cluster/slurm-monitor.md +++ b/docs/slurm-cluster/slurm-monitor.md @@ -22,7 +22,7 @@ $ sudo docker ps CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 139fe402640f quay.io/prometheus/node-exporter "/bin/node_exporter …" 56 minutes ago Up 56 minutes docker.node-exporter.service -0da9e3f1a7c8 nvidia/dcgm-exporter "/usr/bin/dcgm-expor…" 56 minutes ago Up 56 minutes 0.0.0.0:9400->9400/tcp docker.dcgm-exporter.service +0da9e3f1a7c8 nvcr.io/nvidia/k8s/dcgm-exporter "/usr/bin/dcgm-expor…" 56 minutes ago Up 56 minutes 0.0.0.0:9400->9400/tcp docker.dcgm-exporter.service ``` ## Grafana @@ -102,4 +102,4 @@ Describes how to configure Alertmanager. Visit http://\:9093 for checking and setting about alerting. ![Alertmanager](../img/slurm_monitoring_alertmanager02.png) -For more information on constructing Alertmanager alerting see the official [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/). \ No newline at end of file +For more information on constructing Alertmanager alerting see the official [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/). diff --git a/roles/nvidia-dcgm-exporter/defaults/main.yml b/roles/nvidia-dcgm-exporter/defaults/main.yml index 63e0c8785..123658d96 100644 --- a/roles/nvidia-dcgm-exporter/defaults/main.yml +++ b/roles/nvidia-dcgm-exporter/defaults/main.yml @@ -1,4 +1,4 @@ -nvidia_dcgm_container_version: "2.1.8-2.4.0-rc.2-ubuntu20.04" +nvidia_dcgm_container_version: "4.5.3-4.8.2-distroless" nvidia_dcgm_container: "nvcr.io/nvidia/k8s/dcgm-exporter:{{ nvidia_dcgm_container_version }}" nvidia_dcgm_container_config_dir: "/opt/deepops/nvidia-dcgm-exporter" nvidia_dcgm_container_custom_metrics_file: "dcgm-custom-metrics.csv" diff --git a/roles/nvidia-dcgm-exporter/files/dcgm-custom-metrics.csv b/roles/nvidia-dcgm-exporter/files/dcgm-custom-metrics.csv index 66875527f..2eed7a651 100644 --- a/roles/nvidia-dcgm-exporter/files/dcgm-custom-metrics.csv +++ b/roles/nvidia-dcgm-exporter/files/dcgm-custom-metrics.csv @@ -1,55 +1,63 @@ -# Format,, -# If line starts with a '#' it is considered a comment,, +# Format +# If line starts with a '#' it is considered a comment # DCGM FIELD, Prometheus metric type, help message -# Clocks,, +# Clocks DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). -# Temperature,, +# Temperature DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). -# Power,, +# Power DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). -# PCIE,, -DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. -DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. +# PCIE +# DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML. +# DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML. DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. -# Utilization (the sample period varies depending on the product),, +# Utilization (the sample period varies depending on the product) DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). -# Errors and violations,, -DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. -# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). -# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). -# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). -# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). -# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). -# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in ns). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in ns). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in ns). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in ns). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in ns). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in ns). -# Memory usage,, +# DCGM Exporter fields + +# DCGM_EXP_CLOCK_EVENTS_COUNT, counter, reported clock events +# DCGM_EXP_XID_ERRORS_COUNT, counter, reported XIDs during last window +# DCGM_EXP_GPU_HEALTH_STATUS, counter, DCGM reported health status +# DCGM_EXP_P2P_STATUS, counter, P2P NvLink status + +# Memory usage DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). +DCGM_FI_DEV_FB_RESERVED, gauge, Framebuffer memory reserved (in MiB). -# ECC,, +# ECC # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. -# Retired pages,, +# Retired pages # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. -# NVLink,, +# NVLink # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. @@ -57,23 +65,34 @@ DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. -# VGPU License status,, -# DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status -# Remapped rows,, +# Remapped rows DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed -# DCP metrics,, -DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). -DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). -DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). -DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). -DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). -DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). -DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). -DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). -DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. -DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device +# Datacenter Profiling (DCP) metrics +# NOTE: supported on Nvidia datacenter Volta GPUs and newer +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active. +# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned. +# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM. +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active. +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data. +# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active. +# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active. +# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active. +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. diff --git a/workloads/services/k8s/dcgm-exporter.yml b/workloads/services/k8s/dcgm-exporter.yml index d5b9d268f..165adb448 100644 --- a/workloads/services/k8s/dcgm-exporter.yml +++ b/workloads/services/k8s/dcgm-exporter.yml @@ -23,9 +23,9 @@ spec: nodeSelector: hardware-type: NVIDIAGPU containers: - - image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.3-ubuntu20.04" + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.5.3-4.8.2-distroless" name: nvidia-dcgm-exporter - command: ["/usr/bin/dcgm-exporter", "-f", "/etc/dcgm-config/dcgm-custom-metrics.csv"] + args: ["-f", "/etc/dcgm-config/dcgm-custom-metrics.csv"] env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" @@ -36,6 +36,8 @@ spec: runAsUser: 0 capabilities: add: ["SYS_ADMIN"] + drop: ["ALL"] + allowPrivilegeEscalation: false volumeMounts: - name: "pod-gpu-resources" readOnly: true