From e6bf1f6e83a8ff33a490220e717fff6bd9f76aa5 Mon Sep 17 00:00:00 2001 From: Doug Holt Date: Wed, 27 May 2026 09:21:35 -0600 Subject: [PATCH 1/2] feat(dgx): update DGX software stack role --- README.md | 12 +- docs/deepops/dgx-software-stack.md | 131 +++++++ docs/deepops/testing.md | 4 +- roles/nvidia-dgx/tasks/main.yml | 4 +- roles/nvidia-dgx/tasks/redhat-el8-plus.yml | 139 +++++++ roles/nvidia-dgx/tasks/redhat-legacy-el7.yml | 105 +++++ roles/nvidia-dgx/tasks/redhat.yml | 111 +----- roles/nvidia-dgx/tasks/ubuntu-22.04.yml | 124 ++++++ roles/nvidia-dgx/tasks/ubuntu-24.04.yml | 92 +++++ roles/nvidia-dgx/tasks/ubuntu-legacy.yml | 384 ++++++++++++++++++ roles/nvidia-dgx/tasks/ubuntu.yml | 391 +------------------ roles/nvidia-dgx/vars/redhat.yml | 112 ++++++ roles/nvidia-dgx/vars/ubuntu-22.04.yml | 115 ++++++ roles/nvidia-dgx/vars/ubuntu-24.04.yml | 102 +++++ 14 files changed, 1338 insertions(+), 488 deletions(-) create mode 100644 docs/deepops/dgx-software-stack.md create mode 100644 roles/nvidia-dgx/tasks/redhat-el8-plus.yml create mode 100644 roles/nvidia-dgx/tasks/redhat-legacy-el7.yml create mode 100644 roles/nvidia-dgx/tasks/ubuntu-22.04.yml create mode 100644 roles/nvidia-dgx/tasks/ubuntu-24.04.yml create mode 100644 roles/nvidia-dgx/tasks/ubuntu-legacy.yml create mode 100644 roles/nvidia-dgx/vars/ubuntu-22.04.yml create mode 100644 roles/nvidia-dgx/vars/ubuntu-24.04.yml diff --git a/README.md b/README.md index 6f4dbdb42..68df95a20 100644 --- a/README.md +++ b/README.md @@ -38,19 +38,21 @@ It is recommended to use the latest release branch for stable code (linked above The provisioning system is used to orchestrate the running of all playbooks and one will be needed when instantiating Kubernetes or Slurm clusters. 
Supported operating systems which are tested and supported include: -- NVIDIA DGX OS 4, 5 -- Ubuntu 18.04 LTS, 20.04, 22.04 LTS +- NVIDIA DGX OS 4, 5, 6, 7 +- Ubuntu 18.04 LTS, 20.04, 22.04 LTS, 24.04 LTS - CentOS 7, 8 ### Cluster System The cluster nodes will follow the requirements described by Slurm or Kubernetes. You may also use a cluster node as a provisioning system but it is not required. -- NVIDIA DGX OS 4, 5 -- Ubuntu 18.04 LTS, 20.04, 22.04 LTS +- NVIDIA DGX OS 4, 5, 6, 7 +- Ubuntu 18.04 LTS, 20.04, 22.04 LTS, 24.04 LTS - CentOS 7, 8 +- Red Hat Enterprise Linux / Rocky Linux 8 and 9 for the DGX software stack through the `nvidia-dgx` role You may also install a supported operating system on all servers via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)) or utilize the provided [OS install container](docs/pxe/minimal-pxe-container.md). +For DGX platform software installation on top of vanilla Ubuntu or Red Hat family operating systems, see the [DGX software stack role guide](docs/deepops/dgx-software-stack.md). ### Kubernetes @@ -77,7 +79,7 @@ For more information on Slurm in general, refer to the [official Slurm docs](htt [NVIDIA Bright Cluster Manager](https://www.brightcomputing.com/brightclustermanager) is recommended as an enterprise solution which enables managing multiple workload managers within a single cluster, including Kubernetes, Slurm, Univa Grid Engine, and PBS Pro. **DeepOps does not test or support a configuration where nodes have a heterogenous OS running.** -Additional modifications are needed if you plan to use unsupported operating systems such as RHEL. +The `nvidia-dgx` role can install NVIDIA DGX platform software on supported DGX systems running Red Hat Enterprise Linux / Rocky Linux 8 or 9; broader Kubernetes or Slurm cluster support on RHEL still requires site-specific validation. 
### Virtual diff --git a/docs/deepops/dgx-software-stack.md b/docs/deepops/dgx-software-stack.md new file mode 100644 index 000000000..9164c1a0d --- /dev/null +++ b/docs/deepops/dgx-software-stack.md @@ -0,0 +1,131 @@ +# DGX Software Stack Role + +The `nvidia-dgx` role installs NVIDIA DGX platform software on supported DGX +systems after a base operating system has been installed. + +This role is intended for DGX hardware only. It checks the system product name +and stops on non-DGX systems. + +## Supported Paths + +The role selects one of the following implementation paths based on the base OS: + +| Base OS | DGX software path | Notes | +| ------- | ----------------- | ----- | +| Ubuntu 18.04 | DGX OS 4 legacy packages | Existing legacy role path. | +| Ubuntu 20.04 | DGX OS 5 legacy packages | Existing legacy role path. | +| Ubuntu 22.04 | DGX OS 6 software stack | Uses the official DGX OS 6 repository archive and system-specific packages. | +| Ubuntu 24.04 | DGX OS 7 software stack | Uses the official DGX OS 7 repository archive and unified `nvidia-system-*` packages. | +| Red Hat Enterprise Linux 7 | Legacy DGX EL7 packages | Existing legacy role path. | +| Red Hat Enterprise Linux 8 / Rocky Linux 8 | DGX Software for RHEL 8 | Uses the official NVIDIA repository setup RPM and DGX configuration groups. | +| Red Hat Enterprise Linux 9 / Rocky Linux 9 | DGX Software for RHEL 9 | Uses the official NVIDIA repository setup RPM and DGX configuration groups. | + +The EL8 work addresses GitHub issue +[#1120](https://github.com/NVIDIA/deepops/issues/1120). 
+ +## Official References + +- [Installing DGX Software on Ubuntu](https://docs.nvidia.com/dgx/dgx-os-6-user-guide/installing_on_ubuntu.html) +- [Customizing Ubuntu Installation with DGX Software](https://docs.nvidia.com/dgx/dgx-os-7-user-guide/installing_on_ubuntu.html) +- [DGX Software for Red Hat Enterprise Linux 8 Installation Guide](https://docs.nvidia.com/dgx/dgx-rhel8-install-guide/index.html) +- [DGX Software for Red Hat Enterprise Linux 8 Release Notes](https://docs.nvidia.com/dgx/dgx-rhel8-sw-release-notes/index.html) +- [DGX Software for Red Hat Enterprise Linux 9 User Guide](https://docs.nvidia.com/dgx/dgx-el9-user-guide/index.html) + +## Ubuntu 22.04 / DGX OS 6 + +The role follows the DGX OS 6 guide: + +1. Install the DGX repository files from + `https://repo.download.nvidia.com/baseos/ubuntu/jammy/dgx-repo-files.tgz`. +2. Install the system-specific DGX configuration and tools packages. +3. Install `linux-tools-nvidia` and `nvidia-peermem-loader`. +4. Optionally install the NVIDIA driver, Docker/NVIDIA Container Toolkit, NVSM, + serial-over-LAN, logrotate, and additional DGX OS administration/development + packages. + +The default driver branch is `550`, matching the DGX OS 6 examples. Override it +when needed: + +```yaml +dgx_os6_driver_branch: "580" +``` + +Disruptive package upgrades are opt-in: + +```yaml +dgx_os6_upgrade_packages: true +``` + +## Red Hat Enterprise Linux 8 and 9 + +The role follows the official Red Hat DGX software guides: + +1. Optionally enable the required Red Hat subscription repositories on RHEL. + This is skipped automatically on Rocky Linux. +2. Install the NVIDIA DGX repository setup RPM for EL8 or EL9. +3. Install the DGX configuration group for the detected DGX platform. +4. Optionally install the NVIDIA driver module and support packages. +5. Optionally install Docker CE and the NVIDIA Container Runtime group. 
+ +The default driver stream follows the official examples: `525` on most EL8 +systems, `535-dkms` on EL8 DGX H100, and `580` on EL9. EL9 NVSwitch systems +install the open-kernel-module stream by default. Override the branch when a +validated DGX release note calls for another stream: + +```yaml +dgx_redhat_driver_branch: "580" +``` + +RHEL subscription repository management is enabled by default only when +`ansible_distribution == 'RedHat'`. Disable it if subscriptions are managed +outside DeepOps: + +```yaml +dgx_redhat_manage_subscription_repos: false +``` + +Disruptive `dnf update --nobest` behavior is opt-in: + +```yaml +dgx_redhat_upgrade_packages: true +``` + +## Ubuntu 24.04 / DGX OS 7 + +The role follows the DGX OS 7 guide: + +1. Install the architecture-specific DGX OS 7 repository archive from + `https://repo.download.nvidia.com/baseos/ubuntu/noble/`. +2. Install the unified DGX OS 7 metapackages: `nvidia-system-core`, + `nvidia-system-utils`, and `nvidia-system-extra`. +3. Install `nvidia-system-station` for DGX Station and DGX Spark systems. +4. Install kernel tools and `nvidia-peermem-loader`. +5. Optionally install the Release 580 open GPU kernel module driver packages, + including Fabric Manager, NVLSM/NVSDM, or IMEX packages for the DGX platform + that requires them. + +Disruptive package upgrades are opt-in: + +```yaml +dgx_os7_upgrade_packages: true +``` + +## Validation + +Full validation requires real DGX hardware and access to NVIDIA/OS package +repositories. At minimum, run syntax validation before opening a PR: + +```bash +ansible-playbook --syntax-check playbooks/nvidia-dgx/nvidia-dgx.yml +``` + +On hardware, validate the role with the target OS and DGX model, reboot if the +driver was installed, then verify: + +```bash +nvidia-smi +sudo docker run --gpus=all --rm nvcr.io/nvidia/cuda:12.3.2-base-ubuntu22.04 nvidia-smi +``` + +Use the RHEL UBI CUDA image from the official guide when validating the RHEL +path. 
diff --git a/docs/deepops/testing.md b/docs/deepops/testing.md index e0d0b6765..7f2b1731a 100644 --- a/docs/deepops/testing.md +++ b/docs/deepops/testing.md @@ -53,8 +53,8 @@ A short description of the nightly testing is outlined below. The full suit of t | Ubuntu 20.04 | | x | x | | | CentOS 7 | | x | x | | | CentOS | | | x | | -| DGX OS | | | | No automated testing support | -| RHEL | | | | No testing support | +| DGX OS | | | | Syntax-checked only; full validation requires DGX hardware | +| RHEL | | | | DGX software-stack role syntax-checked only; full validation requires DGX hardware and subscriptions | | 1 mgmt node | x | x | | | | 3 mgmt nodes | | | x | | | 1 gpu node | x | x | | | diff --git a/roles/nvidia-dgx/tasks/main.yml b/roles/nvidia-dgx/tasks/main.yml index 64198524e..e95f03fa2 100644 --- a/roles/nvidia-dgx/tasks/main.yml +++ b/roles/nvidia-dgx/tasks/main.yml @@ -4,12 +4,12 @@ msg: "Role supports DGX systems only" when: ansible_product_name is not search("DGX") -- name: Ubuntu tasks for DGX OS 4/5 +- name: Ubuntu tasks for DGX Software Stack include_tasks: ubuntu.yml when: - ansible_distribution == 'Ubuntu' -- name: redhat family tasks +- name: Red Hat family tasks for DGX Software Stack include_tasks: redhat.yml when: ansible_os_family == 'RedHat' diff --git a/roles/nvidia-dgx/tasks/redhat-el8-plus.yml b/roles/nvidia-dgx/tasks/redhat-el8-plus.yml new file mode 100644 index 000000000..ce4b1997c --- /dev/null +++ b/roles/nvidia-dgx/tasks/redhat-el8-plus.yml @@ -0,0 +1,139 @@ +--- +- name: Determine Red Hat DGX platform package set + set_fact: + dgx_redhat_platform: "{{ item }}" + loop: "{{ dgx_redhat_platforms }}" + when: + - ansible_product_name is search(item.match) + - ansible_distribution_major_version in item.supported_major_versions + +- name: Fail if Red Hat DGX platform package set is unknown + fail: + msg: "Unsupported DGX model for EL{{ ansible_distribution_major_version }} DGX role path: {{ ansible_product_name }}" + when: 
dgx_redhat_platform is not defined + +- name: Enable Red Hat subscription repositories for DGX Software Stack + command: "subscription-manager repos --enable={{ item }}" + loop: "{{ dgx_redhat_subscription_repos[ansible_distribution_major_version] }}" + changed_when: false + when: + - dgx_redhat_manage_subscription_repos + - ansible_distribution == 'RedHat' + +- name: Enable Rocky Linux CRB repository for DGX Software Stack + command: dnf config-manager --set-enabled crb + changed_when: false + when: + - ansible_distribution == 'Rocky' + - ansible_distribution_major_version == '9' + +- name: Install NVIDIA DGX repository setup package + dnf: + name: "{{ dgx_redhat_repo_setup_rpms[ansible_distribution_major_version] }}" + state: present + disable_gpg_check: yes + +- name: Upgrade Red Hat DGX Software Stack packages + dnf: + name: "*" + state: latest + nobest: yes + when: dgx_redhat_upgrade_packages + tags: skip_ansible_lint + +- name: Install EL9 kernel development packages for DGX driver builds + dnf: + name: + - "kernel-devel-{{ ansible_kernel }}" + - "kernel-headers-{{ ansible_kernel }}" + state: present + when: ansible_distribution_major_version == '9' + +- name: Install Red Hat DGX configuration group + dnf: + name: "@{{ dgx_redhat_platform.configuration_group }}" + state: present + +- name: Configure Red Hat DGX driver facts + set_fact: + dgx_redhat_driver_profile: "{{ 'fm' if dgx_redhat_platform.nvswitch else 'default' }}" + dgx_redhat_driver_stream: "{{ dgx_redhat_driver_branch }}{{ '-open' if dgx_redhat_use_open_kernel_modules else '' }}" + +- name: Configure Red Hat DGX driver profile list + set_fact: + dgx_redhat_driver_profiles: >- + {{ + [dgx_redhat_driver_profile, 'src'] + if ansible_distribution_major_version == '8' + and not dgx_redhat_use_open_kernel_modules + and 'dkms' not in dgx_redhat_driver_stream + else [dgx_redhat_driver_profile] + }} + +- name: Configure Red Hat DGX driver module specification + set_fact: + dgx_redhat_driver_module_spec: >- 
+ {{ ['nvidia-driver:' ~ dgx_redhat_driver_stream ~ '/'] | product(dgx_redhat_driver_profiles) | map('join') | join(' ') }} + +- name: Install Red Hat DGX NVIDIA driver module + command: "dnf module install --nobest -y {{ dgx_redhat_driver_module_spec }}" + register: dgx_redhat_driver_module_install + changed_when: "'Nothing to do' not in dgx_redhat_driver_module_install.stdout" + notify: reboot after driver install + when: dgx_redhat_install_driver + +- name: Mark Red Hat DGX driver install state for reboot handler + set_fact: + install_driver: + changed: "{{ dgx_redhat_driver_module_install.changed | default(false) }}" + when: dgx_redhat_install_driver + +- name: Install Red Hat DGX driver support packages + dnf: + name: "{{ dgx_redhat_driver_support_packages + (dgx_redhat_fabricmanager_packages if dgx_redhat_platform.nvswitch else dgx_redhat_non_nvswitch_packages) + (dgx_redhat_nvlink5_packages if dgx_redhat_platform.nvlink5 else []) }}" + state: present + when: dgx_redhat_install_driver + +- name: Install Red Hat DGX Station extra driver packages + dnf: + name: "{{ dgx_redhat_station_packages }}" + state: present + when: + - dgx_redhat_install_driver + - dgx_redhat_platform.station + +- name: Install Red Hat DGX Docker CE + dnf: + name: docker-ce + state: present + allowerasing: yes + when: + - dgx_redhat_install_container_runtime + - dgx_redhat_install_docker_ce + +- name: Install Red Hat DGX NVIDIA Container Runtime group + dnf: + name: "@NVIDIA Container Runtime" + state: present + allowerasing: yes + notify: + - restart docker + when: dgx_redhat_install_container_runtime + +- name: Install Red Hat DGX optional cachefilesd configuration + dnf: + name: nvidia-conf-cachefilesd + state: present + when: dgx_redhat_install_cachefilesd + +- name: Populate Red Hat DGX service facts + service_facts: + +- name: Enable available Red Hat DGX services + systemd: + name: "{{ item }}" + state: started +
enabled: yes + daemon_reload: yes + loop: "{{ dgx_redhat_services }}" + when: item in ansible_facts.services diff --git a/roles/nvidia-dgx/tasks/redhat-legacy-el7.yml b/roles/nvidia-dgx/tasks/redhat-legacy-el7.yml new file mode 100644 index 000000000..e1ab122f8 --- /dev/null +++ b/roles/nvidia-dgx/tasks/redhat-legacy-el7.yml @@ -0,0 +1,105 @@ +--- +- name: include os specific variables + include_vars: redhat.yml + +- name: trust GPG key for EPEL + rpm_key: + key: "{{ epel_key_url }}" + state: present + +- name: Add epel repo + yum: + name: + - "{{ epel_package }}" + state: latest + +- name: Add DGX repo + yum_repository: + name: nvidia-dgx-7 + description: NVIDIA DGX EL7 + gpgkey: "{{ nvidia_dgx_rhel_gpgkey }}" + baseurl: "{{ nvidia_dgx_rhel_baseurl }}" + +- name: Install packages + yum: + name: "@DGX-1 Configurations" + state: present + update_cache: yes + when: ansible_product_name is search("DGX-1") + +- name: Install packages + yum: + name: "@DGX-2 Configurations" + state: present + update_cache: yes + when: ansible_product_name is search("DGX-2") + +- name: Install extra packages + yum: + name: "{{ dgx_extra_packages }}" + state: present + update_cache: yes + +- name: check kernel versions + yum: + list: kernel + register: yum_list + +- name: register installed kernel version + debug: + msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" + register: kernel_version + +- name: check kernel-headers versions + yum: + list: kernel-headers + register: yum_list + +- name: register installed kernel-headers version + debug: + msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" + register: kernel_headers_version + +- name: update kernel if headers don't match + yum: + name: + - kernel + - kernel-tools + - kernel-tools-libs + - kernel-devel + - kernel-debug-devel + - kernel-headers + state: latest + register: kernel_update + when: kernel_version.msg[0].release != kernel_headers_version.msg[0].release + 
tags: skip_ansible_lint + +- name: Install driver packages + package: + name: "{{ item }}" + with_items: + - cuda-drivers + - cuda-drivers-diagnostic + - dgx-persistence-mode + register: install_driver + notify: reboot after driver install + +- name: Install Docker and the NVIDIA container runtime + yum: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - docker + - "@NVIDIA container runtime" + notify: + - restart docker + +- name: Install DGX System Management Tools + yum: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - python36 + - "@DGX System Management" diff --git a/roles/nvidia-dgx/tasks/redhat.yml b/roles/nvidia-dgx/tasks/redhat.yml index 61ead034a..e4c2794a4 100644 --- a/roles/nvidia-dgx/tasks/redhat.yml +++ b/roles/nvidia-dgx/tasks/redhat.yml @@ -1,105 +1,16 @@ --- -- name: include os specific variables +- name: Include Red Hat DGX variables include_vars: redhat.yml -- name: trust GPG key for EPEL - rpm_key: - key: "{{ epel_key_url }}" - state: present +- name: Check for supported Red Hat family DGX Software Stack version + fail: + msg: "Role supports Red Hat family major versions 7, 8, and 9 only" + when: ansible_distribution_major_version not in ['7', '8', '9'] -- name: Add epel repo - yum: - name: - - "{{ epel_package }}" - state: latest +- name: Run legacy EL7 DGX tasks + include_tasks: redhat-legacy-el7.yml + when: ansible_distribution_major_version == '7' -- name: Add DGX repo - yum_repository: - name: nvidia-dgx-7 - description: NVIDIA DGX EL7 - gpgkey: "{{ nvidia_dgx_rhel_gpgkey }}" - baseurl: "{{ nvidia_dgx_rhel_baseurl }}" - -- name: Install packages - yum: - name: "@DGX-1 Configurations" - state: present - update_cache: yes - when: ansible_product_name is search("DGX-1") - -- name: Install packages - yum: - name: "@DGX-2 Configurations" - state: present - update_cache: yes - when: ansible_product_name is search("DGX-2") - -- name: Install extra packages - yum: - name: "{{ dgx_extra_packages 
}}" - state: present - update_cache: yes - -- name: check kernel versions - yum: - list: kernel - register: yum_list - -- name: register installed kernel version - debug: - msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" - register: kernel_version - -- name: check kernel-headers versions - yum: - list: kernel-headers - register: yum_list - -- name: register installed kernel-headers version - debug: - msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" - register: kernel_headers_version - -- name: update kernel if headers don't match - yum: - name: - - kernel - - kernel-tools - - kernel-tools-libs - - kernel-devel - - kernel-debug-devel - - kernel-headers - state: latest - register: kernel_update - when: kernel_version.msg[0].release != kernel_headers_version.msg[0].release - tags: skip_ansible_lint - -- name: Install driver packages - package: - name: "{{ item }}" - with_items: - - cuda-drivers - - cuda-drivers-diagnostic - - dgx-persistence-mode - register: install_driver - notify: reboot after driver install - -- name: Install Docker and the NVIDIA container runtime - yum: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - docker - - "@NVIDIA container runtime" - notify: - - restart docker - -- name: Install DGX System Management Tools - yum: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - python36 - - "@DGX System Management" +- name: Run EL8 and EL9 DGX Software Stack tasks + include_tasks: redhat-el8-plus.yml + when: ansible_distribution_major_version in ['8', '9'] diff --git a/roles/nvidia-dgx/tasks/ubuntu-22.04.yml b/roles/nvidia-dgx/tasks/ubuntu-22.04.yml new file mode 100644 index 000000000..2240b4338 --- /dev/null +++ b/roles/nvidia-dgx/tasks/ubuntu-22.04.yml @@ -0,0 +1,124 @@ +--- +- name: Include DGX OS 6 variables + include_vars: ubuntu-22.04.yml + +- name: Determine DGX OS 6 platform package set + set_fact: + dgx_os6_platform: 
"{{ item }}" + loop: "{{ dgx_os6_platforms }}" + when: ansible_product_name is search(item.match) + +- name: Fail if DGX OS 6 platform package set is unknown + fail: + msg: "Unsupported DGX model for Ubuntu 22.04 DGX OS 6 role path: {{ ansible_product_name }}" + when: dgx_os6_platform is not defined + +- name: Create temporary DGX OS 6 repository archive path + tempfile: + state: file + suffix: -dgx-repo-files.tgz + register: dgx_os6_repo_archive + +- name: Download DGX OS 6 repository files + get_url: + url: "{{ dgx_os6_repo_files_url }}" + dest: "{{ dgx_os6_repo_archive.path }}" + mode: "0644" + +- name: Install DGX OS 6 repository files + unarchive: + src: "{{ dgx_os6_repo_archive.path }}" + dest: / + remote_src: true + +- name: Remove temporary DGX OS 6 repository archive + file: + path: "{{ dgx_os6_repo_archive.path }}" + state: absent + +- name: Update apt cache for DGX OS 6 repositories + apt: + update_cache: yes + +- name: Upgrade packages for DGX OS 6 + apt: + upgrade: dist + when: dgx_os6_upgrade_packages + +- name: Install DGX OS 6 system configurations and tools + apt: + name: "{{ dgx_os6_platform.config_packages + dgx_os6_common_packages }}" + state: present + update_cache: yes + +- name: Install DGX OS 6 NVIDIA driver packages + apt: + name: "{{ dgx_os6_driver_packages + (dgx_os6_fabricmanager_packages if dgx_os6_platform.nvswitch else []) }}" + state: present + update_cache: yes + register: install_driver + notify: reboot after driver install + when: dgx_os6_install_driver + +- name: Install DGX OS 6 Docker and NVIDIA container toolkit + apt: + name: "{{ dgx_os6_container_packages }}" + state: present + update_cache: yes + notify: + - restart docker + when: dgx_os6_install_container_runtime + +- name: Install DGX OS 6 NVSM packages + apt: + name: "{{ dgx_os6_nvsm_packages }}" + state: present + update_cache: yes + when: dgx_os6_install_nvsm + +- name: Install additional DGX OS 6 administration packages + apt: + name: "{{ dgx_os6_admin_packages }}" + 
state: present + update_cache: yes + when: dgx_os6_install_admin_packages + +- name: Install additional DGX OS 6 development packages + apt: + name: "{{ dgx_os6_development_packages }}" + state: present + update_cache: yes + when: dgx_os6_install_development_packages + +- name: Disable unattended upgrades for DGX OS 6 + apt: + name: unattended-upgrades + state: absent + purge: yes + when: dgx_os6_disable_unattended_upgrades + +- name: Install DGX OS 6 serial-over-LAN console package + apt: + name: nvidia-ipmisol + state: present + update_cache: yes + when: dgx_os6_install_ipmisol + +- name: Install DGX OS 6 logrotate package + apt: + name: nvidia-logrotate + state: present + update_cache: yes + when: dgx_os6_install_logrotate + +- name: Populate DGX OS 6 service facts + service_facts: + +- name: Enable available DGX OS 6 services + systemd: + name: "{{ item }}" + state: started + enabled: yes + daemon_reload: yes + loop: "{{ dgx_os6_services }}" + when: item in ansible_facts.services diff --git a/roles/nvidia-dgx/tasks/ubuntu-24.04.yml b/roles/nvidia-dgx/tasks/ubuntu-24.04.yml new file mode 100644 index 000000000..b2c8161a1 --- /dev/null +++ b/roles/nvidia-dgx/tasks/ubuntu-24.04.yml @@ -0,0 +1,92 @@ +--- +- name: Include DGX OS 7 variables + include_vars: ubuntu-24.04.yml + +- name: Determine DGX OS 7 platform package set + set_fact: + dgx_os7_platform: "{{ item }}" + loop: "{{ dgx_os7_platforms }}" + when: ansible_product_name is search(item.match) + +- name: Fail if DGX OS 7 platform package set is unknown + fail: + msg: "Unsupported DGX model for Ubuntu 24.04 DGX OS 7 role path: {{ ansible_product_name }}" + when: dgx_os7_platform is not defined + +- name: Create temporary DGX OS 7 repository archive path + tempfile: + state: file + suffix: -dgx-repo-files.tgz + register: dgx_os7_repo_archive + +- name: Download DGX OS 7 repository files + get_url: + url: "{{ dgx_os7_repo_files_url }}" + dest: "{{ dgx_os7_repo_archive.path }}" + mode: "0644" + +- name: Install 
DGX OS 7 repository files + unarchive: + src: "{{ dgx_os7_repo_archive.path }}" + dest: / + remote_src: true + +- name: Remove temporary DGX OS 7 repository archive + file: + path: "{{ dgx_os7_repo_archive.path }}" + state: absent + +- name: Update apt cache for DGX OS 7 repositories + apt: + update_cache: yes + +- name: Upgrade packages for DGX OS 7 + apt: + upgrade: dist + when: dgx_os7_upgrade_packages + +- name: Install DGX OS 7 system packages + apt: + name: "{{ dgx_os7_system_packages + ([dgx_os7_station_package] if dgx_os7_platform.station else []) }}" + state: present + update_cache: yes + +- name: Install DGX OS 7 kernel tools and peer memory loader + apt: + name: + - "{{ dgx_os7_platform.linux_tools_package }}" + - nvidia-peermem-loader + state: present + update_cache: yes + +- name: Install DGX OS 7 NVIDIA driver packages + apt: + name: "{{ dgx_os7_driver_packages + (dgx_os7_fabricmanager_packages if dgx_os7_platform.nvswitch else []) + (dgx_os7_nvlink5_packages if dgx_os7_platform.nvlink5 else []) + (dgx_os7_imex_packages if dgx_os7_platform.imex else []) }}" + state: present + update_cache: yes + register: install_driver + notify: reboot after driver install + when: dgx_os7_install_driver + +- name: Disable unattended upgrades for DGX OS 7 + apt: + name: unattended-upgrades + state: absent + purge: yes + when: dgx_os7_disable_unattended_upgrades + +- name: Populate DGX OS 7 service facts + service_facts: + +- name: Build DGX OS 7 service list + set_fact: + dgx_os7_enabled_services: "{{ dgx_os7_base_services + (dgx_os7_fabricmanager_services if dgx_os7_platform.nvswitch else []) + (dgx_os7_imex_services if dgx_os7_platform.imex else []) }}" + +- name: Enable available DGX OS 7 services + systemd: + name: "{{ item }}" + state: started + enabled: yes + daemon_reload: yes + loop: "{{ dgx_os7_enabled_services }}" + when: item in ansible_facts.services diff --git a/roles/nvidia-dgx/tasks/ubuntu-legacy.yml b/roles/nvidia-dgx/tasks/ubuntu-legacy.yml new file 
mode 100644 index 000000000..cdfb95f96 --- /dev/null +++ b/roles/nvidia-dgx/tasks/ubuntu-legacy.yml @@ -0,0 +1,384 @@ +--- +- name: Check for Ubuntu version + fail: + msg: "Role supports Ubuntu 18.04 and 20.04 only" + when: + - ansible_distribution_version != "20.04" + - ansible_distribution_version != "18.04" + +- name: include 18.04 specific variables for DGX OS 4 + include_vars: ubuntu-18.04.yml + when: ansible_distribution_version == "18.04" + +- name: include DGXA100 specific variables for DGX OS 4.99 + include_vars: dgxa100.yml + when: + - ansible_distribution_version == "18.04" + - ansible_product_name is search("DGXA100") + +- name: include 20.04 specific variables for DGX OS 5 + include_vars: ubuntu-20.04.yml + when: ansible_distribution_version == "20.04" + +- name: install dmidecode + apt: + name: dmidecode + state: present + update_cache: yes + +# Housekeeping +- name: register DGX product name + command: "dmidecode --string system-product-name" + register: dgx_name + +- name: register DGX-1 serial number + command: "dmidecode --string system-serial-number" + register: dgx1_serial + when: ansible_product_name is search("DGX-1") + +- name: register DGX-2 serial number + command: "dmidecode --string chassis-serial-number" + register: dgx2_serial + when: ansible_product_name is search("DGX-2") + +- name: register DGXA100 serial number + command: "dmidecode --string chassis-serial-number" + register: dgxa100_serial + when: ansible_product_name is search("DGXA100") + +- name: figure out which serial number we ended up with + set_fact: + dgx_serial: "{{ dgx1_serial.stdout }}" + when: dgx1_serial.skipped is not defined + +- name: figure out which serial number we ended up with + set_fact: + dgx_serial: "{{ dgx2_serial.stdout }}" + when: dgx2_serial.skipped is not defined + +- name: figure out which serial number we ended up with + set_fact: + dgx_serial: "{{ dgxa100_serial.stdout }}" + when: dgxa100_serial.skipped is not defined + +- name: fail if we don't 
recognize the DGX system + fail: + msg: "Unknown DGX model: {{ ansible_product_name }}" + when: dgx_serial is undefined + +- name: update dgx platform file + blockinfile: + path: /etc/dgx-release + create: yes + block: | + DGX_NAME="DGX Server" + DGX_PRETTY_NAME="NVIDIA DGX Server" + DGX_PLATFORM="DGX Server for {{ dgx_name.stdout }}" + DGX_SERIAL_NUMBER="{{ dgx_serial }}" + +- name: update dgx platform file with additional info for DGX OS 4 + blockinfile: + path: /etc/dgx-release + create: yes + block: | + DGX_SWBUILD_DATE="{{ DGX_SWBUILD_DATE }}" + DGX_SWBUILD_VERSION="{{ DGX_SWBUILD_VERSION }}" + DGX_COMMIT_ID="{{ DGX_COMMIT_ID }}" + when: ansible_distribution_version == "18.04" + +# Repos and installs +- name: remove ubuntu nvidia driver ppa if installed + apt_repository: + repo: ppa:graphics-drivers/ppa + state: absent + +- name: remove packages when using Bright Computing source image + apt: + name: "{{ item }}" + state: absent + purge: yes + with_items: + - cuda-dcgm + - cuda-driver + - libumad2sim0 + - libsim-utils + - ibutils + +- name: remove apt mirror when using Bright Computing source image + replace: + path: /etc/apt/sources.list + regexp: 'nl\.' 
+ +- name: unload drivers when using Bright Computing source image + modprobe: + name: "{{ item }}" + state: absent + with_items: + - ib_srp + - ib_iser + - rdma_ucm + - rpcrdma + - rdma_cm + +- name: add DGX repo keys for DGX OS 4 + apt: + deb: "{{ nvidia_dgx_ubuntu_gpgkey }}" + when: ansible_distribution_version == "18.04" + +- name: add DGX repo key for DGX OS 5 + apt_key: + url: "{{ nvidia_dgx_os5_ubuntu_gpgkey }}" + state: present + when: ansible_distribution_version == "20.04" + +- name: add DGX repo for DGX OS 4 + template: + src: dgx.list.j2 + dest: /etc/apt/sources.list.d/dgx.list + mode: 0644 + when: ansible_distribution_version == "18.04" + +- name: add DGX repo for DGX OS 5 + template: + src: dgxos5.list.j2 + dest: /etc/apt/sources.list.d/dgx-temp.list + mode: 0644 + when: ansible_distribution_version == "20.04" + +- name: update apt cache + apt: + update_cache: yes + +- name: add DGX cuda 10.1 repo for DGX OS 4 + apt: + name: dgx-bionic-r418+cuda10.1-repo + update_cache: yes + dpkg_options: "force-confdef,force-confold" + when: ansible_distribution_version == "18.04" + +- name: add DGX cuda 11.0 repo for DGX OS 4 + apt: + name: dgx-bionic-r450+cuda11.0-repo + update_cache: yes + dpkg_options: "force-confdef,force-confold" + when: ansible_distribution_version == "18.04" + +- name: add DGXA100 repo for DGX OS 4 + apt: + name: dgx-bionic-4.99-repo + update_cache: yes + dpkg_options: "force-confdef,force-confold" + when: + - ansible_distribution_version == "18.04" + - ansible_product_name is search("DGXA100") + +- name: install prerequisites for DGX OS 5 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ G_PREREQS }}" + - "{{ G_SETUP_PKGS }}" + when: ansible_distribution_version == "20.04" + +- name: install DGX repo package for DGX OS 5 + apt: + name: "{{ item }}" + update_cache: yes + dpkg_options: "force-confdef,force-confold" + with_items: + - "{{ G_DGX_REPO_PKG }}" + when: ansible_distribution_version == "20.04" + 
+- name: remove temporary repo package for DGX OS 5 + file: + path: /etc/apt/sources.list.d/dgx-temp.list + state: absent + when: ansible_distribution_version == "20.04" + +- name: install more packages for DGX OS 5 + apt: + name: "{{ item }}" + update_cache: yes + dpkg_options: "force-confdef,force-confold" + with_items: + - "{{ G_PKGS_DEFAULT }}" + when: ansible_distribution_version == "20.04" + +- name: install DGX-1 packages for DGX OS 4 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ PKGS_DGX1_ALL }}" + when: + - ansible_distribution_version == "18.04" + - ansible_product_name is search("DGX-1") + notify: + - restart docker + +- name: install DGX-1 packages for DGX OS 5 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ G_DGX_COMMON_PKGS }}" + - "{{ G_DGX1_PKGS }}" + when: + - ansible_distribution_version == "20.04" + - ansible_product_name is search("DGX-1") + notify: + - restart docker + +- name: install DGX-2 packages for DGX OS 4 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ PKGS_DGX2_ALL }}" + when: + - ansible_distribution_version == "18.04" + - ansible_product_name is search("DGX-2") + notify: + - restart docker + +- name: install DGX-2 packages for DGX OS 5 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ G_DGX_COMMON_PKGS }}" + - "{{ G_DGX2_PKGS }}" + when: + - ansible_distribution_version == "20.04" + - ansible_product_name is search("DGX-2") + notify: + - restart docker + +- name: install DGXA100 packages for DGX OS 4 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ PKGS_DGXA100_ALL }}" + when: + - ansible_distribution_version == "18.04" + - ansible_product_name is search("DGXA100") + notify: + - restart docker + +- name: install DGXA100 packages for DGX OS 5 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ 
G_DGX_COMMON_PKGS }}" + - "{{ G_DGX_A100_PKGS }}" + when: + - ansible_distribution_version == "20.04" + - ansible_product_name is search("DGXA100") + notify: + - restart docker + +- name: install extra DGX packages for DGX OS 4 + apt: + name: "{{ item }}" + state: present + update_cache: yes + with_items: + - "{{ dgx_extra_packages }}" + when: ansible_distribution_version == "18.04" + +- name: touch grub ipmisol.cfg for DGX OS 4 + file: + path: /etc/default/grub.d/ipmisol.cfg + state: touch + mode: u=rw,g=r,o=r + when: ansible_distribution_version == "18.04" + +- name: add DGX IPMI SOL package for DGX OS 4 + apt: + name: dgx-ipmisol + state: present + when: + - ansible_distribution_version == "18.04" + - dgx_name is search("DGX") + +- name: disable release update prompt for DGX OS 5 + lineinfile: + path: /etc/update-manager/release-upgrades + regexp: '^Prompt=.*' + line: Prompt=never + when: ansible_distribution_version == "20.04" + +- name: add ipmi kernel module at boot + lineinfile: + path: /etc/modules + line: ipmi_devintf + +- name: populate service facts + service_facts: + +- name: enable services + systemd: + name: "{{ item }}" + state: started + enabled: yes + daemon_reload: yes + loop: + - openibd.service + - dcgm.service + - nvidia-dcgm.service + - nvidia-persistenced.service + - nvidia-fabricmanager.service + when: item in services + +- name: enable PXE/UEFI on MLNX interfaces for DGX OS 5 + command: /usr/sbin/mlnx_pxe_setup.bash + when: ansible_distribution_version == "20.04" + +- name: configure nv peer mem service startup + command: /usr/sbin/update-rc.d nv_peer_mem defaults + +- name: configure default ubuntu repos for DGX OS 4 + template: + src: sources.list.j2 + dest: /etc/apt/sources.list + mode: 0644 + when: ansible_distribution_version == "18.04" + +- name: disable srp services + systemd: + name: "{{ item }}" + state: stopped + enabled: no + when: ansible_product_name is search("DGXA100") + with_items: + - srp_daemon + - srptools + +# 
setup_data_drive "/dev/sdb1" "dgx1cache" +- name: create raid directory for raid cache mount point + file: + path: "{{ cachefilesd_cache_dir }}" + state: directory + mode: "{{ cachefilesd_cache_dir_mode }}" + +- name: configure cachefilesd + template: + src: cachefilesd.conf.j2 + dest: /etc/cachefilesd.conf + +# Misc stuff +- name: restart docker service just in case... + systemd: + name: docker + state: restarted + enabled: yes diff --git a/roles/nvidia-dgx/tasks/ubuntu.yml b/roles/nvidia-dgx/tasks/ubuntu.yml index cdfb95f96..7a2821ccb 100644 --- a/roles/nvidia-dgx/tasks/ubuntu.yml +++ b/roles/nvidia-dgx/tasks/ubuntu.yml @@ -1,384 +1,17 @@ --- -- name: Check for Ubuntu version +- name: Check for supported Ubuntu DGX Software Stack version fail: - msg: "Role supports Ubuntu 18.04 and 20.04 only" - when: - - ansible_distribution_version != "20.04" - - ansible_distribution_version != "18.04" + msg: "Role supports Ubuntu 18.04, 20.04, 22.04, and 24.04 only" + when: ansible_distribution_version not in ['18.04', '20.04', '22.04', '24.04'] -- name: include 18.04 specific variables for DGX OS 4 - include_vars: ubuntu-18.04.yml - when: ansible_distribution_version == "18.04" +- name: Run legacy DGX OS 4/5 tasks + include_tasks: ubuntu-legacy.yml + when: ansible_distribution_version in ['18.04', '20.04'] -- name: include DGXA100 specific variables for DGX OS 4.99 - include_vars: dgxa100.yml - when: - - ansible_distribution_version == "18.04" - - ansible_product_name is search("DGXA100") +- name: Run DGX OS 6 tasks + include_tasks: ubuntu-22.04.yml + when: ansible_distribution_version == '22.04' -- name: include 20.04 specific variables for DGX OS 5 - include_vars: ubuntu-20.04.yml - when: ansible_distribution_version == "20.04" - -- name: install dmidecode - apt: - name: dmidecode - state: present - update_cache: yes - -# Housekeeping -- name: register DGX product name - command: "dmidecode --string system-product-name" - register: dgx_name - -- name: register DGX-1 serial 
number - command: "dmidecode --string system-serial-number" - register: dgx1_serial - when: ansible_product_name is search("DGX-1") - -- name: register DGX-2 serial number - command: "dmidecode --string chassis-serial-number" - register: dgx2_serial - when: ansible_product_name is search("DGX-2") - -- name: register DGXA100 serial number - command: "dmidecode --string chassis-serial-number" - register: dgxa100_serial - when: ansible_product_name is search("DGXA100") - -- name: figure out which serial number we ended up with - set_fact: - dgx_serial: "{{ dgx1_serial.stdout }}" - when: dgx1_serial.skipped is not defined - -- name: figure out which serial number we ended up with - set_fact: - dgx_serial: "{{ dgx2_serial.stdout }}" - when: dgx2_serial.skipped is not defined - -- name: figure out which serial number we ended up with - set_fact: - dgx_serial: "{{ dgxa100_serial.stdout }}" - when: dgxa100_serial.skipped is not defined - -- name: fail if we don't recognize the DGX system - fail: - msg: "Unknown DGX model: {{ ansible_product_name }}" - when: dgx_serial is undefined - -- name: update dgx platform file - blockinfile: - path: /etc/dgx-release - create: yes - block: | - DGX_NAME="DGX Server" - DGX_PRETTY_NAME="NVIDIA DGX Server" - DGX_PLATFORM="DGX Server for {{ dgx_name.stdout }}" - DGX_SERIAL_NUMBER="{{ dgx_serial }}" - -- name: update dgx platform file with additional info for DGX OS 4 - blockinfile: - path: /etc/dgx-release - create: yes - block: | - DGX_SWBUILD_DATE="{{ DGX_SWBUILD_DATE }}" - DGX_SWBUILD_VERSION="{{ DGX_SWBUILD_VERSION }}" - DGX_COMMIT_ID="{{ DGX_COMMIT_ID }}" - when: ansible_distribution_version == "18.04" - -# Repos and installs -- name: remove ubuntu nvidia driver ppa if installed - apt_repository: - repo: ppa:graphics-drivers/ppa - state: absent - -- name: remove packages when using Bright Computing source image - apt: - name: "{{ item }}" - state: absent - purge: yes - with_items: - - cuda-dcgm - - cuda-driver - - libumad2sim0 - - 
libsim-utils - - ibutils - -- name: remove apt mirror when using Bright Computing source image - replace: - path: /etc/apt/sources.list - regexp: 'nl\.' - -- name: unload drivers when using Bright Computing source image - modprobe: - name: "{{ item }}" - state: absent - with_items: - - ib_srp - - ib_iser - - rdma_ucm - - rpcrdma - - rdma_cm - -- name: add DGX repo keys for DGX OS 4 - apt: - deb: "{{ nvidia_dgx_ubuntu_gpgkey }}" - when: ansible_distribution_version == "18.04" - -- name: add DGX repo key for DGX OS 5 - apt_key: - url: "{{ nvidia_dgx_os5_ubuntu_gpgkey }}" - state: present - when: ansible_distribution_version == "20.04" - -- name: add DGX repo for DGX OS 4 - template: - src: dgx.list.j2 - dest: /etc/apt/sources.list.d/dgx.list - mode: 0644 - when: ansible_distribution_version == "18.04" - -- name: add DGX repo for DGX OS 5 - template: - src: dgxos5.list.j2 - dest: /etc/apt/sources.list.d/dgx-temp.list - mode: 0644 - when: ansible_distribution_version == "20.04" - -- name: update apt cache - apt: - update_cache: yes - -- name: add DGX cuda 10.1 repo for DGX OS 4 - apt: - name: dgx-bionic-r418+cuda10.1-repo - update_cache: yes - dpkg_options: "force-confdef,force-confold" - when: ansible_distribution_version == "18.04" - -- name: add DGX cuda 11.0 repo for DGX OS 4 - apt: - name: dgx-bionic-r450+cuda11.0-repo - update_cache: yes - dpkg_options: "force-confdef,force-confold" - when: ansible_distribution_version == "18.04" - -- name: add DGXA100 repo for DGX OS 4 - apt: - name: dgx-bionic-4.99-repo - update_cache: yes - dpkg_options: "force-confdef,force-confold" - when: - - ansible_distribution_version == "18.04" - - ansible_product_name is search("DGXA100") - -- name: install prerequisites for DGX OS 5 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ G_PREREQS }}" - - "{{ G_SETUP_PKGS }}" - when: ansible_distribution_version == "20.04" - -- name: install DGX repo package for DGX OS 5 - apt: - name: "{{ item }}" - 
update_cache: yes - dpkg_options: "force-confdef,force-confold" - with_items: - - "{{ G_DGX_REPO_PKG }}" - when: ansible_distribution_version == "20.04" - -- name: remove temporary repo package for DGX OS 5 - file: - path: /etc/apt/sources.list.d/dgx-temp.list - state: absent - when: ansible_distribution_version == "20.04" - -- name: install more packages for DGX OS 5 - apt: - name: "{{ item }}" - update_cache: yes - dpkg_options: "force-confdef,force-confold" - with_items: - - "{{ G_PKGS_DEFAULT }}" - when: ansible_distribution_version == "20.04" - -- name: install DGX-1 packages for DGX OS 4 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ PKGS_DGX1_ALL }}" - when: - - ansible_distribution_version == "18.04" - - ansible_product_name is search("DGX-1") - notify: - - restart docker - -- name: install DGX-1 packages for DGX OS 5 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ G_DGX_COMMON_PKGS }}" - - "{{ G_DGX1_PKGS }}" - when: - - ansible_distribution_version == "20.04" - - ansible_product_name is search("DGX-1") - notify: - - restart docker - -- name: install DGX-2 packages for DGX OS 4 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ PKGS_DGX2_ALL }}" - when: - - ansible_distribution_version == "18.04" - - ansible_product_name is search("DGX-2") - notify: - - restart docker - -- name: install DGX-2 packages for DGX OS 5 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ G_DGX_COMMON_PKGS }}" - - "{{ G_DGX2_PKGS }}" - when: - - ansible_distribution_version == "20.04" - - ansible_product_name is search("DGX-2") - notify: - - restart docker - -- name: install DGXA100 packages for DGX OS 4 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ PKGS_DGXA100_ALL }}" - when: - - ansible_distribution_version == "18.04" - - ansible_product_name is search("DGXA100") - notify: - - restart 
docker - -- name: install DGXA100 packages for DGX OS 5 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ G_DGX_COMMON_PKGS }}" - - "{{ G_DGX_A100_PKGS }}" - when: - - ansible_distribution_version == "20.04" - - ansible_product_name is search("DGXA100") - notify: - - restart docker - -- name: install extra DGX packages for DGX OS 4 - apt: - name: "{{ item }}" - state: present - update_cache: yes - with_items: - - "{{ dgx_extra_packages }}" - when: ansible_distribution_version == "18.04" - -- name: touch grub ipmisol.cfg for DGX OS 4 - file: - path: /etc/default/grub.d/ipmisol.cfg - state: touch - mode: u=rw,g=r,o=r - when: ansible_distribution_version == "18.04" - -- name: add DGX IPMI SOL package for DGX OS 4 - apt: - name: dgx-ipmisol - state: present - when: - - ansible_distribution_version == "18.04" - - dgx_name is search("DGX") - -- name: disable release update prompt for DGX OS 5 - lineinfile: - path: /etc/update-manager/release-upgrades - regexp: '^Prompt=.*' - line: Prompt=never - when: ansible_distribution_version == "20.04" - -- name: add ipmi kernel module at boot - lineinfile: - path: /etc/modules - line: ipmi_devintf - -- name: populate service facts - service_facts: - -- name: enable services - systemd: - name: "{{ item }}" - state: started - enabled: yes - daemon_reload: yes - loop: - - openibd.service - - dcgm.service - - nvidia-dcgm.service - - nvidia-persistenced.service - - nvidia-fabricmanager.service - when: item in services - -- name: enable PXE/UEFI on MLNX interfaces for DGX OS 5 - command: /usr/sbin/mlnx_pxe_setup.bash - when: ansible_distribution_version == "20.04" - -- name: configure nv peer mem service startup - command: /usr/sbin/update-rc.d nv_peer_mem defaults - -- name: configure default ubuntu repos for DGX OS 4 - template: - src: sources.list.j2 - dest: /etc/apt/sources.list - mode: 0644 - when: ansible_distribution_version == "18.04" - -- name: disable srp services - systemd: - name: "{{ item 
}}" - state: stopped - enabled: no - when: ansible_product_name is search("DGXA100") - with_items: - - srp_daemon - - srptools - -# setup_data_drive "/dev/sdb1" "dgx1cache" -- name: create raid directory for raid cache mount point - file: - path: "{{ cachefilesd_cache_dir }}" - state: directory - mode: "{{ cachefilesd_cache_dir_mode }}" - -- name: configure cachefilesd - template: - src: cachefilesd.conf.j2 - dest: /etc/cachefilesd.conf - -# Misc stuff -- name: restart docker service just in case... - systemd: - name: docker - state: restarted - enabled: yes +- name: Run DGX OS 7 tasks + include_tasks: ubuntu-24.04.yml + when: ansible_distribution_version == '24.04' diff --git a/roles/nvidia-dgx/vars/redhat.yml b/roles/nvidia-dgx/vars/redhat.yml index 5eaa1621d..1ed547e5d 100644 --- a/roles/nvidia-dgx/vars/redhat.yml +++ b/roles/nvidia-dgx/vars/redhat.yml @@ -3,3 +3,115 @@ dgx_repo_dir: "rhel{{ ansible_distribution_major_version }}" dgx_extra_packages: - dgx-conf-cachefilesd - kernel-headers + +dgx_redhat_manage_subscription_repos: true +dgx_redhat_upgrade_packages: false +dgx_redhat_install_driver: true +dgx_redhat_install_container_runtime: true +dgx_redhat_install_docker_ce: "{{ ansible_distribution_major_version == '8' }}" +dgx_redhat_install_cachefilesd: false +dgx_redhat_driver_branch: "{{ '580' if ansible_distribution_major_version == '9' else ('535-dkms' if dgx_redhat_platform.configuration_group == 'DGX H100 Configurations' else '525') }}" +dgx_redhat_use_open_kernel_modules: "{{ ansible_distribution_major_version == '9' and dgx_redhat_platform.nvswitch | default(false) }}" + +dgx_redhat_repo_setup_rpms: + "8": https://repo.download.nvidia.com/baseos/el/el-files/8/nvidia-repo-setup-21.06-1.el8.x86_64.rpm + "9": https://repo.download.nvidia.com/baseos/el/el-files/9/nvidia-repo-setup-25.02-1.el9.x86_64.rpm + +dgx_redhat_subscription_repos: + "8": + - rhel-8-for-x86_64-appstream-rpms + - rhel-8-for-x86_64-baseos-rpms + - 
codeready-builder-for-rhel-8-x86_64-rpms + "9": + - rhel-9-for-x86_64-appstream-rpms + - rhel-9-for-x86_64-baseos-rpms + - codeready-builder-for-rhel-9-x86_64-rpms + +dgx_redhat_platforms: + - match: "DGX-1" + configuration_group: "DGX-1 Configurations" + supported_major_versions: ["8"] + nvswitch: false + nvlink5: false + station: false + - match: "DGX-2" + configuration_group: "DGX-2 Configurations" + supported_major_versions: ["8"] + nvswitch: true + nvlink5: false + station: false + - match: "DGX[ -]?A100|DGXA100" + configuration_group: "DGX A100 Configurations" + supported_major_versions: ["8", "9"] + nvswitch: true + nvlink5: false + station: false + - match: "DGX[ -]?A800|DGXA800" + configuration_group: "DGX A800 Configurations" + supported_major_versions: ["8"] + nvswitch: true + nvlink5: false + station: false + - match: "DGX[ -]?H100|DGXH100" + configuration_group: "DGX H100 Configurations" + supported_major_versions: ["8", "9"] + nvswitch: true + nvlink5: false + station: false + - match: "DGX[ -]?H200|DGXH200" + configuration_group: "DGX H200 Configurations" + supported_major_versions: ["9"] + nvswitch: true + nvlink5: false + station: false + - match: "DGX[ -]?B200|DGXB200" + configuration_group: "DGX B200 Configurations" + supported_major_versions: ["9"] + nvswitch: true + nvlink5: true + station: false + - match: "DGX[ -]?B300|DGXB300" + configuration_group: "DGX B300 Configurations" + supported_major_versions: ["9"] + nvswitch: true + nvlink5: true + station: false + - match: "DGX Station A100" + configuration_group: "DGX Station A100 Configurations" + supported_major_versions: ["8", "9"] + nvswitch: false + nvlink5: false + station: true + - match: "DGX Station A800" + configuration_group: "DGX Station A800 Configurations" + supported_major_versions: ["8"] + nvswitch: false + nvlink5: false + station: true + - match: "^DGX Station$" + configuration_group: "DGX Station Configurations" + supported_major_versions: ["8"] + nvswitch: false + nvlink5: 
false + station: true + +dgx_redhat_driver_support_packages: + - nv-persistence-mode + +dgx_redhat_fabricmanager_packages: + - nvidia-fm-enable + +dgx_redhat_non_nvswitch_packages: + - "libnvidia-nscq-{{ dgx_redhat_driver_branch }}" + +dgx_redhat_nvlink5_packages: + - nvlsm + +dgx_redhat_station_packages: + - nvidia-conf-xconfig + - nv-docker-gpus + +dgx_redhat_services: + - docker.service + - nvidia-persistenced.service + - nvidia-fabricmanager.service diff --git a/roles/nvidia-dgx/vars/ubuntu-22.04.yml b/roles/nvidia-dgx/vars/ubuntu-22.04.yml new file mode 100644 index 000000000..a17da2b91 --- /dev/null +++ b/roles/nvidia-dgx/vars/ubuntu-22.04.yml @@ -0,0 +1,115 @@ +--- +dgx_os6_repo_files_url: https://repo.download.nvidia.com/baseos/ubuntu/jammy/dgx-repo-files.tgz + +dgx_os6_upgrade_packages: false +dgx_os6_install_driver: true +dgx_os6_install_container_runtime: true +dgx_os6_install_nvsm: true +dgx_os6_install_admin_packages: true +dgx_os6_install_development_packages: true +dgx_os6_disable_unattended_upgrades: true +dgx_os6_install_ipmisol: true +dgx_os6_install_logrotate: true + +dgx_os6_driver_branch: "550" + +dgx_os6_platforms: + - match: "DGX-1" + nvswitch: false + config_packages: + - dgx1-system-configurations + - dgx1-system-tools + - dgx1-system-tools-extra + - match: "DGX-2" + nvswitch: true + config_packages: + - dgx2-system-configurations + - dgx2-system-tools + - dgx2-system-tools-extra + - match: "DGX[ -]?A100|DGXA100" + nvswitch: true + config_packages: + - dgx-a100-system-configurations + - dgx-a100-system-tools + - dgx-a100-system-tools-extra + - "nvidia-utils-{{ dgx_os6_driver_branch }}-server" + - match: "DGX[ -]?H100|DGXH100" + nvswitch: true + config_packages: + - dgx-h100-system-configurations + - dgx-h100-system-tools + - dgx-h100-system-tools-extra + - "nvidia-utils-{{ dgx_os6_driver_branch }}-server" + - nvfwupd + - match: "DGX[ -]?H200|DGXH200" + nvswitch: true + config_packages: + - dgx-h200-system-configurations + - 
dgx-h200-system-tools + - dgx-h200-system-tools-extra + - "nvidia-utils-{{ dgx_os6_driver_branch }}-server" + - nvfwupd + +dgx_os6_common_packages: + - linux-tools-nvidia + - nvidia-peermem-loader + +dgx_os6_driver_packages: + - "nvidia-driver-{{ dgx_os6_driver_branch }}-server" + - "linux-modules-nvidia-{{ dgx_os6_driver_branch }}-server-nvidia" + - "libnvidia-nscq-{{ dgx_os6_driver_branch }}" + - nvidia-modprobe + - nv-persistence-mode + +dgx_os6_fabricmanager_packages: + - "nvidia-fabricmanager-{{ dgx_os6_driver_branch }}" + - nvidia-fm-enable + +dgx_os6_container_packages: + - docker-ce + - nvidia-container-toolkit + - nv-docker-options + +dgx_os6_nvsm_packages: + - nvsm + - nvidia-motd + +dgx_os6_admin_packages: + - chrpath + - cifs-utils + - fping + - gdisk + - iperf + - ipmitool + - lsscsi + - net-tools + - nfs-common + - quota + - rasdaemon + - pm-utils + - samba-common + - samba-libs + - sysstat + - vlan + +dgx_os6_development_packages: + - build-essential + - automake + - bison + - cmake + - dpatch + - flex + - gcc-multilib + - gdb + - g++-multilib + - libelf-dev + - libltdl-dev + - linux-tools-generic + - m4 + - swig + +dgx_os6_services: + - dcgm.service + - nvidia-dcgm.service + - nvidia-persistenced.service + - nvidia-fabricmanager.service diff --git a/roles/nvidia-dgx/vars/ubuntu-24.04.yml b/roles/nvidia-dgx/vars/ubuntu-24.04.yml new file mode 100644 index 000000000..bd05fa49d --- /dev/null +++ b/roles/nvidia-dgx/vars/ubuntu-24.04.yml @@ -0,0 +1,102 @@ +--- +dgx_os7_repo_files_url: "https://repo.download.nvidia.com/baseos/ubuntu/noble/{{ 'arm64' if ansible_architecture in ['aarch64', 'arm64'] else 'x86_64' }}/dgx-repo-files.tgz" + +dgx_os7_upgrade_packages: false +dgx_os7_install_driver: true +dgx_os7_disable_unattended_upgrades: true + +dgx_os7_platforms: + - match: "DGX Station A100" + station: true + nvswitch: false + nvlink5: false + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX Station A800" + station: true + nvswitch: 
false + nvlink5: false + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX Spark" + station: true + nvswitch: false + nvlink5: false + imex: false + linux_tools_package: linux-tools-nvidia-hwe-24.04 + - match: "DGX[ -]?A100|DGXA100" + station: false + nvswitch: true + nvlink5: false + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX[ -]?H100|DGXH100" + station: false + nvswitch: true + nvlink5: false + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX[ -]?H200|DGXH200" + station: false + nvswitch: true + nvlink5: false + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX[ -]?B200|DGXB200" + station: false + nvswitch: true + nvlink5: true + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX[ -]?B300|DGXB300" + station: false + nvswitch: true + nvlink5: true + imex: false + linux_tools_package: linux-tools-generic + - match: "DGX[ -]?GB200|DGXGB200" + station: false + nvswitch: false + nvlink5: false + imex: true + linux_tools_package: linux-tools-nvidia-64k-hwe-24.04 + - match: "DGX[ -]?GB300|DGXGB300" + station: false + nvswitch: false + nvlink5: false + imex: true + linux_tools_package: linux-tools-nvidia-64k-hwe-24.04 + +dgx_os7_system_packages: + - nvidia-system-core + - nvidia-system-utils + - nvidia-system-extra + +dgx_os7_station_package: nvidia-system-station + +dgx_os7_driver_packages: + - nvidia-driver-580-open + - libnvidia-nscq + - nvidia-modprobe + - datacenter-gpu-manager-4-cuda13 + - nv-persistence-mode + +dgx_os7_fabricmanager_packages: + - nvidia-fabricmanager + +dgx_os7_nvlink5_packages: + - nvlsm + - libnvsdm + +dgx_os7_imex_packages: + - nvidia-imex + +dgx_os7_base_services: + - nvidia-persistenced.service + - nvidia-dcgm.service + +dgx_os7_fabricmanager_services: + - nvidia-fabricmanager.service + +dgx_os7_imex_services: + - nvidia-imex.service From c8c74ed36ae992a9a56588d567496900162c2d6b Mon Sep 17 00:00:00 2001 From: Doug Holt Date: Wed, 27 
May 2026 10:24:23 -0600 Subject: [PATCH 2/2] fix(dgx): harden EL8 driver install --- docs/deepops/dgx-software-stack.md | 9 +++++---- roles/nvidia-dgx/tasks/redhat-el8-plus.yml | 22 +++++++++++++++------- roles/nvidia-dgx/vars/redhat.yml | 7 +++++-- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/docs/deepops/dgx-software-stack.md b/docs/deepops/dgx-software-stack.md index 9164c1a0d..ffd5db302 100644 --- a/docs/deepops/dgx-software-stack.md +++ b/docs/deepops/dgx-software-stack.md @@ -67,10 +67,11 @@ The role follows the official Red Hat DGX software guides: 4. Optionally install the NVIDIA driver module and support packages. 5. Optionally install Docker CE and the NVIDIA Container Runtime group. -The default driver stream follows the official examples: `525` on most EL8 -systems, `535-dkms` on EL8 DGX H100, and `580` on EL9. EL9 NVSwitch systems -install the open-kernel-module stream by default. Override the branch when a -validated DGX release note calls for another stream: +The default driver stream uses DKMS on EL8 so current EL8 minor kernels can +build a matching NVIDIA kernel module: `525-dkms` on most EL8 systems, +`535-dkms` on EL8 DGX H100, and `580` on EL9. EL9 NVSwitch systems install the +open-kernel-module stream by default. 
Override the branch when a validated DGX +release note calls for another stream: ```yaml dgx_redhat_driver_branch: "580" diff --git a/roles/nvidia-dgx/tasks/redhat-el8-plus.yml b/roles/nvidia-dgx/tasks/redhat-el8-plus.yml index ce4b1997c..e3137e2ae 100644 --- a/roles/nvidia-dgx/tasks/redhat-el8-plus.yml +++ b/roles/nvidia-dgx/tasks/redhat-el8-plus.yml @@ -41,13 +41,13 @@ when: dgx_redhat_upgrade_packages tags: skip_ansible_lint -- name: Install EL9 kernel development packages for DGX driver builds +- name: Install kernel development packages for DGX driver builds dnf: name: - "kernel-devel-{{ ansible_kernel }}" - "kernel-headers-{{ ansible_kernel }}" state: present - when: ansible_distribution_major_version == '9' + when: ansible_distribution_major_version == '9' or 'dkms' in dgx_redhat_driver_branch - name: Install Red Hat DGX configuration group dnf: @@ -70,13 +70,21 @@ else [dgx_redhat_driver_profile] }} -- name: Configure Red Hat DGX driver module specification +- name: Configure Red Hat DGX driver module specifications set_fact: - dgx_redhat_driver_module_spec: >- - nvidia-driver:{{ dgx_redhat_driver_stream }}/{{ '{' ~ (dgx_redhat_driver_profiles | join(',')) ~ '}' if (dgx_redhat_driver_profiles | length) > 1 else dgx_redhat_driver_profiles[0] }} + dgx_redhat_driver_module_specs: "{{ dgx_redhat_driver_profiles | map('regex_replace', '^(.*)$', 'nvidia-driver:' ~ dgx_redhat_driver_stream ~ '/\\1') | list }}" + +- name: Remove precompiled NVIDIA kmod headers before DKMS driver install + dnf: + name: nvidia-kmod-headers + state: absent + when: + - dgx_redhat_install_driver + - "'dkms' in dgx_redhat_driver_stream" - name: Install Red Hat DGX NVIDIA driver module - command: "dnf module install --nobest -y {{ dgx_redhat_driver_module_spec }}" + command: "dnf module install --nobest -y {{ item }}" + loop: "{{ dgx_redhat_driver_module_specs }}" register: dgx_redhat_driver_module_install changed_when: "'Nothing to do' not in dgx_redhat_driver_module_install.stdout" 
notify: reboot after driver install @@ -135,5 +143,5 @@ state: started enabled: yes daemon_reload: yes - loop: "{{ dgx_redhat_services }}" + loop: "{{ dgx_redhat_services + (dgx_redhat_fabricmanager_services if dgx_redhat_platform.nvswitch else []) }}" when: item in ansible_facts.services diff --git a/roles/nvidia-dgx/vars/redhat.yml b/roles/nvidia-dgx/vars/redhat.yml index 1ed547e5d..27a386c12 100644 --- a/roles/nvidia-dgx/vars/redhat.yml +++ b/roles/nvidia-dgx/vars/redhat.yml @@ -10,7 +10,8 @@ dgx_redhat_install_driver: true dgx_redhat_install_container_runtime: true dgx_redhat_install_docker_ce: "{{ ansible_distribution_major_version == '8' }}" dgx_redhat_install_cachefilesd: false -dgx_redhat_driver_branch: "{{ '580' if ansible_distribution_major_version == '9' else ('535-dkms' if dgx_redhat_platform.configuration_group == 'DGX H100 Configurations' else '525') }}" +dgx_redhat_driver_branch: "{{ '580' if ansible_distribution_major_version == '9' else ('535-dkms' if dgx_redhat_platform.configuration_group == 'DGX H100 Configurations' else '525-dkms') }}" +dgx_redhat_driver_package_branch: "{{ dgx_redhat_driver_branch | regex_replace('-dkms$', '') }}" dgx_redhat_use_open_kernel_modules: "{{ ansible_distribution_major_version == '9' and dgx_redhat_platform.nvswitch | default(false) }}" dgx_redhat_repo_setup_rpms: @@ -102,7 +103,7 @@ dgx_redhat_fabricmanager_packages: - nvidia-fm-enable dgx_redhat_non_nvswitch_packages: - - "libnvidia-nscq-{{ dgx_redhat_driver_branch }}" + - "libnvidia-nscq-{{ dgx_redhat_driver_package_branch }}" dgx_redhat_nvlink5_packages: - nvlsm @@ -114,4 +115,6 @@ dgx_redhat_station_packages: dgx_redhat_services: - docker.service - nvidia-persistenced.service + +dgx_redhat_fabricmanager_services: - nvidia-fabricmanager.service