diff --git a/data/cloud/openstack/arbutus.cloud.computecanada.ca.yaml b/data/cloud/openstack/arbutus.cloud.computecanada.ca.yaml index c606a9201..0fad834a6 100644 --- a/data/cloud/openstack/arbutus.cloud.computecanada.ca.yaml +++ b/data/cloud/openstack/arbutus.cloud.computecanada.ca.yaml @@ -4,3 +4,8 @@ profile::gpu::install::vgpu::rpm::packages: - nvidia-vgpu-kmod - nvidia-vgpu-gridd - nvidia-vgpu-tools +profile::gpu::install::dcgm_packages: [] +profile::gpu::services::names: + - nvidia-persistenced + - nvidia-gridd + diff --git a/data/cloud/openstack/identity.arbutus.alliancecan.ca.yaml b/data/cloud/openstack/identity.arbutus.alliancecan.ca.yaml index 65db968b2..88eaa8bc3 100644 --- a/data/cloud/openstack/identity.arbutus.alliancecan.ca.yaml +++ b/data/cloud/openstack/identity.arbutus.alliancecan.ca.yaml @@ -1,4 +1,9 @@ profile::gpu::install::vgpu::installer: bin profile::gpu::install::vgpu::bin::source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/NVIDIA-Linux-x86_64-580.105.08-grid.run profile::gpu::install::vgpu::gridd_content: "FeatureType=4" -profile::gpu::install::vgpu::token_source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/kalpa-prod.tok \ No newline at end of file +profile::gpu::install::vgpu::token_source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/kalpa-prod.tok +profile::gpu::install::dcgm_packages: + - datacenter-gpu-manager-4-proprietary + - datacenter-gpu-manager-4-core + - datacenter-gpu-manager-4-cuda13 + diff --git a/data/common.yaml b/data/common.yaml index 3f67fabd0..97f8ef00b 100644 --- a/data/common.yaml +++ b/data/common.yaml @@ -340,7 +340,7 @@ profile::gpu::install::passthrough::packages: - nvidia-persistenced - nvidia-driver-cuda -profile::prometheus::slurm_job_exporter::version: 0.4.11 +profile::prometheus::slurm_job_exporter::version: 0.4.12 metrix::prometheus_ip: "%{alias('terraform.tag_ip.mgmt.0')}" 
metrix::ldap_password: "%{alias('profile::freeipa::server::admin_password')}" diff --git a/site/profile/manifests/gpu.pp b/site/profile/manifests/gpu.pp index 11b16c5c0..800b40cef 100644 --- a/site/profile/manifests/gpu.pp +++ b/site/profile/manifests/gpu.pp @@ -8,6 +8,11 @@ } class profile::gpu::install ( + Array[String] $dcgm_packages = [ + 'datacenter-gpu-manager-4-proprietary', + 'datacenter-gpu-manager-4-core', + 'datacenter-gpu-manager-4-cuda12', + ], Optional[String] $lib_symlink_path = undef ) { $restrict_profiling = lookup('profile::gpu::restrict_profiling') @@ -22,6 +27,21 @@ source_pp => 'puppet:///modules/profile/gpu/nvidia-gpu.pp', } + $os = "rhel${::facts['os']['release']['major']}" + $arch = $::facts['os']['architecture'] + + exec { 'cuda-repo': + command => "dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo", + creates => "/etc/yum.repos.d/cuda-${os}.repo", + path => ['/usr/bin'], + } + if length($dcgm_packages) > 0 { + # DCGM is used by slurm-job-exporter to export GPU metrics + package { $dcgm_packages : + require => Exec['cuda-repo'], + } + } + file { '/etc/modprobe.d/nvidia.conf': ensure => file, owner => 'root', @@ -101,14 +121,6 @@ Array[String] $packages, String $nvidia_driver_stream = '550-dkms' ) { - $os = "rhel${::facts['os']['release']['major']}" - $arch = $::facts['os']['architecture'] - - exec { 'cuda-repo': - command => "dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo", - creates => "/etc/yum.repos.d/cuda-${os}.repo", - path => ['/usr/bin'], - } package { 'nvidia-stream': ensure => $nvidia_driver_stream, @@ -136,9 +148,6 @@ ], } - # Used by slurm-job-exporter to export GPU metrics - -> package { ['datacenter-gpu-manager-4-proprietary', 'datacenter-gpu-manager-4-core', 'datacenter-gpu-manager-4-cuda12']: } - -> augeas { 'nvidia-persistenced.service': context => 
'/files/lib/systemd/system/nvidia-persistenced.service/Service', changes => [ @@ -357,16 +366,19 @@ } } -class profile::gpu::services { - if ! profile::is_grid_vgpu() { - $gpu_services = ['nvidia-persistenced', 'nvidia-dcgm'] +class profile::gpu::services ( + Array[String] $names = ['nvidia-persistenced', 'nvidia-dcgm'], +) { + if profile::is_grid_vgpu() { + $gpu_services = unique($names + ['nvidia-gridd']) } else { - $gpu_services = ['nvidia-persistenced', 'nvidia-gridd'] + $gpu_services = $names } + service { $gpu_services: ensure => 'running', enable => true, - notify => Service['slurm-job-exporter'] + notify => Service['slurm-job-exporter'], } exec { 'stop_nvidia_services':