|
8 | 8 | } |
9 | 9 |
|
10 | 10 | class profile::gpu::install ( |
| 11 | + Array[String] $dcgm_packages = [ |
| 12 | + 'datacenter-gpu-manager-4-proprietary', |
| 13 | + 'datacenter-gpu-manager-4-core', |
| 14 | + 'datacenter-gpu-manager-4-cuda12', |
| 15 | + ], |
11 | 16 | Optional[String] $lib_symlink_path = undef |
12 | 17 | ) { |
13 | 18 | $restrict_profiling = lookup('profile::gpu::restrict_profiling') |
|
22 | 27 | source_pp => 'puppet:///modules/profile/gpu/nvidia-gpu.pp', |
23 | 28 | } |
24 | 29 |
|
| 30 | + $os = "rhel${::facts['os']['release']['major']}" |
| 31 | + $arch = $::facts['os']['architecture'] |
| 32 | + |
| 33 | + exec { 'cuda-repo': |
| 34 | + command => "dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo", |
| 35 | + creates => "/etc/yum.repos.d/cuda-${os}.repo", |
| 36 | + path => ['/usr/bin'], |
| 37 | + } |
| 38 | + if length($dcgm_packages) > 0 { |
| 39 | + # DCGM is used by slurm-job-exporter to export GPU metrics; the repo is added by Exec['cuda-repo'], so it must run first |
| 40 | + package { $dcgm_packages : |
| 41 | + require => Exec['cuda-repo'], |
| 42 | + } |
| 43 | + } |
| 44 | + |
25 | 45 | file { '/etc/modprobe.d/nvidia.conf': |
26 | 46 | ensure => file, |
27 | 47 | owner => 'root', |
|
101 | 121 | Array[String] $packages, |
102 | 122 | String $nvidia_driver_stream = '550-dkms' |
103 | 123 | ) { |
104 | | - $os = "rhel${::facts['os']['release']['major']}" |
105 | | - $arch = $::facts['os']['architecture'] |
106 | | - |
107 | | - exec { 'cuda-repo': |
108 | | - command => "dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo", |
109 | | - creates => "/etc/yum.repos.d/cuda-${os}.repo", |
110 | | - path => ['/usr/bin'], |
111 | | - } |
112 | 124 |
|
113 | 125 | package { 'nvidia-stream': |
114 | 126 | ensure => $nvidia_driver_stream, |
|
136 | 148 | ], |
137 | 149 | } |
138 | 150 |
|
139 | | - # Used by slurm-job-exporter to export GPU metrics |
140 | | - -> package { ['datacenter-gpu-manager-4-proprietary', 'datacenter-gpu-manager-4-core', 'datacenter-gpu-manager-4-cuda12']: } |
141 | | - |
142 | 151 | -> augeas { 'nvidia-persistenced.service': |
143 | 152 | context => '/files/lib/systemd/system/nvidia-persistenced.service/Service', |
144 | 153 | changes => [ |
|
357 | 366 | } |
358 | 367 | } |
359 | 368 |
|
360 | | -class profile::gpu::services { |
361 | | - if ! profile::is_grid_vgpu() { |
362 | | - $gpu_services = ['nvidia-persistenced', 'nvidia-dcgm'] |
| 369 | +class profile::gpu::services ( |
| 370 | + Array[String] $names = ['nvidia-persistenced', 'nvidia-dcgm'], |
| 371 | +) { |
| 372 | + if profile::is_grid_vgpu() { |
| 373 | + $gpu_services = unique($names + ['nvidia-gridd']) # hyphenated, like the other service names |
363 | 374 | } else { |
364 | | - $gpu_services = ['nvidia-persistenced', 'nvidia-gridd'] |
| 375 | + $gpu_services = $names |
365 | 376 | } |
| 377 | + |
366 | 378 | service { $gpu_services: |
367 | 379 | ensure => 'running', |
368 | 380 | enable => true, |
369 | | - notify => Service['slurm-job-exporter'] |
| 381 | + notify => Service['slurm-job-exporter'], |
370 | 382 | } |
371 | 383 |
|
372 | 384 | exec { 'stop_nvidia_services': |
|
0 commit comments