Skip to content

Commit 2b34bb4

Browse files
committed
Install nvidia datacenter manager for VGPU too
1 parent a3c80de commit 2b34bb4

3 files changed

Lines changed: 39 additions & 17 deletions

File tree

data/cloud/openstack/arbutus.cloud.computecanada.ca.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,8 @@ profile::gpu::install::vgpu::rpm::packages:
44
- nvidia-vgpu-kmod
55
- nvidia-vgpu-gridd
66
- nvidia-vgpu-tools
7+
profile::gpu::install::dcgm_packages: []
8+
profile::gpu::services::names:
9+
- nvidia-persistenced
10+
- nvidia-gridd
11+
Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
profile::gpu::install::vgpu::installer: bin
22
profile::gpu::install::vgpu::bin::source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/NVIDIA-Linux-x86_64-580.105.08-grid.run
33
profile::gpu::install::vgpu::gridd_content: "FeatureType=4"
4-
profile::gpu::install::vgpu::token_source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/kalpa-prod.tok
4+
profile::gpu::install::vgpu::token_source: https://object-arbutus.alliancecan.ca/swift/v1/6c87c15eb7d2468daf3d2bd0c58bbfce/vgpu/kalpa-prod.tok
5+
profile::gpu::install::dcgm_packages:
6+
- datacenter-gpu-manager-4-proprietary
7+
- datacenter-gpu-manager-4-core
8+
- datacenter-gpu-manager-4-cuda13
9+

site/profile/manifests/gpu.pp

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
}
99

1010
class profile::gpu::install (
11+
Array[String] $dcgm_packages = [
12+
'datacenter-gpu-manager-4-proprietary',
13+
'datacenter-gpu-manager-4-core',
14+
'datacenter-gpu-manager-4-cuda12',
15+
],
1116
Optional[String] $lib_symlink_path = undef
1217
) {
1318
$restrict_profiling = lookup('profile::gpu::restrict_profiling')
@@ -22,6 +27,21 @@
2227
source_pp => 'puppet:///modules/profile/gpu/nvidia-gpu.pp',
2328
}
2429

30+
$os = "rhel${::facts['os']['release']['major']}"
31+
$arch = $::facts['os']['architecture']
32+
33+
# Register NVIDIA's CUDA dnf repository for this OS release/architecture.
# The .repo file is fetched over HTTPS so its baseurl/gpgkey settings cannot
# be tampered with in transit; `creates` makes the exec a no-op once the
# repo file is present.
exec { 'cuda-repo':
34+
command => "dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo",
35+
creates => "/etc/yum.repos.d/cuda-${os}.repo",
36+
path => ['/usr/bin'],
37+
}
38+
# Install the DCGM packages only when the list is non-empty; sites can set
# profile::gpu::install::dcgm_packages to [] to skip the installation.
if length($dcgm_packages) > 0 {
39+
# DCGM is used by slurm-job-exporter to export GPU metrics
40+
package { $dcgm_packages :
41+
require => Exec['cuda-repo'], # the CUDA repo is declared as an exec resource, not a yumrepo
42+
}
43+
}
44+
2545
file { '/etc/modprobe.d/nvidia.conf':
2646
ensure => file,
2747
owner => 'root',
@@ -101,14 +121,6 @@
101121
Array[String] $packages,
102122
String $nvidia_driver_stream = '550-dkms'
103123
) {
104-
$os = "rhel${::facts['os']['release']['major']}"
105-
$arch = $::facts['os']['architecture']
106-
107-
exec { 'cuda-repo':
108-
command => "dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/${os}/${arch}/cuda-${os}.repo",
109-
creates => "/etc/yum.repos.d/cuda-${os}.repo",
110-
path => ['/usr/bin'],
111-
}
112124

113125
package { 'nvidia-stream':
114126
ensure => $nvidia_driver_stream,
@@ -136,9 +148,6 @@
136148
],
137149
}
138150

139-
# Used by slurm-job-exporter to export GPU metrics
140-
-> package { ['datacenter-gpu-manager-4-proprietary', 'datacenter-gpu-manager-4-core', 'datacenter-gpu-manager-4-cuda12']: }
141-
142151
-> augeas { 'nvidia-persistenced.service':
143152
context => '/files/lib/systemd/system/nvidia-persistenced.service/Service',
144153
changes => [
@@ -357,16 +366,19 @@
357366
}
358367
}
359368

360-
class profile::gpu::services {
361-
if ! profile::is_grid_vgpu() {
362-
$gpu_services = ['nvidia-persistenced', 'nvidia-dcgm']
369+
class profile::gpu::services (
370+
Array[String] $names = ['nvidia-persistenced', 'nvidia-dcgm'],
371+
) {
372+
# On GRID vGPU nodes, manage the nvidia-gridd service in addition to the
# configured $names; unique() avoids a duplicate service resource when
# $names already lists it.
if profile::is_grid_vgpu() {
373+
$gpu_services = unique($names + ['nvidia-gridd'])
363374
} else {
364-
$gpu_services = ['nvidia-persistenced', 'nvidia-gridd']
375+
$gpu_services = $names
365376
}
377+
366378
service { $gpu_services:
367379
ensure => 'running',
368380
enable => true,
369-
notify => Service['slurm-job-exporter']
381+
notify => Service['slurm-job-exporter'],
370382
}
371383

372384
exec { 'stop_nvidia_services':

0 commit comments

Comments
 (0)