diff --git a/docs/README.md b/docs/README.md
index 096f962a..d486dd88 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -618,6 +618,13 @@ available models per region
 ##### Incus
 
 - `target`: name of the [specific cluster member](https://linuxcontainers.org/incus/docs/main/howto/cluster_manage_instance/#launch-an-instance-on-a-specific-cluster-member) to deploy the instance. **Only use with Incus cluster.**
+* `gpu_pci`: list of [PCI addresses of the GPU devices](https://linuxcontainers.org/incus/docs/main/reference/devices_gpu/#devices-gpu_physical:pci) to pass through to instances on the node. Use `incus info --resources` to list available resources. The Incus host must have the NVIDIA GPU driver and the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#with-dnf-rhel-centos-fedora-amazon-linux) installed.
 ***
+
+**Limitations:**
+ * The node `count` **must be 1**
+ * The container **must be unprivileged**
+ * **Do not use the `gpu` tag**, as it would install the NVIDIA driver. We use the host’s driver instead.
+
 
 #### 4.7.3 Post build modification effect
diff --git a/examples/incus/main.tf b/examples/incus/main.tf
index 33b9d980..ab4ecfa6 100644
--- a/examples/incus/main.tf
+++ b/examples/incus/main.tf
@@ -15,6 +15,9 @@ module "incus" {
     mgmt  = { type = "container", cpus = 4, ram = 6000, gpus = 0, tags = ["puppet", "mgmt", "nfs"], count = 1 }
     login = { type = "container", cpus = 2, ram = 3000, gpus = 0, tags = ["login", "proxy"], count = 1 }
     node  = { type = "container", cpus = 2, ram = 3000, gpus = 0, tags = ["node"], count = 1 }
+    # Uncomment the following line to mount a GPU. The PCI id must match the GPU and the container must be unprivileged
+    # Do not use the gpu tag, as it would install the NVIDIA driver. We use the host’s driver instead.
+    # node_gpu = { type = "container", cpus = 2, ram = 3000, gpus = 0, tags = ["node"], count = 1, gpu_pci = ["0000:00:06.0"] }
   }
 
   firewall_rules = {
diff --git a/examples/incus/unprivileged.yaml b/examples/incus/unprivileged.yaml
index 0ec6b1bf..097980ad 100644
--- a/examples/incus/unprivileged.yaml
+++ b/examples/incus/unprivileged.yaml
@@ -6,6 +6,9 @@ lookup_options:
 
 jupyterhub::kernel::venv::python: "3.12"
 
+profile::cvmfs::local_user::uid: 40001
+profile::cvmfs::local_user::gid: 40001
+
 magic_castle::site::all:
   - profile::base
   - profile::consul
diff --git a/incus/infrastructure.tf b/incus/infrastructure.tf
index 5ad78c89..7451db99 100644
--- a/incus/infrastructure.tf
+++ b/incus/infrastructure.tf
@@ -91,6 +91,8 @@ resource "incus_instance" "instances" {
   config = {
     "cloud-init.user-data" = module.configuration.user_data[each.key]
     "security.privileged"  = var.privileged
+    # nvidia.runtime is incompatible with privileged containers
+    "nvidia.runtime" = length(try(each.value.gpu_pci, [])) > 0 ? !var.privileged : false
   }
 
   device {
@@ -137,6 +139,19 @@
     }
   }
+  dynamic "device" {
+    for_each = length(try(each.value.gpu_pci, [])) > 0 ? { for idx, pci in each.value.gpu_pci : idx => pci } : {}
+
+    content {
+      name = "gpu${device.key}"
+      type = "gpu"
+      properties = {
+        gputype = "physical"
+        pci     = device.value
+      }
+    }
+  }
+
   wait_for {
     type = "ipv4"
   }
 }
@@ -145,9 +160,14 @@ locals {
   inventory = { for host, values in module.design.instances : host => {
-      prefix = values.prefix
-      tags   = values.tags
-      specs  = values.specs
+      prefix = values.prefix
+      tags   = values.tags
+      specs = merge(
+        values.specs,
+        {
+          gpus = length(try(values.gpu_pci, []))
+        }
+      )
       volumes = {}
     }
   }