From a2c1b6c4d25300edb31199100092fea5925e7db9 Mon Sep 17 00:00:00 2001 From: Julie Yaunches Date: Mon, 22 Jun 2026 17:00:13 -0400 Subject: [PATCH 1/3] test(e2e): migrate test-jetson-nvmap-gpu.sh to vitest --- .github/workflows/e2e-vitest-scenarios.yaml | 104 +++++++++ .../live/jetson-nvmap-gpu.test.ts | 217 ++++++++++++++++++ .../e2e-scenarios-workflow.test.ts | 2 +- tools/e2e-scenarios/workflow-boundary.mts | 2 + 4 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 test/e2e-scenario/live/jetson-nvmap-gpu.test.ts diff --git a/.github/workflows/e2e-vitest-scenarios.yaml b/.github/workflows/e2e-vitest-scenarios.yaml index 1902447188..f6ddd4777b 100644 --- a/.github/workflows/e2e-vitest-scenarios.yaml +++ b/.github/workflows/e2e-vitest-scenarios.yaml @@ -2437,6 +2437,109 @@ jobs: if-no-files-found: ignore retention-days: 14 + jetson-nvmap-gpu-vitest: + needs: generate-matrix + if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',jetson-nvmap-gpu-vitest,') || contains(format(',{0},', inputs.scenarios), ',jetson-nvmap-gpu,') }} + runs-on: ${{ vars.JETSON_E2E_RUNNER_LABEL || 'linux-arm64-gpu-jetson-orin-latest-1' }} + timeout-minutes: 60 + env: + FREE_STANDING_VITEST_JOB: "1" + FREE_STANDING_SCENARIO_ID: "jetson-nvmap-gpu" + DOCKER_CONFIG: ${{ github.workspace }}/.docker-config-jetson-nvmap-gpu + E2E_ARTIFACT_DIR: ${{ github.workspace }}/e2e-artifacts/vitest/jetson-nvmap-gpu + NEMOCLAW_CLI_BIN: ${{ github.workspace }}/bin/nemoclaw.js + NEMOCLAW_RUN_E2E_SCENARIOS: "1" + NEMOCLAW_NON_INTERACTIVE: "1" + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" + NEMOCLAW_SANDBOX_NAME: "e2e-jetson-nvmap" + NEMOCLAW_RECREATE_SANDBOX: "1" + NEMOCLAW_PROVIDER: "ollama" + OPENSHELL_GATEWAY: "nemoclaw" + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + + - name: Authenticate to Docker Hub + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + shell: bash + run: | + set -euo pipefail + if [[ -z "${DOCKERHUB_USERNAME}" || -z "${DOCKERHUB_TOKEN}" ]]; then + echo "::notice::Docker Hub credentials not configured; continuing with anonymous pulls." + exit 0 + fi + mkdir -p "${DOCKER_CONFIG}" + chmod 700 "${DOCKER_CONFIG}" + login_succeeded=0 + for attempt in 1 2 3; do + if echo "${DOCKERHUB_TOKEN}" | timeout 30s docker login docker.io --username "${DOCKERHUB_USERNAME}" --password-stdin; then + login_succeeded=1 + break + fi + if [[ "$attempt" -lt 3 ]]; then + echo "::warning::Docker Hub login attempt ${attempt} failed; retrying." + sleep 5 + fi + done + if [[ "$login_succeeded" -ne 1 ]]; then + echo "::warning::Docker Hub login failed after 3 attempts; continuing with anonymous pulls." + fi + + - name: Set up Node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.0.0 + with: + node-version: 22 + cache: npm + + - name: Install root dependencies + run: npm ci --ignore-scripts + + - name: Build CLI + run: npm run build:cli + + - name: Verify Jetson GPU availability + run: | + set -euo pipefail + echo "=== Tegra release ===" + cat /etc/nv_tegra_release 2>/dev/null || echo "(no /etc/nv_tegra_release)" + echo "" + echo "=== /dev/nvmap ===" + ls -l /dev/nvmap 2>/dev/null || echo "(no /dev/nvmap)" + echo "" + echo "=== Docker ===" + docker info --format '{{.ServerVersion}}' + docker info --format '{{json .Runtimes}}' + + - name: Run Jetson nvmap GPU live Vitest test + # Migrated from test/e2e/test-jetson-nvmap-gpu.sh. Keeps the + # Jetson/Tegra host, NVIDIA Docker runtime, install.sh, OpenShell + # sandbox exec, /dev/nvmap, CUDA cuInit(0), and status-proof boundary. + run: | + set -euo pipefail + npx vitest run --project e2e-scenarios-live \ + test/e2e-scenario/live/jetson-nvmap-gpu.test.ts \ + --silent=false --reporter=default + + - name: Upload Jetson nvmap GPU artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: e2e-vitest-scenarios-jetson-nvmap-gpu + path: e2e-artifacts/vitest/jetson-nvmap-gpu/ + include-hidden-files: false + if-no-files-found: ignore + retention-days: 14 + + - name: Clean up Docker auth + if: always() + run: | + set -euo pipefail + docker logout docker.io || true + rm -rf "${DOCKER_CONFIG}" + concurrent-gateway-ports-vitest: needs: generate-matrix if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',concurrent-gateway-ports-vitest,') || contains(format(',{0},', inputs.scenarios), ',concurrent-gateway-ports,') }} @@ -4816,6 +4919,7 @@ jobs: messaging-providers-vitest, launchable-smoke-vitest, double-onboard-vitest, + jetson-nvmap-gpu-vitest, concurrent-gateway-ports-vitest, full-e2e-vitest, cloud-onboard-vitest, diff --git a/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts new file mode 100644 index 0000000000..998371585d --- /dev/null +++ b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts @@ -0,0 +1,217 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** Live Vitest replacement for test/e2e/test-jetson-nvmap-gpu.sh. */ + +import path from "node:path"; + +import { buildAvailabilityProbeEnv } from "../fixtures/availability-env.ts"; +import type { HostCliClient } from "../fixtures/clients/host.ts"; +import { resultText } from "../fixtures/clients/index.ts"; +import { type SandboxClient, trustedSandboxShellScript } from "../fixtures/clients/sandbox.ts"; +import { expect, test } from "../fixtures/e2e-test.ts"; +import { shouldRunLiveE2EScenarios } from "../fixtures/live-project-gate.ts"; +import type { ShellProbeResult } from "../fixtures/shell-probe.ts"; + +const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); +const SANDBOX_NAME = process.env.NEMOCLAW_SANDBOX_NAME ?? "e2e-jetson-nvmap"; +const TIMEOUT_MS = 50 * 60_000; +const liveTest = shouldRunLiveE2EScenarios() ? test : test.skip; + +function env(extra: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { + return { + ...buildAvailabilityProbeEnv(), + NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1", + NEMOCLAW_NON_INTERACTIVE: "1", + NEMOCLAW_PROVIDER: process.env.NEMOCLAW_PROVIDER ?? "ollama", + NEMOCLAW_RECREATE_SANDBOX: "1", + NEMOCLAW_SANDBOX_NAME: SANDBOX_NAME, + OPENSHELL_GATEWAY: process.env.OPENSHELL_GATEWAY ?? "nemoclaw", + ...extra, + }; +} + +async function hostShell( + host: HostCliClient, + script: string, + artifactName: string, + timeoutMs = 60_000, +): Promise { + return await host.command("bash", ["-lc", script], { + artifactName, + cwd: REPO_ROOT, + env: env(), + timeoutMs, + }); +} + +async function cleanupJetsonSandbox(host: HostCliClient): Promise { + await hostShell( + host, + String.raw`set +e +if command -v nemoclaw >/dev/null 2>&1; then + nemoclaw "$NEMOCLAW_SANDBOX_NAME" destroy --yes 2>/dev/null || true +fi +if command -v openshell >/dev/null 2>&1; then + openshell sandbox delete "$NEMOCLAW_SANDBOX_NAME" 2>/dev/null || true + openshell gateway destroy -g nemoclaw 2>/dev/null || true +fi +pkill -f "ollama serve" 2>/dev/null || true +pkill -f "ollama-auth-proxy" 2>/dev/null || true`, + "cleanup-jetson-nvmap", + 120_000, + ).catch(() => undefined); +} + +function expectGroupMembership(idOutput: string, gid: string): void { + expect(gid).toMatch(/^[0-9]+$/u); + const groupPattern = new RegExp(`(^|[(,=])${gid}([(,) ]|$)`, "u"); + expect(idOutput).toMatch(groupPattern); +} + +liveTest( + "Jetson nvmap GPU onboard grants device-node group and reports verified CUDA", + { timeout: TIMEOUT_MS }, + async ({ artifacts, cleanup, host, sandbox, skip }) => { + await artifacts.writeJson("scenario.json", { + id: "jetson-nvmap-gpu", + legacySource: "test/e2e/test-jetson-nvmap-gpu.sh", + issue: 4231, + boundary: + "Jetson/Tegra host + install.sh Ollama onboard + Docker NVIDIA runtime + OpenShell sandbox exec + CUDA cuInit proof + nemoclaw status", + sandboxName: SANDBOX_NAME, + }); + + // A1: non-Jetson hosts skip cleanly before mutating Docker/OpenShell state. + const hardwareGate = await hostShell( + host, + String.raw`if [ -e /dev/nvmap ]; then + echo "jetson:/dev/nvmap" +elif [ -f /etc/nv_tegra_release ]; then + echo "jetson:/etc/nv_tegra_release" +elif [ -r /proc/device-tree/model ] && grep -qi "jetson\|orin\|tegra" /proc/device-tree/model 2>/dev/null; then + printf 'jetson:model:' + tr -d '\0' cleanupJetsonSandbox(host)); + await cleanupJetsonSandbox(host); + + const hostNvmap = await hostShell( + host, + "ls -l /dev/nvmap && stat -c 'gid=%g group=%G' /dev/nvmap", + "phase-0-host-nvmap", + ); + expect(hostNvmap.exitCode, resultText(hostNvmap)).toBe(0); + expect(hostNvmap.stdout).toContain("/dev/nvmap"); + const hostNvmapGid = hostNvmap.stdout.match(/gid=([0-9]+)/u)?.[1] ?? ""; + expect(hostNvmapGid).toMatch(/^[0-9]+$/u); + + expect(env().NEMOCLAW_NON_INTERACTIVE).toBe("1"); + expect(env().NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE).toBe("1"); + + // A2: Jetson prerequisites match the legacy lane: Docker and the NVIDIA runtime. + const docker = await host.command("docker", ["info"], { + artifactName: "phase-1-docker-info", + env: env(), + timeoutMs: 30_000, + }); + expect(docker.exitCode, resultText(docker)).toBe(0); + const dockerRuntimes = await host.command( + "docker", + ["info", "--format", "{{json .Runtimes}}"], + { + artifactName: "phase-1-docker-runtimes", + env: env(), + timeoutMs: 30_000, + }, + ); + expect(dockerRuntimes.exitCode, resultText(dockerRuntimes)).toBe(0); + expect(resultText(dockerRuntimes)).toMatch(/"nvidia"|nvidia:/u); + + // A3: preserve the reporter workflow by installing/running the real onboarding shell path. + const installOllama = await hostShell( + host, + 'if [ "${NEMOCLAW_PROVIDER:-ollama}" = "ollama" ] && ! command -v ollama >/dev/null 2>&1; then\n' + + " curl -fsSL https://ollama.com/install.sh | sh 2>&1 || true\n" + + " systemctl stop ollama 2>/dev/null || true\n" + + ' pkill -f "ollama serve" 2>/dev/null || true\n' + + "fi", + "phase-1-install-ollama-if-needed", + 10 * 60_000, + ); + expect(installOllama.exitCode, resultText(installOllama)).toBe(0); + + const install = await host.command("bash", ["install.sh", "--non-interactive"], { + artifactName: "phase-2-install-jetson-nvmap", + cwd: REPO_ROOT, + env: env(), + timeoutMs: 40 * 60_000, + }); + await artifacts.writeText("install-jetson-nvmap.log", resultText(install)); + expect(install.exitCode, resultText(install)).toBe(0); + + const installedCli = await hostShell(host, "command -v nemoclaw", "phase-2-command-v-nemoclaw"); + expect(installedCli.exitCode, resultText(installedCli)).toBe(0); + expect(installedCli.stdout.trim()).not.toBe(""); + + // A4: the Jetson recreate must grant Tegra device-node groups via --group-add. + expect(resultText(install)).toContain( + "Granting sandbox user access to Jetson Tegra GPU device nodes via --group-add", + ); + + // A5: the sandbox user must be in the host /dev/nvmap owning GID. + const sandboxId = await sandbox.execShell(SANDBOX_NAME, trustedSandboxShellScript("id"), { + artifactName: "phase-3-sandbox-id", + env: env(), + timeoutMs: 60_000, + }); + expect(sandboxId.exitCode, resultText(sandboxId)).toBe(0); + expectGroupMembership(resultText(sandboxId), hostNvmapGid); + + // A6: /dev/nvmap must be mounted/present inside the sandbox. + const sandboxNvmap = await sandbox.execShell( + SANDBOX_NAME, + trustedSandboxShellScript("ls -l /dev/nvmap"), + { artifactName: "phase-3-sandbox-nvmap", env: env(), timeoutMs: 60_000 }, + ); + expect(sandboxNvmap.exitCode, resultText(sandboxNvmap)).toBe(0); + expect(resultText(sandboxNvmap)).toContain("/dev/nvmap"); + + // A7: authoritative CUDA usability proof must succeed, not reproduce + // NvRmMemInitNvmap permission denial / cuInit(0)=999 from #4231. + const cudaProbe = await sandbox.execShell( + SANDBOX_NAME, + trustedSandboxShellScript( + `python3 -c 'import ctypes; lib = ctypes.CDLL("libcuda.so.1"); rc = lib.cuInit(0); print(f"cuInit(0)={rc}"); raise SystemExit(0 if rc == 0 else 1)'`, + ), + { artifactName: "phase-3-sandbox-cuda-cuinit", env: env(), timeoutMs: 120_000 }, + ); + expect(resultText(cudaProbe)).not.toMatch(/NvRmMemInitNvmap|Permission denied/u); + expect(cudaProbe.exitCode, resultText(cudaProbe)).toBe(0); + expect(resultText(cudaProbe)).toContain("cuInit(0)=0"); + + // A8: status must say enabled with verified CUDA, never bare/unverified/failed. + const status = await hostShell( + host, + `nemoclaw "$NEMOCLAW_SANDBOX_NAME" status`, + "phase-4-nemoclaw-status", + 120_000, + ); + expect(status.exitCode, resultText(status)).toBe(0); + expect(resultText(status)).toContain("Sandbox GPU: enabled"); + expect(resultText(status)).toContain("CUDA verified"); + expect(resultText(status)).not.toMatch(/last CUDA proof failed|CUDA unverified/u); + }, +); diff --git a/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts b/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts index 7578e91d51..37c1a21f5f 100644 --- a/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts +++ b/test/e2e-scenario/support-tests/e2e-scenarios-workflow.test.ts @@ -740,7 +740,7 @@ jobs: it( "keeps each free-standing scenario out of the registry matrix", - testTimeoutOptions(240_000), + testTimeoutOptions(420_000), () => { const inventory = readFreeStandingJobsInventory(); for (const job of inventory.allowedJobs) { diff --git a/tools/e2e-scenarios/workflow-boundary.mts b/tools/e2e-scenarios/workflow-boundary.mts index 09d2b25481..25788514f9 100644 --- a/tools/e2e-scenarios/workflow-boundary.mts +++ b/tools/e2e-scenarios/workflow-boundary.mts @@ -4840,6 +4840,8 @@ export function validateE2eVitestScenariosWorkflowBoundary( "gateway-health-honest", ); + validateFreeStandingJobSelector(errors, jobs, "jetson-nvmap-gpu-vitest", "jetson-nvmap-gpu"); + validateFreeStandingJobSelector( errors, jobs, From 52c886dbb232055fce5c4c6009973d95f4a7397d Mon Sep 17 00:00:00 2001 From: Julie Yaunches Date: Mon, 22 Jun 2026 17:04:51 -0400 Subject: [PATCH 2/3] test(e2e): avoid Jetson test if branch --- test/e2e-scenario/live/jetson-nvmap-gpu.test.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts index 998371585d..bbde02bb66 100644 --- a/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts +++ b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts @@ -99,11 +99,10 @@ fi`, "phase-0-jetson-hardware-gate", ); expect(hardwareGate.exitCode, resultText(hardwareGate)).toBe(0); - if (!hardwareGate.stdout.startsWith("jetson:")) { + hardwareGate.stdout.startsWith("jetson:") || skip( "Not a Jetson/Tegra host (/dev/nvmap absent) — reporter workflow requires Jetson hardware; hermetic #4231 coverage remains in src/lib/onboard/docker-gpu-patch.test.ts.", ); - } cleanup.add("destroy Jetson nvmap sandbox", () => cleanupJetsonSandbox(host)); await cleanupJetsonSandbox(host); From 4901991cc92b2de9c588dc80344e46706e3ec83d Mon Sep 17 00:00:00 2001 From: Carlos Villela Date: Mon, 22 Jun 2026 22:15:57 -0700 Subject: [PATCH 3/3] test(e2e): tighten Jetson nvmap group assertion Signed-off-by: Carlos Villela --- test/e2e-scenario/live/jetson-nvmap-gpu.test.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts index bbde02bb66..ed1c45910f 100644 --- a/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts +++ b/test/e2e-scenario/live/jetson-nvmap-gpu.test.ts @@ -63,10 +63,9 @@ pkill -f "ollama-auth-proxy" 2>/dev/null || true`, ).catch(() => undefined); } -function expectGroupMembership(idOutput: string, gid: string): void { +function expectGroupMembership(idGroupsOutput: string, gid: string): void { expect(gid).toMatch(/^[0-9]+$/u); - const groupPattern = new RegExp(`(^|[(,=])${gid}([(,) ]|$)`, "u"); - expect(idOutput).toMatch(groupPattern); + expect(idGroupsOutput.trim().split(/\s+/u)).toContain(gid); } liveTest( @@ -171,8 +170,8 @@ fi`, ); // A5: the sandbox user must be in the host /dev/nvmap owning GID. - const sandboxId = await sandbox.execShell(SANDBOX_NAME, trustedSandboxShellScript("id"), { - artifactName: "phase-3-sandbox-id", + const sandboxId = await sandbox.execShell(SANDBOX_NAME, trustedSandboxShellScript("id -G"), { + artifactName: "phase-3-sandbox-id-groups", env: env(), timeoutMs: 60_000, });