Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions .github/workflows/e2e-vitest-scenarios.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2437,6 +2437,109 @@ jobs:
if-no-files-found: ignore
retention-days: 14

# Free-standing live Vitest job for the "jetson-nvmap-gpu" scenario
# (see FREE_STANDING_SCENARIO_ID below).
jetson-nvmap-gpu-vitest:
  needs: generate-matrix
  # Runs when neither `inputs.jobs` nor `inputs.scenarios` is set, or when the
  # comma-separated `inputs.jobs` lists this job id, or `inputs.scenarios`
  # lists the scenario id. Wrapping both sides in commas makes the match
  # whole-token rather than substring.
  if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',jetson-nvmap-gpu-vitest,') || contains(format(',{0},', inputs.scenarios), ',jetson-nvmap-gpu,') }}
  # Runner label comes from the JETSON_E2E_RUNNER_LABEL variable; the literal
  # label is the fallback when that variable is empty/unset.
  runs-on: ${{ vars.JETSON_E2E_RUNNER_LABEL || 'linux-arm64-gpu-jetson-orin-latest-1' }}
  timeout-minutes: 60
  env:
    FREE_STANDING_VITEST_JOB: "1"
    FREE_STANDING_SCENARIO_ID: "jetson-nvmap-gpu"
    # Job-private Docker config directory; created by the login step and
    # removed by the final cleanup step.
    DOCKER_CONFIG: ${{ github.workspace }}/.docker-config-jetson-nvmap-gpu
    # Same directory that the artifact upload step publishes (relative path there).
    E2E_ARTIFACT_DIR: ${{ github.workspace }}/e2e-artifacts/vitest/jetson-nvmap-gpu
    NEMOCLAW_CLI_BIN: ${{ github.workspace }}/bin/nemoclaw.js
    NEMOCLAW_RUN_E2E_SCENARIOS: "1"
    NEMOCLAW_NON_INTERACTIVE: "1"
    NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1"
    NEMOCLAW_SANDBOX_NAME: "e2e-jetson-nvmap"
    NEMOCLAW_RECREATE_SANDBOX: "1"
    NEMOCLAW_PROVIDER: "ollama"
    OPENSHELL_GATEWAY: "nemoclaw"
  steps:
    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
      with:
        persist-credentials: false

    # Best-effort Docker Hub login: exits successfully without logging in when
    # either credential is empty, otherwise tries up to three times (30s
    # timeout per attempt, 5s pause between attempts). A final failure only
    # emits a warning, so the job continues with anonymous pulls.
    - name: Authenticate to Docker Hub
      env:
        DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
        DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
      shell: bash
      run: |
        set -euo pipefail
        if [[ -z "${DOCKERHUB_USERNAME}" || -z "${DOCKERHUB_TOKEN}" ]]; then
          echo "::notice::Docker Hub credentials not configured; continuing with anonymous pulls."
          exit 0
        fi
        mkdir -p "${DOCKER_CONFIG}"
        chmod 700 "${DOCKER_CONFIG}"
        login_succeeded=0
        for attempt in 1 2 3; do
          if echo "${DOCKERHUB_TOKEN}" | timeout 30s docker login docker.io --username "${DOCKERHUB_USERNAME}" --password-stdin; then
            login_succeeded=1
            break
          fi
          if [[ "$attempt" -lt 3 ]]; then
            echo "::warning::Docker Hub login attempt ${attempt} failed; retrying."
            sleep 5
          fi
        done
        if [[ "$login_succeeded" -ne 1 ]]; then
          echo "::warning::Docker Hub login failed after 3 attempts; continuing with anonymous pulls."
        fi

    - name: Set up Node
      uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.0.0
      with:
        node-version: 22
        cache: npm

    - name: Install root dependencies
      run: npm ci --ignore-scripts

    - name: Build CLI
      run: npm run build:cli

    # Diagnostic dump. The Tegra release file and /dev/nvmap lookups fall back
    # to a placeholder message when missing, so only the two `docker info`
    # calls can fail this step.
    - name: Verify Jetson GPU availability
      run: |
        set -euo pipefail
        echo "=== Tegra release ==="
        cat /etc/nv_tegra_release 2>/dev/null || echo "(no /etc/nv_tegra_release)"
        echo ""
        echo "=== /dev/nvmap ==="
        ls -l /dev/nvmap 2>/dev/null || echo "(no /dev/nvmap)"
        echo ""
        echo "=== Docker ==="
        docker info --format '{{.ServerVersion}}'
        docker info --format '{{json .Runtimes}}'

    - name: Run Jetson nvmap GPU live Vitest test
      # Migrated from test/e2e/test-jetson-nvmap-gpu.sh. Keeps the
      # Jetson/Tegra host, NVIDIA Docker runtime, install.sh, OpenShell
      # sandbox exec, /dev/nvmap, CUDA cuInit(0), and status-proof boundary.
      run: |
        set -euo pipefail
        npx vitest run --project e2e-scenarios-live \
          test/e2e-scenario/live/jetson-nvmap-gpu.test.ts \
          --silent=false --reporter=default

    # Always runs, including after a failed test step; a missing artifact
    # directory is ignored rather than treated as an error.
    - name: Upload Jetson nvmap GPU artifacts
      if: always()
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: e2e-vitest-scenarios-jetson-nvmap-gpu
        path: e2e-artifacts/vitest/jetson-nvmap-gpu/
        include-hidden-files: false
        if-no-files-found: ignore
        retention-days: 14

    # Always runs so the job-private Docker config directory is removed even
    # when earlier steps failed; a failing logout is tolerated.
    - name: Clean up Docker auth
      if: always()
      run: |
        set -euo pipefail
        docker logout docker.io || true
        rm -rf "${DOCKER_CONFIG}"

concurrent-gateway-ports-vitest:
needs: generate-matrix
if: ${{ (inputs.jobs == '' && inputs.scenarios == '') || contains(format(',{0},', inputs.jobs), ',concurrent-gateway-ports-vitest,') || contains(format(',{0},', inputs.scenarios), ',concurrent-gateway-ports,') }}
Expand Down Expand Up @@ -4816,6 +4919,7 @@ jobs:
messaging-providers-vitest,
launchable-smoke-vitest,
double-onboard-vitest,
jetson-nvmap-gpu-vitest,
concurrent-gateway-ports-vitest,
full-e2e-vitest,
cloud-onboard-vitest,
Expand Down
215 changes: 215 additions & 0 deletions test/e2e-scenario/live/jetson-nvmap-gpu.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

/** Live Vitest replacement for test/e2e/test-jetson-nvmap-gpu.sh. */

import path from "node:path";

import { buildAvailabilityProbeEnv } from "../fixtures/availability-env.ts";
import type { HostCliClient } from "../fixtures/clients/host.ts";
import { resultText } from "../fixtures/clients/index.ts";
import { type SandboxClient, trustedSandboxShellScript } from "../fixtures/clients/sandbox.ts";
import { expect, test } from "../fixtures/e2e-test.ts";
import { shouldRunLiveE2EScenarios } from "../fixtures/live-project-gate.ts";
import type { ShellProbeResult } from "../fixtures/shell-probe.ts";

// Directory three levels above this test file's directory; used as the working
// directory for host commands.
const REPO_ROOT = path.resolve(import.meta.dirname, "../../..");
// Sandbox name under test; NEMOCLAW_SANDBOX_NAME overrides the built-in default.
const SANDBOX_NAME = process.env.NEMOCLAW_SANDBOX_NAME ?? "e2e-jetson-nvmap";
// Overall Vitest timeout for the scenario: 50 minutes, expressed in milliseconds.
const TIMEOUT_MS = 50 * 60_000;
// Registers the scenario as a real test only when the live-scenario gate is
// open; otherwise it is registered through test.skip.
const liveTest = shouldRunLiveE2EScenarios() ? test : test.skip;

/**
 * Builds the environment handed to every host and sandbox command in this scenario.
 *
 * Layering, lowest to highest precedence: the availability-probe environment,
 * the scenario's pinned NemoClaw/OpenShell settings, then `extra`.
 *
 * @param extra - Per-call overrides; any key here wins over the defaults.
 * @returns A fresh environment object.
 */
function env(extra: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv {
  const probeEnv = buildAvailabilityProbeEnv();
  // Provider and gateway honour an existing process-level value before
  // falling back to the scenario defaults.
  const scenarioEnv: NodeJS.ProcessEnv = {
    NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1",
    NEMOCLAW_NON_INTERACTIVE: "1",
    NEMOCLAW_PROVIDER: process.env.NEMOCLAW_PROVIDER ?? "ollama",
    NEMOCLAW_RECREATE_SANDBOX: "1",
    NEMOCLAW_SANDBOX_NAME: SANDBOX_NAME,
    OPENSHELL_GATEWAY: process.env.OPENSHELL_GATEWAY ?? "nemoclaw",
  };
  return { ...probeEnv, ...scenarioEnv, ...extra };
}

/**
 * Runs a shell script on the host through `bash -lc`, from the repository root,
 * with the scenario environment.
 *
 * @param host - Host CLI client used to spawn the command.
 * @param script - Shell source passed to `bash -lc`.
 * @param artifactName - Name forwarded to the client as `artifactName`.
 * @param timeoutMs - Command timeout in milliseconds (defaults to 60 seconds).
 * @returns The probe result produced by `host.command`.
 */
async function hostShell(
  host: HostCliClient,
  script: string,
  artifactName: string,
  timeoutMs = 60_000,
): Promise<ShellProbeResult> {
  const bashArgs = ["-lc", script];
  const commandOptions = { artifactName, cwd: REPO_ROOT, env: env(), timeoutMs };
  const outcome = await host.command("bash", bashArgs, commandOptions);
  return outcome;
}

/**
 * Best-effort teardown of everything the scenario may have created: the
 * NemoClaw sandbox, the OpenShell sandbox and gateway, and any lingering
 * Ollama server / auth-proxy processes.
 *
 * The script runs with `set +e` and every command is followed by `|| true`,
 * and a rejected `hostShell` promise is deliberately swallowed, so cleanup
 * never fails the test on its own.
 *
 * @param host - Host CLI client used to run the cleanup script.
 */
async function cleanupJetsonSandbox(host: HostCliClient): Promise<void> {
  // The pkill patterns are written as "[o]llama ..." on purpose. This script is
  // passed to `bash -lc`, so its full text is part of that shell's command
  // line, and `pkill -f` matches against full command lines. A plain
  // "ollama serve" pattern would therefore match (and terminate) the shell
  // running this cleanup before the remaining commands execute. The bracket
  // expression still matches real "ollama ..." processes but not its own text.
  await hostShell(
    host,
    String.raw`set +e
if command -v nemoclaw >/dev/null 2>&1; then
nemoclaw "$NEMOCLAW_SANDBOX_NAME" destroy --yes 2>/dev/null || true
fi
if command -v openshell >/dev/null 2>&1; then
openshell sandbox delete "$NEMOCLAW_SANDBOX_NAME" 2>/dev/null || true
openshell gateway destroy -g nemoclaw 2>/dev/null || true
fi
pkill -f "[o]llama serve" 2>/dev/null || true
pkill -f "[o]llama-auth-proxy" 2>/dev/null || true`,
    "cleanup-jetson-nvmap",
    120_000,
  ).catch(() => undefined);
}

/**
 * Asserts that a numeric group id appears in whitespace-separated `id -G` style output.
 *
 * @param idGroupsOutput - Text containing whitespace-separated group ids.
 * @param gid - Group id to look for; must be a non-empty string of digits.
 */
function expectGroupMembership(idGroupsOutput: string, gid: string): void {
  // Validate the gid first so an empty or malformed value fails with a clear message.
  expect(gid).toMatch(/^[0-9]+$/u);
  const reportedGroups = idGroupsOutput.trim().split(/\s+/u);
  expect(reportedGroups).toContain(gid);
}

// Live scenario migrated from test/e2e/test-jetson-nvmap-gpu.sh (issue 4231).
// Phases: hardware gate -> host /dev/nvmap GID capture -> Docker/NVIDIA runtime
// checks -> optional Ollama install -> install.sh onboarding -> sandbox group,
// device-node and CUDA cuInit(0) proofs -> `nemoclaw status` proof.
liveTest(
  "Jetson nvmap GPU onboard grants device-node group and reports verified CUDA",
  { timeout: TIMEOUT_MS },
  async ({ artifacts, cleanup, host, sandbox, skip }) => {
    await artifacts.writeJson("scenario.json", {
      id: "jetson-nvmap-gpu",
      legacySource: "test/e2e/test-jetson-nvmap-gpu.sh",
      issue: 4231,
      boundary:
        "Jetson/Tegra host + install.sh Ollama onboard + Docker NVIDIA runtime + OpenShell sandbox exec + CUDA cuInit proof + nemoclaw status",
      sandboxName: SANDBOX_NAME,
    });

    // A1: non-Jetson hosts skip cleanly before mutating Docker/OpenShell state.
    const hardwareGate = await hostShell(
      host,
      String.raw`if [ -e /dev/nvmap ]; then
echo "jetson:/dev/nvmap"
elif [ -f /etc/nv_tegra_release ]; then
echo "jetson:/etc/nv_tegra_release"
elif [ -r /proc/device-tree/model ] && grep -qi "jetson\|orin\|tegra" /proc/device-tree/model 2>/dev/null; then
printf 'jetson:model:'
tr -d '\0' </proc/device-tree/model
printf '\n'
else
echo "non-jetson"
fi`,
      "phase-0-jetson-hardware-gate",
    );
    expect(hardwareGate.exitCode, resultText(hardwareGate)).toBe(0);
    // Explicit branch instead of a bare `cond || skip(...)` expression statement.
    if (!hardwareGate.stdout.startsWith("jetson:")) {
      skip(
        "Not a Jetson/Tegra host (/dev/nvmap absent) — reporter workflow requires Jetson hardware; hermetic #4231 coverage remains in src/lib/onboard/docker-gpu-patch.test.ts.",
      );
    }

    // Register teardown before touching any state, then start from a clean slate.
    cleanup.add("destroy Jetson nvmap sandbox", () => cleanupJetsonSandbox(host));
    await cleanupJetsonSandbox(host);

    // Capture the numeric GID that owns the host /dev/nvmap; A5 compares the
    // sandbox user's groups against it.
    const hostNvmap = await hostShell(
      host,
      "ls -l /dev/nvmap && stat -c 'gid=%g group=%G' /dev/nvmap",
      "phase-0-host-nvmap",
    );
    expect(hostNvmap.exitCode, resultText(hostNvmap)).toBe(0);
    expect(hostNvmap.stdout).toContain("/dev/nvmap");
    const hostNvmapGid = hostNvmap.stdout.match(/gid=([0-9]+)/u)?.[1] ?? "";
    expect(hostNvmapGid).toMatch(/^[0-9]+$/u);

    // Guard against the scenario environment losing its non-interactive settings.
    expect(env().NEMOCLAW_NON_INTERACTIVE).toBe("1");
    expect(env().NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE).toBe("1");

    // A2: Jetson prerequisites match the legacy lane: Docker and the NVIDIA runtime.
    const docker = await host.command("docker", ["info"], {
      artifactName: "phase-1-docker-info",
      env: env(),
      timeoutMs: 30_000,
    });
    expect(docker.exitCode, resultText(docker)).toBe(0);
    const dockerRuntimes = await host.command(
      "docker",
      ["info", "--format", "{{json .Runtimes}}"],
      {
        artifactName: "phase-1-docker-runtimes",
        env: env(),
        timeoutMs: 30_000,
      },
    );
    expect(dockerRuntimes.exitCode, resultText(dockerRuntimes)).toBe(0);
    expect(resultText(dockerRuntimes)).toMatch(/"nvidia"|nvidia:/u);

    // A3: preserve the reporter workflow by installing/running the real onboarding shell path.
    // The pkill pattern is "[o]llama serve" rather than "ollama serve": this
    // script text is part of the `bash -lc` command line, and `pkill -f`
    // matches full command lines, so the plain pattern would terminate the
    // shell running this step and make the exit-code assertion below fail on
    // hosts where Ollama had to be installed.
    const installOllama = await hostShell(
      host,
      'if [ "${NEMOCLAW_PROVIDER:-ollama}" = "ollama" ] && ! command -v ollama >/dev/null 2>&1; then\n' +
        " curl -fsSL https://ollama.com/install.sh | sh 2>&1 || true\n" +
        " systemctl stop ollama 2>/dev/null || true\n" +
        ' pkill -f "[o]llama serve" 2>/dev/null || true\n' +
        "fi",
      "phase-1-install-ollama-if-needed",
      10 * 60_000,
    );
    expect(installOllama.exitCode, resultText(installOllama)).toBe(0);

    const install = await host.command("bash", ["install.sh", "--non-interactive"], {
      artifactName: "phase-2-install-jetson-nvmap",
      cwd: REPO_ROOT,
      env: env(),
      timeoutMs: 40 * 60_000,
    });
    // Persist the installer log before asserting so it is available on failure.
    await artifacts.writeText("install-jetson-nvmap.log", resultText(install));
    expect(install.exitCode, resultText(install)).toBe(0);

    const installedCli = await hostShell(host, "command -v nemoclaw", "phase-2-command-v-nemoclaw");
    expect(installedCli.exitCode, resultText(installedCli)).toBe(0);
    expect(installedCli.stdout.trim()).not.toBe("");

    // A4: the Jetson recreate must grant Tegra device-node groups via --group-add.
    expect(resultText(install)).toContain(
      "Granting sandbox user access to Jetson Tegra GPU device nodes via --group-add",
    );

    // A5: the sandbox user must be in the host /dev/nvmap owning GID.
    const sandboxId = await sandbox.execShell(SANDBOX_NAME, trustedSandboxShellScript("id -G"), {
      artifactName: "phase-3-sandbox-id-groups",
      env: env(),
      timeoutMs: 60_000,
    });
    expect(sandboxId.exitCode, resultText(sandboxId)).toBe(0);
    expectGroupMembership(resultText(sandboxId), hostNvmapGid);

    // A6: /dev/nvmap must be mounted/present inside the sandbox.
    const sandboxNvmap = await sandbox.execShell(
      SANDBOX_NAME,
      trustedSandboxShellScript("ls -l /dev/nvmap"),
      { artifactName: "phase-3-sandbox-nvmap", env: env(), timeoutMs: 60_000 },
    );
    expect(sandboxNvmap.exitCode, resultText(sandboxNvmap)).toBe(0);
    expect(resultText(sandboxNvmap)).toContain("/dev/nvmap");

    // A7: authoritative CUDA usability proof must succeed, not reproduce
    // NvRmMemInitNvmap permission denial / cuInit(0)=999 from #4231.
    const cudaProbe = await sandbox.execShell(
      SANDBOX_NAME,
      trustedSandboxShellScript(
        `python3 -c 'import ctypes; lib = ctypes.CDLL("libcuda.so.1"); rc = lib.cuInit(0); print(f"cuInit(0)={rc}"); raise SystemExit(0 if rc == 0 else 1)'`,
      ),
      { artifactName: "phase-3-sandbox-cuda-cuinit", env: env(), timeoutMs: 120_000 },
    );
    expect(resultText(cudaProbe)).not.toMatch(/NvRmMemInitNvmap|Permission denied/u);
    expect(cudaProbe.exitCode, resultText(cudaProbe)).toBe(0);
    expect(resultText(cudaProbe)).toContain("cuInit(0)=0");

    // A8: status must say enabled with verified CUDA, never bare/unverified/failed.
    const status = await hostShell(
      host,
      `nemoclaw "$NEMOCLAW_SANDBOX_NAME" status`,
      "phase-4-nemoclaw-status",
      120_000,
    );
    expect(status.exitCode, resultText(status)).toBe(0);
    expect(resultText(status)).toContain("Sandbox GPU: enabled");
    expect(resultText(status)).toContain("CUDA verified");
    expect(resultText(status)).not.toMatch(/last CUDA proof failed|CUDA unverified/u);
  },
);
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ jobs:

it(
"keeps each free-standing scenario out of the registry matrix",
testTimeoutOptions(240_000),
testTimeoutOptions(420_000),
() => {
const inventory = readFreeStandingJobsInventory();
for (const job of inventory.allowedJobs) {
Expand Down
2 changes: 2 additions & 0 deletions tools/e2e-scenarios/workflow-boundary.mts
Original file line number Diff line number Diff line change
Expand Up @@ -4840,6 +4840,8 @@ export function validateE2eVitestScenariosWorkflowBoundary(
"gateway-health-honest",
);

validateFreeStandingJobSelector(errors, jobs, "jetson-nvmap-gpu-vitest", "jetson-nvmap-gpu");

validateFreeStandingJobSelector(
errors,
jobs,
Expand Down