diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79b079fb..260a6093 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -61,6 +61,7 @@ jobs: - name: Run tests env: + GO_TEST_TIMEOUT: 600s # Docker auth for tests running as root (sudo) DOCKER_CONFIG: /home/debianuser/.docker # TLS/ACME testing (optional - tests will skip if not configured) @@ -118,6 +119,7 @@ jobs: - name: Run tests env: + GO_TEST_TIMEOUT: 600s DEFAULT_HYPERVISOR: vz JWT_SECRET: ci-test-secret run: make test diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 21d7c836..1813f547 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -469,7 +469,7 @@ Note: Full integration tests require Linux. On macOS, focus on unit tests and ma 1. **Disk Format**: vz only supports raw disk images (not qcow2). The image pipeline handles conversion automatically. -2. **Snapshots**: Not currently supported on the vz hypervisor. +2. **Snapshot Compatibility**: vz save/restore requires macOS 14.0+ on Apple Silicon and a VM configuration that passes save/restore validation. ### Troubleshooting @@ -496,6 +496,7 @@ brew install caddy **"snapshot not supported"** - Requires macOS 14.0+ on Apple Silicon - Check: `sw_vers` and `uname -m` (should be arm64) +- Ensure the VM has been paused before standby and has a save/restore-compatible configuration **VM fails to start** - Check serial log: `/instances//serial.log` diff --git a/Makefile b/Makefile index f55af3b2..85ca0d43 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ SHELL := /bin/bash # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin +GO_TEST_TIMEOUT ?= 300s $(BIN_DIR): mkdir -p $(BIN_DIR) @@ -13,7 +14,7 @@ OAPI_CODEGEN_VERSION ?= v2.5.1 AIR ?= $(BIN_DIR)/air WIRE ?= $(BIN_DIR)/wire XCADDY ?= $(BIN_DIR)/xcaddy -TEST_TIMEOUT ?= 600s +TEST_TIMEOUT ?= $(GO_TEST_TIMEOUT) # Install oapi-codegen (pinned to match committed generated code) $(OAPI_CODEGEN): | $(BIN_DIR) diff --git a/README.md b/README.md index 833d7cfd..80bbdb98 100644 --- a/README.md +++ b/README.md @@ -123,10 +123,10 @@ hypeman stop my-app # Start a stopped VM hypeman start my-app -# Put the VM to sleep (paused) +# Put the VM in standby (snapshot to disk, stop hypervisor) hypeman standby my-app -# Wake the VM (resumed) +# Restore the VM from standby hypeman restore my-app # Delete all VMs diff --git a/cmd/vz-shim/main.go b/cmd/vz-shim/main.go index 1a1c887f..127a9a81 100644 --- a/cmd/vz-shim/main.go +++ b/cmd/vz-shim/main.go @@ -46,22 +46,36 @@ func main() { slog.Info("vz-shim starting", "control_socket", config.ControlSocket, "vsock_socket", config.VsockSocket) // Create the VM - vm, vmConfig, err := createVM(config) + vm, vmConfig, err := createVM(&config) if err != nil { slog.Error("failed to create VM", "error", err) fmt.Fprintf(os.Stderr, "failed to create VM: %v\n", err) os.Exit(1) } - if err := vm.Start(); err != nil { - slog.Error("failed to start VM", "error", err) - fmt.Fprintf(os.Stderr, "failed to start VM: %v\n", err) - os.Exit(1) + if config.RestoreMachineStatePath != "" { + if err := validateSaveRestoreSupport(vmConfig); err != nil { + slog.Error("save/restore not supported for VM config", "error", err) + fmt.Fprintf(os.Stderr, "save/restore not supported for VM config: %v\n", err) + os.Exit(1) + } + if err := restoreMachineState(vm, config.RestoreMachineStatePath); err != nil { + slog.Error("failed to restore VM machine state", "error", err, "path", config.RestoreMachineStatePath) + fmt.Fprintf(os.Stderr, "failed to restore VM machine state: %v\n", err) + os.Exit(1) + } + slog.Info("VM restored from machine state", "path", config.RestoreMachineStatePath, "state", vm.State()) + } else { + if err := vm.Start(); err != nil { + slog.Error("failed to start VM", "error", err) + fmt.Fprintf(os.Stderr, "failed to start VM: %v\n", err) + os.Exit(1) + } + slog.Info("VM started", "vcpus", config.VCPUs, "memory_mb", config.MemoryBytes/1024/1024) } - slog.Info("VM started", "vcpus", config.VCPUs, "memory_mb", config.MemoryBytes/1024/1024) // Create the shim server - server := NewShimServer(vm, vmConfig) + server := NewShimServer(vm, vmConfig, config) // Start control socket listener (remove stale socket from previous run) os.Remove(config.ControlSocket) diff --git a/cmd/vz-shim/save_restore_arm64.go b/cmd/vz-shim/save_restore_arm64.go new file mode 100644 index 00000000..29147c3e --- /dev/null +++ b/cmd/vz-shim/save_restore_arm64.go @@ -0,0 +1,29 @@ +//go:build darwin && arm64 + +package main + +import ( + "fmt" + + "github.com/Code-Hex/vz/v3" +) + +func validateSaveRestoreSupport(vmConfig *vz.VirtualMachineConfiguration) error { + ok, err := vmConfig.ValidateSaveRestoreSupport() + if err != nil { + return err + } + if !ok { + return fmt.Errorf("virtual machine configuration does not support save/restore") + } + return nil +} + +func saveMachineState(vm *vz.VirtualMachine, snapshotPath string) error { + return vm.SaveMachineStateToPath(snapshotPath) +} + +func restoreMachineState(vm *vz.VirtualMachine, snapshotPath string) error { + // The vz wrapper accepts a filesystem path and constructs a file URL internally. + return vm.RestoreMachineStateFromURL(snapshotPath) +} diff --git a/cmd/vz-shim/save_restore_unsupported.go b/cmd/vz-shim/save_restore_unsupported.go new file mode 100644 index 00000000..3dbcd184 --- /dev/null +++ b/cmd/vz-shim/save_restore_unsupported.go @@ -0,0 +1,22 @@ +//go:build darwin && !arm64 + +package main + +import ( + "fmt" + "runtime" + + "github.com/Code-Hex/vz/v3" +) + +func validateSaveRestoreSupport(vmConfig *vz.VirtualMachineConfiguration) error { + return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH) +} + +func saveMachineState(vm *vz.VirtualMachine, snapshotPath string) error { + return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH) +} + +func restoreMachineState(vm *vz.VirtualMachine, snapshotPath string) error { + return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH) +} diff --git a/cmd/vz-shim/server.go b/cmd/vz-shim/server.go index fe5ab603..43ba9142 100644 --- a/cmd/vz-shim/server.go +++ b/cmd/vz-shim/server.go @@ -10,23 +10,28 @@ import ( "log/slog" "net" "net/http" + "os" + "path/filepath" "sync" "github.com/Code-Hex/vz/v3" + "github.com/kernel/hypeman/lib/hypervisor/vz/shimconfig" ) // ShimServer handles control API and vsock proxy for a vz VM. type ShimServer struct { - vm *vz.VirtualMachine - vmConfig *vz.VirtualMachineConfiguration - mu sync.RWMutex + vm *vz.VirtualMachine + vmConfig *vz.VirtualMachineConfiguration + shimConfig shimconfig.ShimConfig + mu sync.RWMutex } // NewShimServer creates a new shim server. -func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfiguration) *ShimServer { +func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfiguration, config shimconfig.ShimConfig) *ShimServer { return &ShimServer{ - vm: vm, - vmConfig: vmConfig, + vm: vm, + vmConfig: vmConfig, + shimConfig: config, } } @@ -35,6 +40,10 @@ type VMInfoResponse struct { State string `json:"state"` } +type snapshotRequest struct { + DestinationPath string `json:"destination_path"` +} + // Handler returns the HTTP handler for the control API. func (s *ShimServer) Handler() http.Handler { mux := http.NewServeMux() @@ -44,6 +53,7 @@ func (s *ShimServer) Handler() http.Handler { mux.HandleFunc("PUT /api/v1/vm.pause", s.handlePause) mux.HandleFunc("PUT /api/v1/vm.resume", s.handleResume) mux.HandleFunc("PUT /api/v1/vm.shutdown", s.handleShutdown) + mux.HandleFunc("PUT /api/v1/vm.snapshot", s.handleSnapshot) mux.HandleFunc("PUT /api/v1/vm.power-button", s.handlePowerButton) mux.HandleFunc("GET /api/v1/vmm.ping", s.handlePing) mux.HandleFunc("PUT /api/v1/vmm.shutdown", s.handleVMMShutdown) @@ -118,6 +128,77 @@ func (s *ShimServer) handleShutdown(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } +func (s *ShimServer) handleSnapshot(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + + var req snapshotRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, fmt.Sprintf("invalid snapshot request: %v", err), http.StatusBadRequest) + return + } + if req.DestinationPath == "" { + http.Error(w, "destination_path is required", http.StatusBadRequest) + return + } + if s.vm.State() != vz.VirtualMachineStatePaused { + http.Error(w, "vm must be paused before snapshot", http.StatusBadRequest) + return + } + if err := validateSaveRestoreSupport(s.vmConfig); err != nil { + http.Error(w, fmt.Sprintf("save/restore not supported: %v", err), http.StatusBadRequest) + return + } + + if err := os.MkdirAll(req.DestinationPath, 0755); err != nil { + http.Error(w, fmt.Sprintf("create snapshot dir failed: %v", err), http.StatusInternalServerError) + return + } + snapshotComplete := false + defer func() { + if !snapshotComplete { + _ = os.RemoveAll(req.DestinationPath) + } + }() + + machineStatePath := filepath.Join(req.DestinationPath, shimconfig.SnapshotMachineStateFile) + if err := os.RemoveAll(machineStatePath); err != nil { + http.Error(w, fmt.Sprintf("prepare machine state path failed: %v", err), http.StatusInternalServerError) + return + } + if err := saveMachineState(s.vm, machineStatePath); err != nil { + http.Error(w, fmt.Sprintf("save machine state failed: %v", err), http.StatusInternalServerError) + return + } + + manifestPath := filepath.Join(req.DestinationPath, shimconfig.SnapshotManifestFile) + tmpManifestPath := manifestPath + ".tmp" + manifest := shimconfig.SnapshotManifest{ + Hypervisor: "vz", + MachineStateFile: shimconfig.SnapshotMachineStateFile, + ShimConfig: s.shimConfig, + } + // This field is runtime-only; restore path is populated by the caller on restore. + manifest.ShimConfig.RestoreMachineStatePath = "" + manifestBytes, err := json.Marshal(manifest) + if err != nil { + http.Error(w, fmt.Sprintf("marshal manifest failed: %v", err), http.StatusInternalServerError) + return + } + if err := os.WriteFile(tmpManifestPath, manifestBytes, 0644); err != nil { + http.Error(w, fmt.Sprintf("write manifest failed: %v", err), http.StatusInternalServerError) + return + } + if err := os.Rename(tmpManifestPath, manifestPath); err != nil { + http.Error(w, fmt.Sprintf("finalize manifest failed: %v", err), http.StatusInternalServerError) + return + } + + snapshotComplete = true + slog.Info("VM snapshot saved", "destination", req.DestinationPath, "machine_state", machineStatePath) + w.WriteHeader(http.StatusNoContent) +} + func (s *ShimServer) handlePowerButton(w http.ResponseWriter, r *http.Request) { s.mu.Lock() defer s.mu.Unlock() @@ -173,6 +254,10 @@ func vzStateToString(state vz.VirtualMachineState) string { return "Resuming" case vz.VirtualMachineStateStopping: return "Stopping" + case vz.VirtualMachineStateSaving: + return "Saving" + case vz.VirtualMachineStateRestoring: + return "Restoring" default: return "Unknown" } diff --git a/cmd/vz-shim/vm.go b/cmd/vz-shim/vm.go index b995f0c7..9fa34012 100644 --- a/cmd/vz-shim/vm.go +++ b/cmd/vz-shim/vm.go @@ -3,6 +3,7 @@ package main import ( + "encoding/base64" "fmt" "log/slog" "net" @@ -15,7 +16,7 @@ import ( ) // createVM creates and configures a vz.VirtualMachine from ShimConfig. -func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMachineConfiguration, error) { +func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMachineConfiguration, error) { // Prepare kernel command line (vz uses hvc0 for serial console) kernelArgs := config.KernelArgs if kernelArgs == "" { @@ -61,15 +62,19 @@ func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMach return nil, nil, fmt.Errorf("configure storage: %w", err) } + if err := configurePlatform(vmConfig, config); err != nil { + return nil, nil, fmt.Errorf("configure platform: %w", err) + } + vsockConfig, err := vz.NewVirtioSocketDeviceConfiguration() if err != nil { return nil, nil, fmt.Errorf("create vsock device: %w", err) } vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig}) - if balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration(); err == nil { - vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{balloonConfig}) - } + // Do not attach memory balloon for now. + // Save/restore compatibility on VZ can fail with "invalid argument" for some + // Linux guest configurations when a balloon device is present. if validated, err := vmConfig.Validate(); !validated || err != nil { return nil, nil, fmt.Errorf("invalid vm configuration: %w", err) @@ -83,6 +88,37 @@ func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMach return vm, vmConfig, nil } +func configurePlatform(vmConfig *vz.VirtualMachineConfiguration, config *shimconfig.ShimConfig) error { + var machineID *vz.GenericMachineIdentifier + var err error + + if config.MachineIdentifierData != "" { + b, decodeErr := base64.StdEncoding.DecodeString(config.MachineIdentifierData) + if decodeErr != nil { + return fmt.Errorf("decode machine identifier data: %w", decodeErr) + } + machineID, err = vz.NewGenericMachineIdentifierWithData(b) + if err != nil { + return fmt.Errorf("recreate machine identifier: %w", err) + } + } else { + machineID, err = vz.NewGenericMachineIdentifier() + if err != nil { + return fmt.Errorf("create machine identifier: %w", err) + } + config.MachineIdentifierData = base64.StdEncoding.EncodeToString(machineID.DataRepresentation()) + } + + platformConfig, err := vz.NewGenericPlatformConfiguration( + vz.WithGenericMachineIdentifier(machineID), + ) + if err != nil { + return fmt.Errorf("create generic platform config: %w", err) + } + vmConfig.SetPlatformVirtualMachineConfiguration(platformConfig) + return nil +} + func configureSerialConsole(vmConfig *vz.VirtualMachineConfiguration, logPath string) error { var serialAttachment *vz.FileHandleSerialPortAttachment diff --git a/lib/forkvm/README.md b/lib/forkvm/README.md index 24b5cefa..1bff4318 100644 --- a/lib/forkvm/README.md +++ b/lib/forkvm/README.md @@ -52,9 +52,22 @@ instead of reusing the source identity. ## VZ (Virtualization.framework) -- Fork is not supported. -- Snapshot restore for Linux guests is not available in this mode, so standby - snapshot-based fork mechanics cannot be implemented. +- Stopped-source fork is supported (directory clone, no snapshot rewrite). +- Standby-source fork is supported (snapshot copy + VZ manifest rewrite + + restore). +- Running-source fork is supported (standby source -> fork from standby -> + restore source). +- VZ fork preparation rewrites instance-local paths in serialized shim config: + disks, kernel/initrd, serial log, control socket, vsock socket, shim log. +- VZ keeps snapshotted NIC identity unchanged during fork prep because + save/restore validation can reject machine-state restore when NIC identity + fields are mutated. +- For forked standby restores with networking, a fresh network allocation is + applied post-restore via the generic restore networking flow. +- Vsock socket naming is resolved generically through hypervisor registration + (`vz.vsock` for VZ), so no instance-layer VZ-specific branching is required. +- Vsock CID rewrites are not required for VZ fork flows because VZ routing is + socket-path based. ## Operational constraints diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index 0ea2fb8e..4026a425 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -41,6 +41,10 @@ const ( // Registered by each hypervisor package's init() function. var socketNames = make(map[Type]string) +// vsockSocketNames maps hypervisor types to their vsock socket filenames. +// Registered by hypervisor packages when they use socket-based vsock routing. +var vsockSocketNames = make(map[Type]string) + // RegisterSocketName registers the socket filename for a hypervisor type. // Called by each hypervisor implementation's init() function. func RegisterSocketName(t Type, name string) { @@ -56,6 +60,20 @@ func SocketNameForType(t Type) string { return string(t) + ".sock" } +// RegisterVsockSocketName registers the vsock socket filename for a hypervisor type. +func RegisterVsockSocketName(t Type, name string) { + vsockSocketNames[t] = name +} + +// VsockSocketNameForType returns the vsock socket filename for a hypervisor type. +// Falls back to "vsock.sock" when a hypervisor doesn't require a custom name. +func VsockSocketNameForType(t Type) string { + if name, ok := vsockSocketNames[t]; ok { + return name + } + return "vsock.sock" +} + // VMStarter handles the full VM startup sequence. // Each hypervisor implements its own startup flow: // - Cloud Hypervisor: starts process, configures via HTTP API, boots via HTTP API diff --git a/lib/hypervisor/vz/client.go b/lib/hypervisor/vz/client.go index 5f9ec0d1..55447936 100644 --- a/lib/hypervisor/vz/client.go +++ b/lib/hypervisor/vz/client.go @@ -3,12 +3,14 @@ package vz import ( + "bytes" "context" "encoding/json" "fmt" "io" "net" "net/http" + "runtime" "time" "github.com/kernel/hypeman/lib/hypervisor" @@ -16,8 +18,9 @@ import ( // Client implements hypervisor.Hypervisor via HTTP to the vz-shim process. type Client struct { - socketPath string - httpClient *http.Client + socketPath string + httpClient *http.Client + longRunningHTTPClient *http.Client } // NewClient creates a new vz shim client. @@ -31,6 +34,9 @@ func NewClient(socketPath string) (*Client, error) { Transport: transport, Timeout: 10 * time.Second, } + longRunningHTTPClient := &http.Client{ + Transport: transport, + } // Verify connectivity with a short timeout ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) @@ -47,8 +53,9 @@ func NewClient(socketPath string) (*Client, error) { resp.Body.Close() return &Client{ - socketPath: socketPath, - httpClient: httpClient, + socketPath: socketPath, + httpClient: httpClient, + longRunningHTTPClient: longRunningHTTPClient, }, nil } @@ -59,9 +66,13 @@ type vmInfoResponse struct { State string `json:"state"` } +type snapshotRequest struct { + DestinationPath string `json:"destination_path"` +} + func (c *Client) Capabilities() hypervisor.Capabilities { return hypervisor.Capabilities{ - SupportsSnapshot: false, + SupportsSnapshot: runtime.GOARCH == "arm64", SupportsHotplugMemory: false, SupportsPause: true, SupportsVsock: true, @@ -72,6 +83,10 @@ func (c *Client) Capabilities() hypervisor.Capabilities { // doPut sends a PUT request to the shim and checks for success. func (c *Client) doPut(ctx context.Context, path string, body io.Reader) error { + return c.doPutWithClient(ctx, c.httpClient, path, body) +} + +func (c *Client) doPutWithClient(ctx context.Context, client *http.Client, path string, body io.Reader) error { req, err := http.NewRequestWithContext(ctx, http.MethodPut, "http://vz-shim"+path, body) if err != nil { return err @@ -79,7 +94,7 @@ func (c *Client) doPut(ctx context.Context, path string, body io.Reader) error { if body != nil { req.Header.Set("Content-Type", "application/json") } - resp, err := c.httpClient.Do(req) + resp, err := client.Do(req) if err != nil { return err } @@ -136,13 +151,13 @@ func (c *Client) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { var state hypervisor.VMState switch info.State { - case "Running": + case "Running", "Resuming": state = hypervisor.StateRunning - case "Paused": + case "Paused", "Pausing", "Saving": state = hypervisor.StatePaused - case "Starting": + case "Starting", "Restoring": state = hypervisor.StateCreated - case "Shutdown", "Stopped", "Error": + case "Shutdown", "Stopped", "Stopping", "Error": state = hypervisor.StateShutdown default: state = hypervisor.StateShutdown @@ -160,7 +175,14 @@ func (c *Client) Resume(ctx context.Context) error { } func (c *Client) Snapshot(ctx context.Context, destPath string) error { - return hypervisor.ErrNotSupported + req := snapshotRequest{DestinationPath: destPath} + body, err := json.Marshal(req) + if err != nil { + return fmt.Errorf("marshal snapshot request: %w", err) + } + // Snapshot duration scales with guest RAM size, so rely on caller context + // rather than the default short client timeout. + return c.doPutWithClient(ctx, c.longRunningHTTPClient, "/api/v1/vm.snapshot", bytes.NewReader(body)) } func (c *Client) ResizeMemory(ctx context.Context, bytes int64) error { diff --git a/lib/hypervisor/vz/fork.go b/lib/hypervisor/vz/fork.go new file mode 100644 index 00000000..87c4aced --- /dev/null +++ b/lib/hypervisor/vz/fork.go @@ -0,0 +1,103 @@ +//go:build darwin + +package vz + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strings" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/hypervisor/vz/shimconfig" +) + +// PrepareFork prepares VZ snapshot state for forked instances. +// For stopped forks (no snapshot), this is a no-op. +func (s *Starter) PrepareFork(ctx context.Context, req hypervisor.ForkPrepareRequest) (hypervisor.ForkPrepareResult, error) { + _ = ctx + if req.SnapshotConfigPath == "" { + return hypervisor.ForkPrepareResult{}, nil + } + + if err := rewriteSnapshotManifestForFork(req.SnapshotConfigPath, req); err != nil { + return hypervisor.ForkPrepareResult{}, err + } + return hypervisor.ForkPrepareResult{ + // VZ vsock dialing is socket-path based; CID rewrites are not required. + VsockCIDUpdated: false, + }, nil +} + +func rewriteSnapshotManifestForFork(manifestPath string, req hypervisor.ForkPrepareRequest) error { + data, err := os.ReadFile(manifestPath) + if err != nil { + return fmt.Errorf("read snapshot manifest: %w", err) + } + + var manifest shimconfig.SnapshotManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return fmt.Errorf("unmarshal snapshot manifest: %w", err) + } + + if manifest.Hypervisor != "" && manifest.Hypervisor != string(hypervisor.TypeVZ) { + return fmt.Errorf("snapshot hypervisor mismatch: expected vz, got %s", manifest.Hypervisor) + } + if manifest.Hypervisor == "" { + manifest.Hypervisor = string(hypervisor.TypeVZ) + } + if manifest.MachineStateFile == "" { + manifest.MachineStateFile = shimconfig.SnapshotMachineStateFile + } + + if req.SourceDataDir != "" && req.TargetDataDir != "" && req.SourceDataDir != req.TargetDataDir { + manifest.ShimConfig = rewriteShimConfigPaths(manifest.ShimConfig, req.SourceDataDir, req.TargetDataDir) + } + + if req.VsockSocket != "" { + manifest.ShimConfig.VsockSocket = req.VsockSocket + } + if req.SerialLogPath != "" { + manifest.ShimConfig.SerialLogPath = req.SerialLogPath + } + + // VZ machine-state restore requires device configuration compatibility. + // Rewriting network identity fields in the serialized config can cause + // restore to fail with "invalid argument", so keep NIC identity unchanged. + + // Runtime-only field; restore path is provided by the caller. + manifest.ShimConfig.RestoreMachineStatePath = "" + + updated, err := json.Marshal(manifest) + if err != nil { + return fmt.Errorf("marshal snapshot manifest: %w", err) + } + if err := os.WriteFile(manifestPath, updated, 0644); err != nil { + return fmt.Errorf("write snapshot manifest: %w", err) + } + return nil +} + +func rewriteShimConfigPaths(cfg shimconfig.ShimConfig, sourceDir, targetDir string) shimconfig.ShimConfig { + replace := func(value string) string { + if value == sourceDir || strings.HasPrefix(value, sourceDir+"/") { + return targetDir + strings.TrimPrefix(value, sourceDir) + } + return value + } + + cfg.SerialLogPath = replace(cfg.SerialLogPath) + cfg.KernelPath = replace(cfg.KernelPath) + cfg.InitrdPath = replace(cfg.InitrdPath) + cfg.ControlSocket = replace(cfg.ControlSocket) + cfg.VsockSocket = replace(cfg.VsockSocket) + cfg.LogPath = replace(cfg.LogPath) + cfg.RestoreMachineStatePath = replace(cfg.RestoreMachineStatePath) + + for i := range cfg.Disks { + cfg.Disks[i].Path = replace(cfg.Disks[i].Path) + } + + return cfg +} diff --git a/lib/hypervisor/vz/fork_test.go b/lib/hypervisor/vz/fork_test.go new file mode 100644 index 00000000..240ddb1f --- /dev/null +++ b/lib/hypervisor/vz/fork_test.go @@ -0,0 +1,85 @@ +//go:build darwin + +package vz + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/hypervisor/vz/shimconfig" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPrepareFork_NoSnapshotPathIsNoOp(t *testing.T) { + starter := NewStarter() + result, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{}) + require.NoError(t, err) + assert.False(t, result.VsockCIDUpdated) +} + +func TestPrepareFork_RewritesSnapshotManifest(t *testing.T) { + starter := NewStarter() + tmp := t.TempDir() + manifestPath := filepath.Join(tmp, shimconfig.SnapshotManifestFile) + + sourceDir := "/src/guests/a" + targetDir := "/dst/guests/b" + orig := shimconfig.SnapshotManifest{ + Hypervisor: string(hypervisor.TypeVZ), + MachineStateFile: shimconfig.SnapshotMachineStateFile, + ShimConfig: shimconfig.ShimConfig{ + SerialLogPath: sourceDir + "/logs/serial.log", + KernelPath: sourceDir + "/kernel/vmlinuz", + InitrdPath: sourceDir + "/kernel/initrd", + ControlSocket: sourceDir + "/vz.sock", + VsockSocket: sourceDir + "/vz.vsock", + LogPath: sourceDir + "/logs/vz-shim.log", + Disks: []shimconfig.DiskConfig{ + {Path: sourceDir + "/overlay.raw"}, + {Path: "/volumes/shared.raw"}, + }, + Networks: []shimconfig.NetworkConfig{ + {MAC: "02:00:00:00:00:01"}, + }, + }, + } + data, err := json.Marshal(orig) + require.NoError(t, err) + require.NoError(t, os.WriteFile(manifestPath, data, 0644)) + + result, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{ + SnapshotConfigPath: manifestPath, + SourceDataDir: sourceDir, + TargetDataDir: targetDir, + VsockCID: 22222, + VsockSocket: targetDir + "/fork.vsock", + SerialLogPath: targetDir + "/logs/fork-serial.log", + }) + require.NoError(t, err) + assert.False(t, result.VsockCIDUpdated) + + updatedData, err := os.ReadFile(manifestPath) + require.NoError(t, err) + + var updated shimconfig.SnapshotManifest + require.NoError(t, json.Unmarshal(updatedData, &updated)) + + assert.Equal(t, string(hypervisor.TypeVZ), updated.Hypervisor) + assert.Equal(t, shimconfig.SnapshotMachineStateFile, updated.MachineStateFile) + assert.Equal(t, targetDir+"/logs/fork-serial.log", updated.ShimConfig.SerialLogPath) + assert.Equal(t, targetDir+"/fork.vsock", updated.ShimConfig.VsockSocket) + assert.Equal(t, targetDir+"/kernel/vmlinuz", updated.ShimConfig.KernelPath) + assert.Equal(t, targetDir+"/kernel/initrd", updated.ShimConfig.InitrdPath) + assert.Equal(t, targetDir+"/vz.sock", updated.ShimConfig.ControlSocket) + assert.Equal(t, targetDir+"/logs/vz-shim.log", updated.ShimConfig.LogPath) + require.Len(t, updated.ShimConfig.Disks, 2) + assert.Equal(t, targetDir+"/overlay.raw", updated.ShimConfig.Disks[0].Path) + assert.Equal(t, "/volumes/shared.raw", updated.ShimConfig.Disks[1].Path) + require.Len(t, updated.ShimConfig.Networks, 1) + assert.Equal(t, "02:00:00:00:00:01", updated.ShimConfig.Networks[0].MAC) +} diff --git a/lib/hypervisor/vz/shimconfig/config.go b/lib/hypervisor/vz/shimconfig/config.go index 23056267..630e841b 100644 --- a/lib/hypervisor/vz/shimconfig/config.go +++ b/lib/hypervisor/vz/shimconfig/config.go @@ -4,6 +4,14 @@ // the hypeman API server and the vz-shim subprocess. package shimconfig +const ( + // SnapshotManifestFile is the metadata file stored in snapshot directories. + // Kept as config.json to match existing snapshot path conventions. + SnapshotManifestFile = "config.json" + // SnapshotMachineStateFile is the serialized VM machine state filename. + SnapshotMachineStateFile = "machine-state.vzm" +) + // ShimConfig is the configuration passed from hypeman to the shim. type ShimConfig struct { // Compute resources @@ -30,6 +38,14 @@ type ShimConfig struct { // Logging LogPath string `json:"log_path"` + + // Generic machine identifier data representation (base64), used to keep + // platform identity stable across save/restore. + MachineIdentifierData string `json:"machine_identifier_data,omitempty"` + + // Optional restore source (snapshot machine state file path). + // When set, the shim restores instead of starting from cold boot. + RestoreMachineStatePath string `json:"restore_machine_state_path,omitempty"` } // DiskConfig represents a disk attached to the VM. @@ -42,3 +58,10 @@ type DiskConfig struct { type NetworkConfig struct { MAC string `json:"mac"` } + +// SnapshotManifest is persisted in snapshot directories to allow restore. +type SnapshotManifest struct { + Hypervisor string `json:"hypervisor"` + MachineStateFile string `json:"machine_state_file"` + ShimConfig ShimConfig `json:"shim_config"` +} diff --git a/lib/hypervisor/vz/starter.go b/lib/hypervisor/vz/starter.go index cc8daf1b..fc7a041d 100644 --- a/lib/hypervisor/vz/starter.go +++ b/lib/hypervisor/vz/starter.go @@ -24,6 +24,7 @@ import ( func init() { hypervisor.RegisterSocketName(hypervisor.TypeVZ, "vz.sock") + hypervisor.RegisterVsockSocketName(hypervisor.TypeVZ, "vz.vsock") hypervisor.RegisterVsockDialerFactory(hypervisor.TypeVZ, NewVsockDialer) hypervisor.RegisterClientFactory(hypervisor.TypeVZ, func(socketPath string) (hypervisor.Hypervisor, error) { return NewClient(socketPath) @@ -114,37 +115,77 @@ func (s *Starter) GetVersion(p *paths.Paths) (string, error) { // StartVM spawns a vz-shim subprocess to host the VM. func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, socketPath string, config hypervisor.VMConfig) (int, hypervisor.Hypervisor, error) { - log := logger.FromContext(ctx) + shimConfig := buildShimConfigFromVMConfig(config, socketPath) + return s.startShim(ctx, p, version, shimConfig, 30*time.Second) +} +// RestoreVM starts a vz-shim process and restores VM state from a snapshot. +// The VM is in paused state after restore; caller should call Resume(). +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { + manifestPath := filepath.Join(snapshotPath, shimconfig.SnapshotManifestFile) + manifestData, err := os.ReadFile(manifestPath) + if err != nil { + return 0, nil, fmt.Errorf("read snapshot manifest: %w", err) + } + + var manifest shimconfig.SnapshotManifest + if err := json.Unmarshal(manifestData, &manifest); err != nil { + return 0, nil, fmt.Errorf("decode snapshot manifest: %w", err) + } + if manifest.Hypervisor != "" && manifest.Hypervisor != string(hypervisor.TypeVZ) { + return 0, nil, fmt.Errorf("snapshot hypervisor mismatch: expected vz, got %s", manifest.Hypervisor) + } + if manifest.MachineStateFile == "" { + manifest.MachineStateFile = shimconfig.SnapshotMachineStateFile + } + + restorePath := filepath.Join(snapshotPath, manifest.MachineStateFile) + if _, err := os.Stat(restorePath); err != nil { + return 0, nil, fmt.Errorf("snapshot machine state not found: %w", err) + } + + shimConfig := manifest.ShimConfig + if shimConfig.KernelPath == "" || shimConfig.InitrdPath == "" { + return 0, nil, fmt.Errorf("invalid snapshot manifest: missing kernel/initrd in shim config") + } instanceDir := filepath.Dir(socketPath) - controlSocket := socketPath - vsockSocket := filepath.Join(instanceDir, "vz.vsock") - logPath := filepath.Join(instanceDir, "logs", "vz-shim.log") + shimConfig.ControlSocket = socketPath + shimConfig.VsockSocket = filepath.Join(instanceDir, "vz.vsock") + shimConfig.LogPath = filepath.Join(instanceDir, "logs", "vz-shim.log") + shimConfig.RestoreMachineStatePath = restorePath - shimConfig := shimconfig.ShimConfig{ + return s.startShim(ctx, p, version, shimConfig, 90*time.Second) +} + +func buildShimConfigFromVMConfig(config hypervisor.VMConfig, socketPath string) shimconfig.ShimConfig { + instanceDir := filepath.Dir(socketPath) + cfg := shimconfig.ShimConfig{ VCPUs: config.VCPUs, MemoryBytes: config.MemoryBytes, SerialLogPath: config.SerialLogPath, KernelPath: config.KernelPath, InitrdPath: config.InitrdPath, KernelArgs: config.KernelArgs, - ControlSocket: controlSocket, - VsockSocket: vsockSocket, - LogPath: logPath, + ControlSocket: socketPath, + VsockSocket: filepath.Join(instanceDir, "vz.vsock"), + LogPath: filepath.Join(instanceDir, "logs", "vz-shim.log"), } - for _, disk := range config.Disks { - shimConfig.Disks = append(shimConfig.Disks, shimconfig.DiskConfig{ + cfg.Disks = append(cfg.Disks, shimconfig.DiskConfig{ Path: disk.Path, Readonly: disk.Readonly, }) } - for _, net := range config.Networks { - shimConfig.Networks = append(shimConfig.Networks, shimconfig.NetworkConfig{ + cfg.Networks = append(cfg.Networks, shimconfig.NetworkConfig{ MAC: net.MAC, }) } + return cfg +} + +func (s *Starter) startShim(ctx context.Context, p *paths.Paths, version string, shimConfig shimconfig.ShimConfig, timeout time.Duration) (int, hypervisor.Hypervisor, error) { + log := logger.FromContext(ctx) configJSON, err := json.Marshal(shimConfig) if err != nil { @@ -172,17 +213,21 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s } pid := cmd.Process.Pid - log.InfoContext(ctx, "vz-shim started", "pid", pid, "control_socket", controlSocket) + log.InfoContext(ctx, "vz-shim started", + "pid", pid, + "control_socket", shimConfig.ControlSocket, + "restore_machine_state_path", shimConfig.RestoreMachineStatePath, + ) // Wait for shim in a goroutine so we can detect early exit waitDone := make(chan error, 1) go func() { waitDone <- cmd.Wait() }() - client, err := s.waitForShim(ctx, controlSocket, 30*time.Second) + client, err := s.waitForShim(ctx, shimConfig.ControlSocket, timeout) if err != nil { // Read shim log file for diagnostics (before instance dir cleanup deletes it) shimLog := "" - if logData, readErr := os.ReadFile(logPath); readErr == nil && len(logData) > 0 { + if logData, readErr := os.ReadFile(shimConfig.LogPath); readErr == nil && len(logData) > 0 { shimLog = string(logData) } @@ -212,18 +257,6 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return pid, client, nil } -// RestoreVM is not supported by vz (Virtualization.framework cannot restore Linux guests). -func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { - return 0, nil, hypervisor.ErrNotSupported -} - -// PrepareFork is not supported for vz. -func (s *Starter) PrepareFork(ctx context.Context, req hypervisor.ForkPrepareRequest) (hypervisor.ForkPrepareResult, error) { - _ = ctx - _ = req - return hypervisor.ForkPrepareResult{}, hypervisor.ErrNotSupported -} - func (s *Starter) waitForShim(ctx context.Context, socketPath string, timeout time.Duration) (*Client, error) { deadline := time.Now().Add(timeout) diff --git a/lib/instances/create.go b/lib/instances/create.go index 31c78574..4566544c 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -109,13 +109,12 @@ func (m *manager) createInstance( // 4. Generate vsock configuration vsockCID := generateVsockCID(id) - vsockSocket := m.paths.InstanceVsockSocket(id) - log.DebugContext(ctx, "generated vsock config", "instance_id", id, "cid", vsockCID) - - // Override vsock socket path for vz (uses Virtio socket, not vhost-user) - if req.Hypervisor == hypervisor.TypeVZ || (req.Hypervisor == "" && m.defaultHypervisor == hypervisor.TypeVZ) { - vsockSocket = filepath.Join(m.paths.InstanceDir(id), "vz.vsock") + hvTypeForVsock := req.Hypervisor + if hvTypeForVsock == "" { + hvTypeForVsock = m.defaultHypervisor } + vsockSocket := m.paths.InstanceSocket(id, hypervisor.VsockSocketNameForType(hvTypeForVsock)) + log.DebugContext(ctx, "generated vsock config", "instance_id", id, "cid", vsockCID) // 5. Check instance doesn't already exist if _, err := m.loadMetadata(id); err == nil { diff --git a/lib/instances/fork.go b/lib/instances/fork.go index ab619af7..526b7100 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -259,7 +259,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin forkMeta.HypervisorPID = nil forkMeta.SocketPath = m.paths.InstanceSocket(forkID, starter.SocketName()) forkMeta.DataDir = dstDir - forkMeta.VsockSocket = m.paths.InstanceVsockSocket(forkID) + forkMeta.VsockSocket = m.paths.InstanceSocket(forkID, hypervisor.VsockSocketNameForType(forkMeta.HypervisorType)) forkMeta.ExitCode = nil forkMeta.ExitMessage = "" diff --git a/lib/instances/fork_test.go b/lib/instances/fork_test.go index 3e036cbf..047eac8a 100644 --- a/lib/instances/fork_test.go +++ b/lib/instances/fork_test.go @@ -21,7 +21,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestForkInstanceNotSupportedHypervisor(t *testing.T) { +func TestForkInstance_VZStoppedSourceSupported(t *testing.T) { manager, _ := setupTestManager(t) ctx := context.Background() if _, err := manager.getVMStarter(hypervisor.TypeVZ); err != nil { @@ -45,9 +45,12 @@ func TestForkInstanceNotSupportedHypervisor(t *testing.T) { }} require.NoError(t, manager.saveMetadata(meta)) - _, err := manager.ForkInstance(ctx, sourceID, ForkInstanceRequest{Name: "fork-vz-copy"}) - require.Error(t, err) - assert.ErrorIs(t, err, ErrNotSupported) + forked, err := manager.ForkInstance(ctx, sourceID, ForkInstanceRequest{Name: "fork-vz-copy"}) + require.NoError(t, err) + require.NotNil(t, forked) + assert.Equal(t, StateStopped, forked.State) + assert.Equal(t, hypervisor.TypeVZ, forked.HypervisorType) + assert.NotEqual(t, sourceID, forked.Id) } func TestResolveForkTargetState_DefaultsToSourceState(t *testing.T) { diff --git a/lib/instances/manager_darwin_test.go b/lib/instances/manager_darwin_test.go index 95309751..719f89c4 100644 --- a/lib/instances/manager_darwin_test.go +++ b/lib/instances/manager_darwin_test.go @@ -7,8 +7,10 @@ import ( "context" "fmt" "os" + "os/exec" "path/filepath" "runtime" + "strconv" "strings" "syscall" "testing" @@ -371,6 +373,297 @@ func TestVZExecAndShutdown(t *testing.T) { t.Log("Instance deleted") } +// TestVZStandbyAndRestore tests the full standby/restore cycle for the vz hypervisor. +func TestVZStandbyAndRestore(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("vz tests require macOS") + } + if runtime.GOARCH != "arm64" { + t.Skip("vz standby/restore requires Apple Silicon (arm64)") + } + if !isMacOS14OrLater(t) { + t.Skip("vz standby/restore requires macOS 14+") + } + ensureMkfsExt4Available(t) + + mgr, tmpDir := setupVZTestManager(t) + ctx := context.Background() + p := paths.New(tmpDir) + + // Prepare image + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + t.Log("Pulling alpine:latest image...") + alpineImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: "docker.io/library/alpine:latest", + }) + require.NoError(t, err) + + alpineRef, err := images.ParseNormalizedRef(alpineImage.Name) + require.NoError(t, err) + waitName := alpineImage.Name + if alpineImage.Digest != "" { + waitName = alpineRef.Repository() + "@" + alpineImage.Digest + } + + waitCtx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + err = imageManager.WaitForReady(waitCtx, waitName) + require.NoError(t, err, "Image should become ready") + + alpineImage, err = imageManager.GetImage(ctx, waitName) + require.NoError(t, err) + require.Equal(t, images.StatusReady, alpineImage.Status, "Image should be ready") + t.Log("Alpine image ready") + + // Ensure system files (kernel + initrd) + systemManager := system.NewManager(p) + err = systemManager.EnsureSystemFiles(ctx) + require.NoError(t, err) + t.Log("System files ready") + + // Create instance using vz hypervisor + inst, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "test-vz-standby", + Image: "docker.io/library/alpine:latest", + Size: 2 * 1024 * 1024 * 1024, + OverlaySize: 10 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeVZ, + Cmd: []string{"sleep", "infinity"}, + }) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + require.NotNil(t, inst) + assert.Equal(t, StateRunning, inst.State) + assert.Equal(t, hypervisor.TypeVZ, inst.HypervisorType) + t.Logf("Instance created: %s (hypervisor: %s)", inst.Id, inst.HypervisorType) + + instanceID := inst.Id + deleted := false + t.Cleanup(func() { + if !deleted { + _ = mgr.DeleteInstance(ctx, instanceID) + } + }) + + // Wait for guest agent to be ready + err = waitForExecAgent(ctx, mgr, inst.Id, 30*time.Second) + require.NoError(t, err, "guest agent should be ready") + t.Log("Guest agent ready") + + // Exec before standby + output, exitCode, err := vzExecCommand(ctx, inst, "echo", "before-standby") + require.NoError(t, err, "exec should succeed before standby") + require.Equal(t, 0, exitCode) + assert.Equal(t, "before-standby", strings.TrimSpace(output)) + + // Capture current shim PID so we can ensure it is gone after standby. + var runningPID int + if inst.HypervisorPID != nil { + runningPID = *inst.HypervisorPID + } + + // Standby instance + t.Log("Putting instance in standby...") + inst, err = mgr.StandbyInstance(ctx, inst.Id) + require.NoError(t, err) + assert.Equal(t, StateStandby, inst.State) + assert.True(t, inst.HasSnapshot) + t.Log("Instance in standby") + + // Verify snapshot files + snapshotDir := p.InstanceSnapshotLatest(inst.Id) + assert.DirExists(t, snapshotDir) + assert.FileExists(t, filepath.Join(snapshotDir, "config.json"), "vz snapshot config should exist") + assert.FileExists(t, filepath.Join(snapshotDir, "machine-state.vzm"), "vz machine state file should exist") + + // Verify old shim process is gone + if runningPID > 0 { + time.Sleep(500 * time.Millisecond) + assert.NoError(t, checkProcessGone(runningPID), "vz-shim process should be gone after standby") + } + + // Restore from standby + t.Log("Restoring instance...") + inst, err = mgr.RestoreInstance(ctx, inst.Id) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + assert.Equal(t, StateRunning, inst.State) + assert.False(t, inst.HasSnapshot) + t.Log("Instance restored and running") + + // Re-read instance and verify exec works after restore. + inst, err = mgr.GetInstance(ctx, instanceID) + require.NoError(t, err) + + t.Log("Waiting for exec to work after restore...") + var execErr error + for i := 0; i < 30; i++ { + time.Sleep(1 * time.Second) + inst, err = mgr.GetInstance(ctx, instanceID) + if err != nil { + continue + } + output, exitCode, execErr = vzExecCommand(ctx, inst, "echo", "after-restore") + if execErr == nil && exitCode == 0 { + break + } + t.Logf("Exec attempt %d after restore: err=%v", i+1, execErr) + } + if execErr != nil { + dumpVZShimLogs(t, tmpDir) + } + require.NoError(t, execErr, "exec should succeed after restore") + require.Equal(t, 0, exitCode) + assert.Equal(t, "after-restore", strings.TrimSpace(output)) + t.Log("Exec after restore passed") + + // Cleanup + t.Log("Deleting instance...") + err = mgr.DeleteInstance(ctx, instanceID) + require.NoError(t, err) + deleted = true + assert.NoDirExists(t, p.InstanceDir(instanceID)) +} + +// TestVZForkFromRunningNetwork mirrors the running-source fork flow validated for +// cloud-hypervisor, but on macOS VZ. +func TestVZForkFromRunningNetwork(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("vz tests require macOS") + } + if runtime.GOARCH != "arm64" { + t.Skip("vz running fork requires Apple Silicon (arm64)") + } + if !isMacOS14OrLater(t) { + t.Skip("vz running fork requires macOS 14+") + } + ensureMkfsExt4Available(t) + + mgr, tmpDir := setupVZTestManager(t) + ctx := context.Background() + p := paths.New(tmpDir) + + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + t.Log("Pulling alpine:latest image...") + alpineImage, err := imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: "docker.io/library/alpine:latest", + }) + require.NoError(t, err) + + alpineRef, err := images.ParseNormalizedRef(alpineImage.Name) + require.NoError(t, err) + waitName := alpineImage.Name + if alpineImage.Digest != "" { + waitName = alpineRef.Repository() + "@" + alpineImage.Digest + } + + waitCtx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + err = imageManager.WaitForReady(waitCtx, waitName) + require.NoError(t, err, "Image should become ready") + + alpineImage, err = imageManager.GetImage(ctx, waitName) + require.NoError(t, err) + require.Equal(t, images.StatusReady, alpineImage.Status, "Image should be ready") + t.Log("Alpine image ready") + + systemManager := system.NewManager(p) + err = systemManager.EnsureSystemFiles(ctx) + require.NoError(t, err) + t.Log("System files ready") + + source, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "test-vz-fork-src", + Image: "docker.io/library/alpine:latest", + Size: 2 * 1024 * 1024 * 1024, + OverlaySize: 10 * 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: true, + Hypervisor: hypervisor.TypeVZ, + Cmd: []string{"sleep", "infinity"}, + }) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + require.Equal(t, StateRunning, source.State) + require.NotEmpty(t, source.IP) + require.NotEmpty(t, source.MAC) + + sourceID := source.Id + t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), sourceID) }) + + err = waitForExecAgent(ctx, mgr, sourceID, 30*time.Second) + require.NoError(t, err, "source guest agent should be ready") + + output, exitCode, err := vzExecCommand(ctx, source, "echo", "source-before-fork") + require.NoError(t, err) + require.Equal(t, 0, exitCode) + assert.Equal(t, "source-before-fork", strings.TrimSpace(output)) + + // Running fork requires explicit opt-in. + _, err = mgr.ForkInstance(ctx, sourceID, ForkInstanceRequest{Name: "test-vz-fork-no-flag"}) + require.Error(t, err) + assert.ErrorIs(t, err, ErrInvalidState) + + forked, err := mgr.ForkInstance(ctx, sourceID, ForkInstanceRequest{ + Name: "test-vz-fork-copy", + FromRunning: true, + TargetState: StateRunning, + }) + if err != nil { + dumpVZShimLogs(t, tmpDir) + require.NoError(t, err) + } + require.Equal(t, StateRunning, forked.State) + require.NotEqual(t, sourceID, forked.Id) + forkID := forked.Id + t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), forkID) }) + + sourceAfterFork, err := mgr.GetInstance(ctx, sourceID) + require.NoError(t, err) + require.Equal(t, StateRunning, sourceAfterFork.State) + require.NotEmpty(t, sourceAfterFork.IP) + require.NotEmpty(t, sourceAfterFork.MAC) + require.False(t, sourceAfterFork.HasSnapshot) + + forked, err = mgr.GetInstance(ctx, forkID) + require.NoError(t, err) + require.Equal(t, StateRunning, forked.State) + require.NotEmpty(t, forked.IP) + require.NotEmpty(t, forked.MAC) + require.False(t, forked.HasSnapshot) + + // Fork gets a fresh network identity. + assert.NotEqual(t, sourceAfterFork.IP, forked.IP) + assert.NotEqual(t, sourceAfterFork.MAC, forked.MAC) + + err = waitForExecAgent(ctx, mgr, sourceID, 30*time.Second) + require.NoError(t, err, "source guest agent should recover after restore") + err = waitForExecAgent(ctx, mgr, forkID, 30*time.Second) + require.NoError(t, err, "fork guest agent should be ready") + + output, exitCode, err = vzExecCommand(ctx, sourceAfterFork, "echo", "source-after-fork") + require.NoError(t, err) + require.Equal(t, 0, exitCode) + assert.Equal(t, "source-after-fork", strings.TrimSpace(output)) + + output, exitCode, err = vzExecCommand(ctx, forked, "echo", "fork-after-restore") + require.NoError(t, err) + require.Equal(t, 0, exitCode) + assert.Equal(t, "fork-after-restore", strings.TrimSpace(output)) +} + // dumpVZShimLogs logs any vz-shim log files found under tmpDir for debugging CI failures. func dumpVZShimLogs(t *testing.T, tmpDir string) { t.Helper() @@ -395,3 +688,47 @@ func checkProcessGone(pid int) error { } return fmt.Errorf("process %d still running", pid) } + +func isMacOS14OrLater(t *testing.T) bool { + t.Helper() + out, err := exec.Command("sw_vers", "-productVersion").Output() + if err != nil { + t.Logf("failed to check macOS version: %v", err) + return false + } + + version := strings.TrimSpace(string(out)) + parts := strings.Split(version, ".") + if len(parts) == 0 { + return false + } + + major, err := strconv.Atoi(parts[0]) + if err != nil { + return false + } + return major >= 14 +} + +func ensureMkfsExt4Available(t *testing.T) { + t.Helper() + if _, err := exec.LookPath("mkfs.ext4"); err == nil { + return + } + + candidates := []string{ + "/opt/homebrew/opt/e2fsprogs/sbin", + "/usr/local/opt/e2fsprogs/sbin", + } + for _, dir := range candidates { + if _, err := os.Stat(filepath.Join(dir, "mkfs.ext4")); err == nil { + pathWithTool := dir + ":" + os.Getenv("PATH") + require.NoError(t, os.Setenv("PATH", pathWithTool)) + if _, err := exec.LookPath("mkfs.ext4"); err == nil { + return + } + } + } + + t.Fatalf("mkfs.ext4 not found; install e2fsprogs and ensure it is on PATH") +} diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 64644bf0..25cc3536 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -289,6 +289,7 @@ func ProvideBuildManager(p *paths.Paths, cfg *config.Config, instanceManager ins RegistryCACert: registryCACert, DefaultTimeout: cfg.Build.Timeout, RegistrySecret: cfg.JwtSecret, // Use same secret for registry tokens + DockerSocket: cfg.Build.DockerSocket, } // Configure secret provider (use NoOpSecretProvider as fallback to avoid nil panics)