From 8d81fc590c7e845204cb191dfeedb60e2a77cd8f Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Fri, 15 May 2026 19:39:10 +0300 Subject: [PATCH 1/3] fix(modeline): split modeline parts on top-level commas only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ParseModeline previously split the modeline body on the literal `, ` (comma+space) string, which is the separator GenerateModeline emits between key-pairs. JSON arrays serialised by encoding/json carry no space after each inner comma ("["a","b"]"), so the canonical talm-generated form parsed back cleanly. But a hand-authored modeline with whitespace inside the array ("nodes=["a", "b"]" — the natural shape) was cut at the first inner comma. The leading half ("nodes=["a"") then failed JSON parsing with "unexpected end of JSON input", with a hint that misled the operator into thinking their JSON was malformed. Replace the literal split with a depth-aware tokenizer that tracks `[`/`]` nesting and JSON string state (including backslash escapes) and only emits a key-pair boundary when it sees a `,` at depth 0 outside a string literal. The canonical no-space form still parses; the human-written form now parses; commas inside string literals never split the value. Update the contract test that pinned the old strict separator to document the relaxation, and add cases for multi-element arrays with various whitespace shapes plus comma-in-string-literal coverage. Extend the manual-test-plan with section C6f covering the parser behaviour on both anchor and side-patch slots. Signed-off-by: Aleksei Sviridkin --- docs/manual-test-plan.md | 28 +++++++ pkg/modeline/contract_test.go | 143 ++++++++++++++++++++++++++++++---- pkg/modeline/modeline.go | 66 +++++++++++++++- pkg/modeline/modeline_test.go | 49 ++++++++++++ 4 files changed, 270 insertions(+), 16 deletions(-) diff --git a/docs/manual-test-plan.md b/docs/manual-test-plan.md index d4ef4788..e4fee400 100644 --- a/docs/manual-test-plan.md +++ b/docs/manual-test-plan.md @@ -377,6 +377,34 @@ Inspect any apply progress line from C1, C3, C6a, C6c. Expect `nodes=[n1,n2,n3]` Regression anchor: the format uses `strings.Join(slice, ",")` with surrounding brackets in the format string; `%s` or `%v` directly on a `[]string` produces the space-separated `[n1 n2 n3]` artifact and is a regression. Verify with a multi-node modeline (`# talm: nodes=["1.2.3.4","5.6.7.8"], ...`) → progress line shows `nodes=[1.2.3.4,5.6.7.8]`. +### C6f. Multi-IP modeline parses regardless of whitespace inside the JSON array + +```bash +# Anchor with hand-written multi-IP modeline (note spaces after commas): +cat > nodes/multi.yaml <<'EOF' +# talm: nodes=["1.2.3.4", "5.6.7.8"], endpoints=["1.2.3.4", "5.6.7.8"], templates=["templates/controlplane.yaml"] +machine: {} +EOF +talm apply --dry-run -f nodes/multi.yaml + +# Side-patch slot that accidentally carries a modeline (rejection path): +cat > /tmp/side-modelined.yaml <<'EOF' +# talm: nodes=["1.2.3.4", "5.6.7.8"], endpoints=["1.2.3.4", "5.6.7.8"] +machine: + kernel: + modules: + - name: dm_thin_pool +EOF +talm apply --dry-run -f nodes/node0.yaml -f /tmp/side-modelined.yaml +``` + +Expected: + +- First invocation: progress line includes `nodes=[1.2.3.4,5.6.7.8]` (two nodes, parsed from the multi-element array). The whitespace after each comma inside `nodes=[…]` and `endpoints=[…]` is tolerated. +- Second invocation: rejection error `side-patch /tmp/side-modelined.yaml carries its own modeline; the apply chain treats only the first -f file as the anchor and stacks subsequent files as side-patches` with the per-file-loop / strip-modeline hint. The error is the high-level "side-patches must not have modelines" gate from `rejectModelinedSidePatches`, NOT a low-level `error parsing JSON array for key nodes` — the parser successfully reads the multi-IP form and the rejection happens on the (correctly identified) modelined-side-patch shape. + +Regression anchor: the modeline parser splits on commas at JSON-array depth 0 only. A regression that re-introduces a literal comma-plus-space split would silently truncate multi-element arrays — the anchor case above would fail with `unexpected end of JSON input` on `["1.2.3.4"` (array cut at the first comma). Verify the depth tracking by adding a comma inside a string literal: `# talm: nodes=["a,b"]` must parse to `Nodes=["a,b"]`, not split into two tokens. + ### C7. Phase 1 walker rejects malformed net-addr fields before the RPC When a rendered MachineConfig carries a malformed value in any of the new walker-covered fields, Phase 1 must block before the apply RPC fires: diff --git a/pkg/modeline/contract_test.go b/pkg/modeline/contract_test.go index 3f9884e9..e9526945 100644 --- a/pkg/modeline/contract_test.go +++ b/pkg/modeline/contract_test.go @@ -94,25 +94,140 @@ func TestContract_ParseModeline_RejectsMalformedKeyValue(t *testing.T) { } } -// Contract: keys are separated by `, ` (comma then a single space). -// This is the same separator GenerateModeline emits, so a generated -// modeline always parses back. Missing the space before the next key -// fails to find the part-boundary; trailing whitespace inside a JSON -// array is tolerated (json.Unmarshal handles it). -func TestContract_ParseModeline_KeyValueSeparatorContract(t *testing.T) { - // Canonical (matches GenerateModeline output). - canonical := `# talm: nodes=["a"], endpoints=["b"], templates=["c"]` - got, err := ParseModeline(canonical) - if err != nil { - t.Fatalf("canonical line failed: %v", err) - } +// Contract: keys are separated by `,` at JSON-array depth 0. The +// canonical separator GenerateModeline emits is `, ` (comma+space), +// but the parser is depth-aware so it also accepts the no-space form +// and arbitrary whitespace around the comma. A comma INSIDE a +// `nodes=["a", "b"]` array is array-internal and never splits the +// key-pair. This relaxation lets operators hand-author shared +// side-patches with multi-IP modelines in the natural form. +// +// All four forms below MUST parse to the same Config — the parser is +// liberal in what it accepts; GenerateModeline stays strict on output. +func TestContract_ParseModeline_KeyValueSeparator(t *testing.T) { want := &Config{ Nodes: []string{"a"}, Endpoints: []string{"b"}, Templates: []string{"c"}, } - if !reflect.DeepEqual(got, want) { - t.Errorf("canonical parse mismatch\n got: %+v\nwant: %+v", got, want) + cases := []struct { + name string + line string + }{ + {"canonical (matches GenerateModeline)", `# talm: nodes=["a"], endpoints=["b"], templates=["c"]`}, + {"no space after comma", `# talm: nodes=["a"],endpoints=["b"],templates=["c"]`}, + {"multiple spaces after comma", `# talm: nodes=["a"], endpoints=["b"], templates=["c"]`}, + {"space before comma too", `# talm: nodes=["a"] , endpoints=["b"] , templates=["c"]`}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := ParseModeline(tc.line) + if err != nil { + t.Fatalf("expected to parse, got: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Errorf("parse mismatch\n got: %+v\nwant: %+v", got, want) + } + }) + } +} + +// Contract: a comma inside a JSON array (multi-element value) never +// splits the key-pair. This is the depth-0 promise that the old +// literal `, ` split broke: `nodes=["a", "b"]` was cut to +// `nodes=["a"` + ` "b"]` and the first half failed JSON parsing. +func TestContract_ParseModeline_CommaInsideArrayNeverSplits(t *testing.T) { + cases := []struct { + name string + line string + want *Config + }{ + { + "multi-element nodes", + `# talm: nodes=["1.2.3.4", "5.6.7.8"]`, + &Config{Nodes: []string{testNodeIP1, "5.6.7.8"}}, + }, + { + "comma inside string literal", + `# talm: nodes=["a,b","c"]`, + &Config{Nodes: []string{"a,b", "c"}}, + }, + { + "escaped quote inside string literal", + `# talm: nodes=["a\"b","c"]`, + &Config{Nodes: []string{`a"b`, "c"}}, + }, + { + // Square brackets inside a string literal must not affect + // the depth counter — inString short-circuits the `[`/`]` + // arms of the splitter switch. A regression that drops the + // short-circuit would miscount depth and either split on a + // `,` inside the string, or fail to split on a legitimate + // key-pair `,` that follows. + "square brackets inside string literal", + `# talm: nodes=["[,]","x"]`, + &Config{Nodes: []string{"[,]", "x"}}, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := ParseModeline(tc.line) + if err != nil { + t.Fatalf("expected to parse, got: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Errorf("parse mismatch\n got: %+v\nwant: %+v", got, tc.want) + } + }) + } +} + +// Contract: a trailing `,` at depth 0 (e.g. `nodes=["a"],` with +// nothing after it) is REJECTED. The splitter emits an empty token +// for the missing key=value pair, which fails the `SplitN(part, "=", 2)` +// length check with `invalid format of modeline part`. Pinned so a +// future "helpfully ignore empty tokens" patch can't silently flip +// rejection semantics. +func TestContract_ParseModeline_TrailingCommaRejected(t *testing.T) { + cases := []string{ + `# talm: nodes=["a"],`, + `# talm: nodes=["a"], `, + `# talm: nodes=["a"], endpoints=["b"],`, + } + for _, line := range cases { + t.Run(line, func(t *testing.T) { + _, err := ParseModeline(line) + if err == nil { + t.Errorf("expected rejection for trailing comma in %q, got nil", line) + } + }) + } +} + +// Contract: an unbalanced closing `]` does not panic the depth +// counter. The splitter clamps depth at zero, so a stray `]` at +// depth 0 stays at depth 0; subsequent commas are still treated as +// key-pair separators. The resulting token shape fails downstream +// JSON unmarshalling, producing a parse error rather than a panic. +// A regression that drops the `if depth > 0` clamp would let depth +// go negative — at which point an inner array's content would be +// misclassified. +func TestContract_ParseModeline_UnbalancedClosingBracketDoesNotPanic(t *testing.T) { + // No assertion on specific error text; the contract is "rejects + // without panicking". A nil error would mean the parser somehow + // accepted clearly malformed JSON, which is a regression too. + cases := []string{ + `# talm: nodes="a"]`, + `# talm: nodes=]`, + `# talm: nodes=["a"]], endpoints=["b"]`, + } + for _, line := range cases { + t.Run(line, func(t *testing.T) { + _, err := ParseModeline(line) + if err == nil { + t.Errorf("expected error for unbalanced bracket in %q, got nil", line) + } + }) } } diff --git a/pkg/modeline/modeline.go b/pkg/modeline/modeline.go index 2fd68c0d..1e93a6fd 100644 --- a/pkg/modeline/modeline.go +++ b/pkg/modeline/modeline.go @@ -10,6 +10,69 @@ import ( "github.com/cockroachdb/errors" ) +// splitModelineParts splits the modeline body (everything after the +// `# talm: ` prefix) into `key=value` tokens. Splits on `,` only at +// JSON nesting depth 0, so a comma inside a `nodes=["a", "b"]` array +// no longer cuts the value mid-stream. Tracks `[`/`]` depth and `"` +// string state with JSON-style backslash escapes; a `,` or `]` inside +// a JSON string literal is treated as data, not structure. +// +// This is more permissive than the old `, ` literal split — both the +// canonical talm-generated form (no whitespace inside arrays) and the +// human-written form (whitespace after each array element) now parse. +// Whitespace AROUND tokens is trimmed by the caller's SplitN step. +// +// Scope: JSON-array values only. The splitter does NOT track `{`/`}` +// nesting because every modeline key in the current contract (nodes, +// endpoints, templates) is a JSON array — a `{` at depth 0 will fall +// through to the downstream json.Unmarshal which rejects non-array +// inputs. If a future modeline key takes a JSON-object value, extend +// the depth counter to track `{`/`}` too. +func splitModelineParts(content string) []string { + parts := make([]string, 0) + depth := 0 + inString := false + escape := false + start := 0 + + for i := range len(content) { + c := content[i] + + if inString { + switch { + case escape: + escape = false + case c == '\\': + escape = true + case c == '"': + inString = false + } + + continue + } + + switch c { + case '"': + inString = true + case '[': + depth++ + case ']': + if depth > 0 { + depth-- + } + case ',': + if depth == 0 { + parts = append(parts, content[start:i]) + start = i + 1 + } + } + } + + parts = append(parts, content[start:]) + + return parts +} + // Config structure for storing settings from modeline. type Config struct { Nodes []string @@ -35,8 +98,7 @@ func ParseModeline(line string) (*Config, error) { if after, ok := strings.CutPrefix(trimLine, prefix); ok { content := after - parts := strings.SplitSeq(content, ", ") - for part := range parts { + for _, part := range splitModelineParts(content) { keyVal := strings.SplitN(strings.TrimSpace(part), "=", 2) if len(keyVal) != 2 { //nolint:wrapcheck // cockroachdb/errors.WithHintf is the project's wrapping/hinting idiom diff --git a/pkg/modeline/modeline_test.go b/pkg/modeline/modeline_test.go index 5d75d60f..6e5866d0 100644 --- a/pkg/modeline/modeline_test.go +++ b/pkg/modeline/modeline_test.go @@ -31,6 +31,55 @@ func TestParseModeline(t *testing.T) { }, wantErr: false, }, + { + // Human-written form of a shared side-patch modeline: each + // element of the JSON array is followed by a single space. + // The old `SplitSeq(content, ", ")` literal split treated + // the inner space-comma as the key-pair separator and cut + // the array value at the first element. + name: "multi-element array with space after comma", + line: `# talm: nodes=["1.2.3.4", "127.0.0.1", "192.168.100.2"]`, + want: &Config{ + Nodes: []string{testNodeIP1, testLoopback, testNodeIP2}, + }, + wantErr: false, + }, + { + // Same payload, all three keys, mixed spacing — guards the + // depth-0 boundary so a comma INSIDE an array never collides + // with a comma BETWEEN key-pairs. + name: "all keys multi-element mixed spacing", + line: `# talm: nodes=["1.2.3.4", "127.0.0.1"], endpoints=["1.2.3.4","127.0.0.1"], templates=["templates/controlplane.yaml", "templates/worker.yaml"]`, + want: &Config{ + Nodes: []string{testNodeIP1, testLoopback}, + Endpoints: []string{testNodeIP1, testLoopback}, + Templates: []string{testTemplateControlPln, "templates/worker.yaml"}, + }, + wantErr: false, + }, + { + // Comma inside a JSON string literal must NOT split. Rare in + // practice (IPs and template paths have no commas) but the + // splitter's depth/string tracking promises this. + name: "comma inside string literal", + line: `# talm: nodes=["a,b","c"]`, + want: &Config{ + Nodes: []string{"a,b", "c"}, + }, + wantErr: false, + }, + { + // Key-pair separator without the trailing space: a stricter + // form than canonical talm output but operators may hand-edit + // to this shape. Now accepted (was previously rejected). + name: "key-pair separator without trailing space", + line: `# talm: nodes=["1.2.3.4"],endpoints=["127.0.0.1"]`, + want: &Config{ + Nodes: []string{testNodeIP1}, + Endpoints: []string{testLoopback}, + }, + wantErr: false, + }, } for _, tc := range testCases { From e05272724b4829bf12b9bacaf2b7a40fe67d586b Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Fri, 15 May 2026 20:15:14 +0300 Subject: [PATCH 2/3] feat(charts): expose Talos operator extension points in cozystack + generic presets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operators wanting to extend the Talos machine config rendered by the cozystack or generic presets had to fork the chart or hand-edit the generated per-node YAML against the autogen banner asking them to keep template edits in values. Add four "extra*" values keys that plug into the load-bearing sections of machine.* — kernel modules, sysctls, kubelet.extraConfig, machine.files — without overriding the preset's hardcoded defaults. Cozystack preset (append / merge to existing built-ins): extraKernelModules [] append to openvswitch / drbd / zfs / spl / vfio_pci / vfio_iommu_type1 extraKubeletExtraArgs {} merge into cpuManagerPolicy + maxPods; last-key-wins on conflict (escape hatch) extraSysctls {} merge into gc_thresh* + nr_hugepages extraMachineFiles [] append to CRI customization + lvm.conf Generic preset (emit only when set): extraKernelModules [] emits machine.kernel.modules block extraKubeletExtraArgs {} emits kubelet.extraConfig block extraSysctls {} emits machine.sysctls block extraMachineFiles [] emits machine.files block Contract tests under pkg/engine pin append vs merge semantics for every key on cozystack, on / off state on generic, and the default generic render still emits no machine.kernel / sysctls / files / extraConfig blocks (the existing NoCozystackOpinionsOnGeneric guard stays green). Manual-test-plan B7 / B8 cover both presets with forward-looking yq queries against the rendered output. Signed-off-by: Aleksei Sviridkin --- charts/cozystack/templates/_helpers.tpl | 31 ++ charts/cozystack/values.yaml | 59 ++++ charts/generic/templates/_helpers.tpl | 17 ++ charts/generic/values.yaml | 37 +++ docs/manual-test-plan.md | 70 +++++ pkg/engine/contract_machine_test.go | 386 ++++++++++++++++++++++++ 6 files changed, 600 insertions(+) diff --git a/charts/cozystack/templates/_helpers.tpl b/charts/cozystack/templates/_helpers.tpl index a744a5ee..8dc859d2 100644 --- a/charts/cozystack/templates/_helpers.tpl +++ b/charts/cozystack/templates/_helpers.tpl @@ -39,9 +39,31 @@ machine: - {{ . }} {{- end }} {{- end }} + {{- /* extraKubeletExtraArgs MUST NOT collide with the preset's + built-in extraConfig keys — yaml.v3 (used by Talos config + decode and by the upgrade-time body write-back) rejects + duplicate map keys, so a silent merge would emit a config + that cannot decode. Fail at render with a precise hint + naming the offending key; operators wanting a different + default fork the preset. */ -}} + {{- range $k, $_ := .Values.extraKubeletExtraArgs }} + {{- if or (eq $k "cpuManagerPolicy") (eq $k "maxPods") }} + {{- fail (printf "values.yaml: extraKubeletExtraArgs.%s collides with the cozystack preset's built-in kubelet.extraConfig — keys never override (yaml.v3 rejects duplicate map keys on decode). Remove the entry from extraKubeletExtraArgs, or fork the chart preset if you need a different default." $k) }} + {{- end }} + {{- end }} extraConfig: cpuManagerPolicy: static maxPods: 512 + {{- with .Values.extraKubeletExtraArgs }} + {{- toYaml . | nindent 6 }} + {{- end }} + {{- /* extraSysctls MUST NOT collide with the preset's built-in + sysctls; same rationale as extraKubeletExtraArgs. */ -}} + {{- range $k, $_ := .Values.extraSysctls }} + {{- if or (eq $k "vm.nr_hugepages") (eq $k "net.ipv4.neigh.default.gc_thresh1") (eq $k "net.ipv4.neigh.default.gc_thresh2") (eq $k "net.ipv4.neigh.default.gc_thresh3") }} + {{- fail (printf "values.yaml: extraSysctls.%s collides with the cozystack preset's built-in machine.sysctls — keys never override (yaml.v3 rejects duplicate map keys on decode). Remove the entry from extraSysctls, or fork the chart preset if you need a different default." $k) }} + {{- end }} + {{- end }} sysctls: {{- with $.Values.nr_hugepages }} vm.nr_hugepages: {{ . | quote }} @@ -49,6 +71,9 @@ machine: net.ipv4.neigh.default.gc_thresh1: "4096" net.ipv4.neigh.default.gc_thresh2: "8192" net.ipv4.neigh.default.gc_thresh3: "16384" + {{- with .Values.extraSysctls }} + {{- toYaml . | nindent 4 }} + {{- end }} kernel: modules: - name: openvswitch @@ -59,6 +84,9 @@ machine: - name: spl - name: vfio_pci - name: vfio_iommu_type1 + {{- with .Values.extraKernelModules }} + {{- toYaml . | nindent 4 }} + {{- end }} certSANs: - 127.0.0.1 {{- with .Values.certSANs }} @@ -84,6 +112,9 @@ machine: devices { global_filter = [ "r|^/dev/drbd.*|", "r|^/dev/dm-.*|", "r|^/dev/zd.*|" ] } + {{- with .Values.extraMachineFiles }} + {{- toYaml . | nindent 2 }} + {{- end }} install: {{- with .Values.image }} image: {{ . }} diff --git a/charts/cozystack/values.yaml b/charts/cozystack/values.yaml index 33b7be82..5a854977 100644 --- a/charts/cozystack/values.yaml +++ b/charts/cozystack/values.yaml @@ -102,3 +102,62 @@ oidcIssuerUrl: "" certSANs: [] nr_hugepages: 0 allocateNodeCIDRs: true + +# Operator-supplied extension points: each `extra*` key ADDS to a +# load-bearing default the cozystack preset ships. For every key +# below: leaving it at the empty default is a no-op; the preset's +# hardcoded set continues to render unchanged. Keys never override +# the preset's built-ins — if a map-shaped `extra*` value contains a +# key that collides with a built-in, the chart fails at render time +# with a guidance message pointing at the offending key. yaml.v3 +# rejects duplicate map keys on decode anyway, so a silent override +# would produce a Talos config that cannot round-trip. Operators who +# need different defaults fork the preset. + +# Extra kernel modules to load on every node, appended to the preset's +# built-in list (openvswitch, drbd, zfs, spl, vfio_pci, vfio_iommu_type1). +# Lists append unconditionally — duplicates by `name` are tolerated by +# Talos at load time. Each entry is a Talos kernel module spec; see +# Talos docs for the full shape. Examples: +# extraKernelModules: +# - name: nf_conntrack +# - name: foo +# parameters: +# - bar=baz +extraKernelModules: [] + +# Extra kubelet args added to the preset's kubelet.extraConfig +# (cpuManagerPolicy: static, maxPods: 512). Operator keys must be +# DISJOINT from the built-in set; a collision fails the render with +# a hint pointing at the offending key. Example: +# extraKubeletExtraArgs: +# feature-gates: "NodeSwap=true" +extraKubeletExtraArgs: {} + +# Extra kernel sysctls added to machine.sysctls alongside the +# preset's built-ins (net.ipv4.neigh.default.gc_thresh{1,2,3} and +# vm.nr_hugepages — the latter is gated by the dedicated `nr_hugepages` +# values key above; set THAT key, not extraSysctls.vm.nr_hugepages, +# even when the gate is currently inactive — the collision check +# treats vm.nr_hugepages as preset-owned regardless of the gate state). +# Operator keys must be DISJOINT from the built-in set; a collision +# fails the render. Values must be YAML strings (Talos expects +# strings even for numeric sysctls). Example: +# extraSysctls: +# net.core.somaxconn: "65535" +# fs.inotify.max_user_watches: "524288" +extraSysctls: {} + +# Extra machine.files entries appended to the preset's built-in +# files (CRI customization and lvm.conf). Lists append; Talos +# rejects duplicate `path:` entries at apply time, so operator +# entries must not name a path the preset already writes. Each +# entry is a Talos machine.files spec (content / path / permissions +# / op). Example: +# extraMachineFiles: +# - path: /etc/cri/conf.d/30-extra.part +# op: create +# content: | +# [debug] +# level = "info" +extraMachineFiles: [] diff --git a/charts/generic/templates/_helpers.tpl b/charts/generic/templates/_helpers.tpl index 008bc68d..9bfb11f9 100644 --- a/charts/generic/templates/_helpers.tpl +++ b/charts/generic/templates/_helpers.tpl @@ -34,10 +34,27 @@ machine: - {{ . }} {{- end }} {{- end }} + {{- with .Values.extraKubeletExtraArgs }} + extraConfig: + {{- toYaml . | nindent 6 }} + {{- end }} {{- with .Values.certSANs }} certSANs: {{- toYaml . | nindent 2 }} {{- end }} + {{- with .Values.extraSysctls }} + sysctls: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.extraKernelModules }} + kernel: + modules: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.extraMachineFiles }} + files: + {{- toYaml . | nindent 2 }} + {{- end }} install: {{- (include "talm.discovered.disks_info" .) | nindent 4 }} disk: {{ include "talm.discovered.system_disk_name" . | quote }} diff --git a/charts/generic/values.yaml b/charts/generic/values.yaml index 2a9d5458..a8f05dd1 100644 --- a/charts/generic/values.yaml +++ b/charts/generic/values.yaml @@ -87,3 +87,40 @@ serviceSubnets: advertisedSubnets: [] certSANs: [] + +# Operator-supplied extension points. The generic preset ships no +# defaults for any of the keys below, so each rendered block appears +# only when the corresponding value is non-empty. Keep parity with the +# cozystack preset's `extra*` keys so a values file is portable +# between presets. + +# Kernel modules to load on every node. Each entry is a Talos kernel +# module spec — see Talos docs for the full shape. Example: +# extraKernelModules: +# - name: nf_conntrack +# - name: foo +# parameters: +# - bar=baz +extraKernelModules: [] + +# Kubelet command-line args, rendered as machine.kubelet.extraConfig. +# Talos expects string values. Example: +# extraKubeletExtraArgs: +# maxPods: "256" +# feature-gates: "NodeSwap=true" +extraKubeletExtraArgs: {} + +# Kernel sysctls, rendered as machine.sysctls. Values must be YAML +# strings (Talos expects strings even for numeric sysctls). Example: +# extraSysctls: +# net.core.somaxconn: "65535" +extraSysctls: {} + +# Machine files to drop on every node. Each entry is a Talos +# machine.files spec (content / path / permissions / op). Example: +# extraMachineFiles: +# - path: /etc/example.conf +# op: create +# content: | +# hello = "world" +extraMachineFiles: [] diff --git a/docs/manual-test-plan.md b/docs/manual-test-plan.md index e4fee400..8a5f8098 100644 --- a/docs/manual-test-plan.md +++ b/docs/manual-test-plan.md @@ -246,6 +246,76 @@ Expected: both renders print "OK" — no `address: 169.254.…` lines anywhere i Regression anchor: a regression that re-introduces the v1.11-only `host`-scope filter would let link-local addresses leak into the legacy `machine.network.interfaces[].addresses` block on a future COSI schema bump that emitted them on the default-gateway-bearing link. Pinned by `TestContract_NetworkLegacy_DefaultAddressesFilterLinkScope` with a `linkScopedAddressOnDefaultGatewayLookup` fixture, but the live render is still useful as a sanity check against real cluster discovery output. +### B7. `extra*` operator extension points (cozystack: append to defaults) + +The cozystack preset ships hardcoded defaults for kernel modules, sysctls, kubelet extraConfig, and machine.files. The matching `extra*` values keys let an operator extend each block without forking the chart. Set every key, re-render, and inspect each affected section: + +```yaml +# values.yaml additions: +extraKernelModules: + - name: nf_conntrack +extraKubeletExtraArgs: + feature-gates: "NodeSwap=true" +extraSysctls: + net.core.somaxconn: "65535" +extraMachineFiles: + - path: /etc/example/operator.conf + op: create + content: | + hello = "world" +``` + +```bash +talm template -f nodes/node0.yaml > /tmp/render.yaml +yq '.machine.kernel.modules' /tmp/render.yaml +yq '.machine.kubelet.extraConfig' /tmp/render.yaml +yq '.machine.sysctls' /tmp/render.yaml +yq '.machine.files[].path' /tmp/render.yaml +``` + +Expected: + +- `.machine.kernel.modules` lists the built-in six (`openvswitch`, `drbd` with `usermode_helper=disabled`, `zfs`, `spl`, `vfio_pci`, `vfio_iommu_type1`) AND `nf_conntrack` — append, never override. +- `.machine.kubelet.extraConfig` carries the built-ins (`cpuManagerPolicy: static`, `maxPods: 512`) AND `feature-gates: NodeSwap=true`. Operator keys MUST NOT collide with built-ins; a collision fails the render at template time. +- `.machine.sysctls` carries the built-in `gc_thresh1/2/3` and `vm.nr_hugepages` (when set) AND `net.core.somaxconn`. Same rejection-on-collision rule. +- `.machine.files[].path` lists `/etc/cri/conf.d/20-customization.part`, `/etc/lvm/lvm.conf`, AND `/etc/example/operator.conf`. + +Verify the rejection path explicitly. Set an operator key that collides with a built-in (e.g. `extraSysctls: { "net.ipv4.neigh.default.gc_thresh1": "9000" }`) and re-render: + +```bash +talm template -f nodes/node0.yaml 2>&1 | grep -E 'extraSysctls\.net\.ipv4\.neigh\.default\.gc_thresh1 collides' +``` + +Expected: render fails with `Error: values.yaml: extraSysctls.net.ipv4.neigh.default.gc_thresh1 collides with the cozystack preset's built-in machine.sysctls — keys never override (yaml.v3 rejects duplicate map keys on decode). Remove the entry from extraSysctls, or fork the chart preset if you need a different default.` Same shape for `extraKubeletExtraArgs.maxPods` etc. The "fork the preset" escape hatch is the only supported path to change a built-in default. + +Regression anchor: contract tests `TestContract_Machine_Extra{KernelModules,KubeletExtraArgs,Sysctls,MachineFiles}_Cozystack_*` pin append / merge semantics for each section. The `_RejectsCollisionWithBuiltin` tests pin the failure mode for every built-in key. A regression that swaps the chart-level `fail` for a silent `toYaml | nindent` would render duplicate map keys; the matching `_MergeInto*` test round-trip-decodes through yaml.v3 (which refuses duplicates), and the `_RejectsCollisionWithBuiltin` cases would also fail. + +### B8. `extra*` operator extension points (generic: emit-only-when-set) + +The generic preset ships no defaults for the same four blocks. Each `machine.*` block appears in the render ONLY when its `extra*` values key is non-empty. + +With all `extra*` keys at their empty defaults: + +```bash +talm template -f nodes/node0.yaml > /tmp/render.yaml +for k in kernel sysctls files; do + if [ "$(yq ".machine.$k" /tmp/render.yaml)" = "null" ]; then + echo "OK: no $k block" + else + echo "FAIL: $k block leaked into default generic render" + fi +done +if [ "$(yq '.machine.kubelet.extraConfig' /tmp/render.yaml)" = "null" ]; then + echo "OK: no extraConfig" +else + echo "FAIL: extraConfig leaked" +fi +``` + +Expected: all four prints "OK". Set any single `extra*` key non-empty, re-render — only the matching block appears; the other three stay absent. The `yq ... = "null"` form is exact-match: `yq` emits the literal string `null` on a single line for an absent path, so a leaking empty block (`modules: []`, `extraConfig: {}`) shows up as a non-`null` value and trips the FAIL branch. + +Regression anchor: contract tests `TestContract_Machine_Extra*_Generic_NonEmptyEmitsBlock` pin the on-state for each block; `TestContract_Machine_NoCozystackOpinionsOnGeneric` pins the off-state at default values. A regression that emits an empty `modules: []` / `sysctls: {}` / `files: []` block in the default render would fail the latter. + ## C. Apply (auth path) This section is the smoke-test for the apply pipe itself; the per-gate matrix lives in **Section C-safety** below. diff --git a/pkg/engine/contract_machine_test.go b/pkg/engine/contract_machine_test.go index 9b74da16..52725805 100644 --- a/pkg/engine/contract_machine_test.go +++ b/pkg/engine/contract_machine_test.go @@ -28,7 +28,10 @@ package engine import ( + "strings" "testing" + + "gopkg.in/yaml.v3" ) // === Shared contracts (both charts, both schemas, both machine types) === @@ -217,6 +220,389 @@ func TestContract_Machine_KernelModules_Cozystack(t *testing.T) { } } +// === Cozystack extension points (extraKernelModules / extraSysctls / extraKubeletExtraArgs / extraMachineFiles) === +// +// The cozystack preset ships hard-coded defaults for kernel modules, +// sysctls, kubelet extraConfig, and machine.files. Operators wanting +// to extend any of these without forking the preset set the matching +// `extra*` values key; the chart appends operator entries to the +// built-in set. +// +// The built-in set is load-bearing for cozystack's storage / +// networking / runtime stack and is NEVER overridable by an +// extension entry: +// +// - List-shaped `extra*` values (extraKernelModules, +// extraMachineFiles) append unconditionally; duplicates by +// identifying field are tolerated at apply time by Talos +// (modules dedupe on load; collision-by-path on machine.files +// is the operator's responsibility). +// - Map-shaped `extra*` values (extraKubeletExtraArgs, extraSysctls) +// are guarded at template time by chart-level `fail`: any operator +// key that names a built-in (e.g. extraSysctls.gc_thresh1) blocks +// the render with a hint pointing at the offending key. yaml.v3 +// (used by Talos config decode and by the upgrade-time body +// writeback in this same branch) rejects duplicate map keys on +// decode, so a silent emit-both merge would produce a config that +// cannot round-trip. The escape hatch is to fork the preset. +// +// The contract tests below pin both shapes: `_AppendValues` / +// `_MergeInto*` round-trip-decode the rendered output through yaml.v3 +// (substring asserts on the bytes would silently pass for invalid +// duplicate-key output), and `_RejectsCollisionWithBuiltin` pins the +// failure mode for every built-in key. + +// Contract: values.extraKernelModules APPENDS to the cozystack preset's +// built-in module list — it never overrides. The built-in six +// (openvswitch, drbd, zfs, spl, vfio_pci, vfio_iommu_type1) are +// load-bearing for cozystack's storage/networking stack, so an +// operator who supplies extra modules must NOT silently drop any of +// the built-ins. +func TestContract_Machine_ExtraKernelModules_Cozystack_AppendValues(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKernelModules": []any{ + map[string]any{"name": "nf_conntrack"}, + map[string]any{"name": "br_netfilter"}, + }, + }) + // Built-in six still present. + assertContains(t, out, "- name: openvswitch") + assertContains(t, out, "- name: drbd") + assertContains(t, out, "- usermode_helper=disabled") + assertContains(t, out, "- name: zfs") + assertContains(t, out, "- name: spl") + assertContains(t, out, "- name: vfio_pci") + assertContains(t, out, "- name: vfio_iommu_type1") + // Operator-supplied modules appended. + assertContains(t, out, "- name: nf_conntrack") + assertContains(t, out, "- name: br_netfilter") +} + +// Contract: an empty values.extraKernelModules (or its default `[]`) +// leaves the cozystack module list IDENTICAL to the built-in six — no +// `[]` suffix, no empty-list artifact, no trailing module lines. The +// `{{- with .Values.extraKernelModules }}` guard relies on Helm's +// emptiness check; a regression that swaps `with` for a bare +// `toYaml .Values.extraKernelModules` would emit `[]` after +// vfio_iommu_type1 and either fail YAML parse or pin the wrong list +// shape. Boundary case between the unset path +// (TestContract_Machine_KernelModules_Cozystack) and the set path +// (_AppendValues). +func TestContract_Machine_ExtraKernelModules_Cozystack_EmptyOmitsAppend(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKernelModules": []any{}, + }) + // Built-in six present. + assertContains(t, out, "- name: vfio_iommu_type1") + // No empty-list artifact after the built-in tail. + assertNotContains(t, out, "vfio_iommu_type1\n []") + assertNotContains(t, out, "vfio_iommu_type1\n[]") + // Exactly six `- name:` lines inside the modules block: parse the + // block bounds and count. Anchors `kernel:` / `certSANs:` are the + // adjacent siblings under machine.* in the cozystack chart. + kernelIdx := strings.Index(out, " kernel:") + if kernelIdx < 0 { + t.Fatalf("kernel: block missing from rendered output") + } + tail := out[kernelIdx:] + endIdx := strings.Index(tail, " certSANs:") + if endIdx < 0 { + t.Fatalf("certSANs: sibling missing — block bounds undetectable") + } + block := tail[:endIdx] + gotCount := strings.Count(block, "- name:") + if gotCount != 6 { + t.Errorf("expected 6 modules in kernel.modules block with empty extraKernelModules, got %d\nblock:\n%s", gotCount, block) + } +} + +// Contract: values.extraKernelModules entries pass through verbatim, +// including the `parameters:` list. A module that needs a module-param +// (the way the built-in drbd carries `usermode_helper=disabled`) must +// emit it identically when supplied through values, otherwise the +// values-driven path silently differs from the hard-coded one. +func TestContract_Machine_ExtraKernelModules_Cozystack_PreservesParameters(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKernelModules": []any{ + map[string]any{ + "name": "nf_conntrack", + "parameters": []any{"hashsize=131072"}, + }, + }, + }) + assertContains(t, out, "- name: nf_conntrack") + assertContains(t, out, "- hashsize=131072") +} + +// Contract: values.extraKubeletExtraArgs entries merge into the +// kubelet.extraConfig map alongside the cozystack preset's +// `cpuManagerPolicy: static` and `maxPods: 512`. The merge is +// validated by round-trip through yaml.v3 — Talos's config decoder +// uses yaml.v3 which REJECTS duplicate map keys, so the rendered +// output must contain each key exactly once. Operator keys must be +// disjoint from the built-in set (the collision case is rejected at +// render time — see _RejectsCollisionWithBuiltin). +func TestContract_Machine_ExtraKubeletExtraArgs_Cozystack_MergeIntoExtraConfig(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKubeletExtraArgs": map[string]any{ + "feature-gates": "NodeSwap=true", + }, + }) + extraConfig := decodeKubeletExtraConfig(t, out) + if got, want := extraConfig["cpuManagerPolicy"], "static"; got != want { + t.Errorf("cpuManagerPolicy: got %v, want %v", got, want) + } + // yaml.v3 decodes the bare numeric 512 as int; the test pins the + // numeric type as well as the value so a regression that quoted + // the built-in would also surface. + if got, want := extraConfig["maxPods"], 512; got != want { + t.Errorf("maxPods: got %v (%T), want %v (int)", got, got, want) + } + if got, want := extraConfig["feature-gates"], "NodeSwap=true"; got != want { + t.Errorf("feature-gates: got %v, want %v", got, want) + } +} + +// Contract: an operator key in extraKubeletExtraArgs that collides +// with a built-in extraConfig key (cpuManagerPolicy, maxPods) MUST +// fail at render time with a precise hint. yaml.v3 rejects duplicate +// map keys on decode, so a silent merge would produce a Talos config +// that cannot round-trip. Fork-the-preset is the documented escape +// hatch; this test pins the rejection so the fork path is the only +// way an operator can change a built-in default. +func TestContract_Machine_ExtraKubeletExtraArgs_Cozystack_RejectsCollisionWithBuiltin(t *testing.T) { + cases := []string{"cpuManagerPolicy", "maxPods"} + for _, key := range cases { + t.Run(key, func(t *testing.T) { + err := renderCozystackExpectError(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKubeletExtraArgs": map[string]any{ + key: "operator-value", + }, + }) + if err == nil { + t.Fatalf("expected render error for collision on key %q, got nil", key) + } + msg := err.Error() + if !strings.Contains(msg, key) { + t.Errorf("error message must name the offending key %q; got: %v", key, err) + } + if !strings.Contains(msg, "extraKubeletExtraArgs") { + t.Errorf("error message must name the offending values key extraKubeletExtraArgs; got: %v", err) + } + }) + } +} + +// Contract: values.extraSysctls entries merge into machine.sysctls +// alongside the cozystack preset's gc_thresh* + vm.nr_hugepages +// defaults. Round-trip-decoded through yaml.v3 so a regression that +// emits a duplicate key would fail decode here. +func TestContract_Machine_ExtraSysctls_Cozystack_MergeIntoSysctls(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraSysctls": map[string]any{ + "net.core.somaxconn": "65535", + }, + }) + sysctls := decodeMachineSysctls(t, out) + // Talos requires sysctl values be strings; the chart's hardcoded + // gc_thresh* entries are explicitly quoted to match. Pin both the + // value and the string type so a regression that emits unquoted + // integers would surface. + for _, k := range []string{ + "net.ipv4.neigh.default.gc_thresh1", + "net.ipv4.neigh.default.gc_thresh2", + "net.ipv4.neigh.default.gc_thresh3", + } { + v, ok := sysctls[k].(string) + if !ok { + t.Errorf("sysctl %q: expected string, got %T (value %v)", k, sysctls[k], sysctls[k]) + } + if v == "" { + t.Errorf("sysctl %q: expected non-empty string value, got empty", k) + } + } + if got, want := sysctls["net.core.somaxconn"], "65535"; got != want { + t.Errorf("operator sysctl: got %v, want %v", got, want) + } +} + +// Contract: an operator key in extraSysctls that collides with the +// preset's built-in machine.sysctls keys MUST fail render. Same +// rationale as extraKubeletExtraArgs. +func TestContract_Machine_ExtraSysctls_Cozystack_RejectsCollisionWithBuiltin(t *testing.T) { + cases := []string{ + "vm.nr_hugepages", + "net.ipv4.neigh.default.gc_thresh1", + "net.ipv4.neigh.default.gc_thresh2", + "net.ipv4.neigh.default.gc_thresh3", + } + for _, key := range cases { + t.Run(key, func(t *testing.T) { + err := renderCozystackExpectError(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraSysctls": map[string]any{ + key: "operator-value", + }, + }) + if err == nil { + t.Fatalf("expected render error for collision on key %q, got nil", key) + } + msg := err.Error() + if !strings.Contains(msg, key) { + t.Errorf("error message must name the offending key %q; got: %v", key, err) + } + if !strings.Contains(msg, "extraSysctls") { + t.Errorf("error message must name the offending values key extraSysctls; got: %v", err) + } + }) + } +} + +// decodeKubeletExtraConfig parses every YAML document in the rendered +// output and returns the machine.kubelet.extraConfig map from +// whichever document carries it. Uses yaml.v3 directly (the decoder +// Talos uses internally) so a duplicate map key in the rendered +// output fails decoding here — the test surfaces the defect instead +// of asserting on substring presence over invalid YAML. +func decodeKubeletExtraConfig(t *testing.T, out string) map[string]any { + t.Helper() + return decodeMachineSubMap(t, out, "kubelet", "extraConfig") +} + +func decodeMachineSysctls(t *testing.T, out string) map[string]any { + t.Helper() + return decodeMachineSubMap(t, out, "sysctls") +} + +func decodeMachineSubMap(t *testing.T, out string, path ...string) map[string]any { + t.Helper() + + dec := yaml.NewDecoder(strings.NewReader(out)) + for { + var doc map[string]any + err := dec.Decode(&doc) + if err != nil { + if err.Error() == "EOF" { + break + } + t.Fatalf("decoding rendered YAML: %v", err) + } + machine, ok := doc["machine"].(map[string]any) + if !ok { + continue + } + cur := any(machine) + for _, p := range path { + m, ok := cur.(map[string]any) + if !ok { + cur = nil + break + } + cur = m[p] + } + if m, ok := cur.(map[string]any); ok { + return m + } + } + t.Fatalf("no document carried machine.%s in rendered output:\n%s", strings.Join(path, "."), out) + return nil +} + +// Contract: values.extraMachineFiles entries append to machine.files +// alongside the cozystack preset's CRI-customization and lvm.conf +// entries. The built-in two must stay intact. +func TestContract_Machine_ExtraMachineFiles_Cozystack_AppendsToFiles(t *testing.T) { + out := renderCozystackWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraMachineFiles": []any{ + map[string]any{ + "path": "/etc/example/operator.conf", + "op": "create", + "content": "hello=world\n", + }, + }, + }) + // Built-in files preserved. + assertContains(t, out, "path: /etc/cri/conf.d/20-customization.part") + assertContains(t, out, "path: /etc/lvm/lvm.conf") + // Operator-supplied entry appended. + assertContains(t, out, "path: /etc/example/operator.conf") + assertContains(t, out, "hello=world") +} + +// === Generic extension points: emit-only-when-set === +// +// The generic preset ships no defaults for any of these blocks. Each +// rendered block appears ONLY when the matching values key is +// non-empty. Pinning the on-state here so a regression that always +// emits an empty `modules: []` (etc.) would fail; the off-state stays +// pinned by TestContract_Machine_NoCozystackOpinionsOnGeneric. + +// Contract: generic preset emits machine.kernel.modules ONLY when +// values.extraKernelModules is non-empty. +func TestContract_Machine_ExtraKernelModules_Generic_NonEmptyEmitsBlock(t *testing.T) { + out := renderGenericWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKernelModules": []any{ + map[string]any{"name": "nf_conntrack"}, + }, + }) + assertContains(t, out, "kernel:") + assertContains(t, out, "modules:") + assertContains(t, out, "- name: nf_conntrack") +} + +// Contract: generic preset emits kubelet.extraConfig ONLY when +// values.extraKubeletExtraArgs is non-empty (default is `{}`, so the +// preset renders kubelet with only nodeIP.validSubnets). +func TestContract_Machine_ExtraKubeletExtraArgs_Generic_NonEmptyEmitsBlock(t *testing.T) { + out := renderGenericWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraKubeletExtraArgs": map[string]any{ + "feature-gates": "NodeSwap=true", + }, + }) + assertContains(t, out, "extraConfig:") + assertContains(t, out, "feature-gates: NodeSwap=true") +} + +// Contract: generic preset emits machine.sysctls ONLY when +// values.extraSysctls is non-empty. +func TestContract_Machine_ExtraSysctls_Generic_NonEmptyEmitsBlock(t *testing.T) { + out := renderGenericWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraSysctls": map[string]any{ + "net.core.somaxconn": "65535", + }, + }) + assertContains(t, out, "sysctls:") + assertContains(t, out, "net.core.somaxconn:") + assertContains(t, out, `"65535"`) +} + +// Contract: generic preset emits machine.files ONLY when +// values.extraMachineFiles is non-empty. +func TestContract_Machine_ExtraMachineFiles_Generic_NonEmptyEmitsBlock(t *testing.T) { + out := renderGenericWith(t, helmEngineEmptyLookup, map[string]any{ + "advertisedSubnets": []any{testAdvertisedSubnet}, + "extraMachineFiles": []any{ + map[string]any{ + "path": "/etc/example/operator.conf", + "op": "create", + "content": "hello=world\n", + }, + }, + }) + assertContains(t, out, "files:") + assertContains(t, out, "path: /etc/example/operator.conf") +} + // Contract: cozystack always prepends 127.0.0.1 to machine.certSANs // (separate from the controlplane-only cluster.apiServer.certSANs // pinned in contract_cluster_test.go). machine-level certSANs control From c349a0532ad243473ac80c6f1f45d686752cf22e Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Fri, 15 May 2026 20:37:47 +0300 Subject: [PATCH 3/3] fix(upgrade): point-patch node body install.image after successful upgrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit talm upgrade resolves the target installer image from values.yaml (canonical "raise the cluster's Talos version" workflow), but the node body's machine.install.image was never resynced afterwards. A follow-up talm apply re-rendered the chart against values.yaml and got the new image, then merged the stale body via MergeFileAsPatch and silently pinned the cluster back to the pre-upgrade image — the next A/B boot would roll back without warning. Point-patch machine.install.image in every -f node body after the upgrade RPC + post-upgrade verify return success. Uses a yaml.v3 node round-trip so the modeline, autogen banner, sibling keys, and per-key comments survive untouched; files without an install.image key (side-patches, orphans, partials) are silently skipped so the handler can blindly fan out over the full -f list. Verify failure intentionally leaves the body alone: the node is on the pre-upgrade image after auto-rollback, and the body must reflect what the node actually runs — not what talosctl was asked to install. Operator opt-out via --skip-post-upgrade-verify (or the --insecure / --stage paths that drop Phase 2C entirely) patches unconditionally on RPC success, trading the verify gate's safety for body-sync. Contract tests pin scalar swap, idempotency (no rewrite when already on target), silent-skip across three no-key shapes, structural-error surface for unexpected machine.install.image kinds, and file-list fan-out semantics. Manual-test-plan E4 covers the happy path, failure-path skip, and operator-opt-out path against a real cluster. Signed-off-by: Aleksei Sviridkin --- docs/manual-test-plan.md | 57 ++ .../contract_upgrade_image_writeback_test.go | 542 ++++++++++++++++++ pkg/commands/upgrade_handler.go | 43 +- pkg/commands/upgrade_image_writeback.go | 314 ++++++++++ 4 files changed, 953 insertions(+), 3 deletions(-) create mode 100644 pkg/commands/contract_upgrade_image_writeback_test.go create mode 100644 pkg/commands/upgrade_image_writeback.go diff --git a/docs/manual-test-plan.md b/docs/manual-test-plan.md index 8a5f8098..db8c1cef 100644 --- a/docs/manual-test-plan.md +++ b/docs/manual-test-plan.md @@ -799,6 +799,63 @@ Expected for the help line: flag listed with `default 1m30s`. Expected for the 1 Regression anchor: the version-mismatch hint emitted on a Phase 2C blocker MUST NOT contain the literal string `90s` — operators passing a custom window would see contradictory advice. The hint should reference "the configured reconcile window (`--post-upgrade-reconcile-window`)" instead. +### E4. Post-upgrade body sync: install.image patched on success + +Substitute `` below with the Talos installer version you want the cluster to land on (e.g. `v1.13.0`). The cozystack v1.12+ render emits a multi-document body (machine + cluster + RegistryMirrorConfig + LinkConfig + Layer2VIPConfig + VLANConfig separated by `---`); record the document count before the upgrade so it can be re-asserted afterwards. + +```bash +# Bump the cluster-wide installer in values.yaml WITHOUT re-running `talm template`: +sed -i.bak 's|^image: .*|image: "ghcr.io/cozystack/cozystack/talos:"|' values.yaml +grep '^image:' values.yaml + +# Snapshot the body shape so the post-upgrade diff can pin both the +# image change AND the multi-doc preservation. +cp nodes/node0.yaml /tmp/node0.before.yaml +docs_before=$(grep -c '^---$' nodes/node0.yaml) +echo "documents before upgrade: $((docs_before + 1))" + +# Body still pins the previous image — it was templated earlier. +yq '.machine.install.image' nodes/node0.yaml + +# Run the upgrade. Image is resolved from values.yaml (--image unset). +talm upgrade -f nodes/node0.yaml +``` + +Expected, in order: + +- stderr: `Using image from values.yaml: ghcr.io/cozystack/cozystack/talos:v1.13.0` +- talosctl upgrade event stream runs through `post check passed`. +- stderr: `post-upgrade verify: waiting 1m30s for the node to finish booting...` then a success line confirming the running version matches the target. +- stderr: `Synced machine.install.image in nodes/node0.yaml to ghcr.io/cozystack/cozystack/talos:` — emitted ONLY after the verify line, never before. +- `nodes/node0.yaml` on disk: `machine.install.image` now equals the target. The `# talm: ...` modeline, the autogen banner, and every other key under `machine.*` are byte-identical to the pre-upgrade file. `git diff nodes/node0.yaml` shows a single-line image change. +- Multi-doc preservation: `grep -c '^---$' nodes/node0.yaml` returns the SAME count as `$docs_before`. `diff /tmp/node0.before.yaml nodes/node0.yaml` shows only the `image:` line as differing — no document is removed, no `---` separator is dropped, no RegistryMirrorConfig / LinkConfig / Layer2VIPConfig / VLANConfig is silently erased. A regression that used `yaml.Unmarshal` instead of a streaming decoder would silently truncate every non-first document, which this check catches. + +Drift check (the bug this scenario closes): + +```bash +talm apply --dry-run -f nodes/node0.yaml +``` + +Expected: Phase 2A drift preview shows zero `machine.install.image` mutation. A regression that drops the write-back step would re-render install.image from the new values.yaml, then merge the stale body image on top, and the drift preview would surface `~ machine.install.image: -> ` — exactly the silent-revert footgun. + +Failure-path coverage (run only if you have a way to force auto-rollback, e.g. cross-vendor image): + +```bash +talm upgrade --image ghcr.io/siderolabs/installer:v1.13.0 -f nodes/node0.yaml +``` + +Expected: Phase 2C version verify FAILS (running version is still the pre-upgrade one because Talos auto-rolled back). The "Synced machine.install.image" line does NOT appear; `nodes/node0.yaml` is byte-identical to its pre-upgrade state. The body must reflect what the node ACTUALLY runs, which is the pre-upgrade image. + +Skip-verify coverage: + +```bash +talm upgrade --skip-post-upgrade-verify -f nodes/node0.yaml +``` + +Expected: stderr emits the "Synced machine.install.image" line immediately after the talosctl RPC returns success — the operator opted out of verify, so the patch fires unconditionally on RPC success. Document the trade-off: skipping verify trades safety (no auto-rollback detection) for body-sync; operators must inspect the node's running version themselves. + +Regression anchor: contract tests `TestContract_WriteBackInstallImage_*` pin the on-disk shape of the patch (scalar swap, idempotency, silent-skip on orphan files, structural errors, file-list fan-out). A regression that fires the write-back BEFORE verify (or instead of verify) would silently pin the body to an image the node never actually ran — the failure-path scenario above would catch it. Cross-reference: `pkg/commands/upgrade_image_writeback.go`. + ## F. CA rotation ### F1. Rotate CA dry-run diff --git a/pkg/commands/contract_upgrade_image_writeback_test.go b/pkg/commands/contract_upgrade_image_writeback_test.go new file mode 100644 index 00000000..cd6a179b --- /dev/null +++ b/pkg/commands/contract_upgrade_image_writeback_test.go @@ -0,0 +1,542 @@ +// Copyright Cozystack Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Contract: after a successful `talm upgrade`, the upgrade handler +// must point-patch machine.install.image in every -f node body so the +// next `talm apply` does not silently revert install.image. These +// tests pin the patcher's shape on disk: scalar value swap, +// idempotency, silent-skip on files without the key, error surface +// when the YAML shape conflicts with the contract, and preservation +// of the autogen banner / talm modeline / per-key comments that +// `talm template -I` writes upstream of upgrade. + +package commands + +import ( + "os" + "path/filepath" + "runtime" + "strings" + "testing" +) + +const ( + testOldImage = "ghcr.io/cozystack/cozystack/talos:v1.12.6" + testNewImage = "ghcr.io/cozystack/cozystack/talos:v1.13.0" +) + +func writeTempFile(t *testing.T, content string) string { + t.Helper() + + dir := t.TempDir() + path := filepath.Join(dir, "node0.yaml") + + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("seeding tempfile: %v", err) + } + + return path +} + +// Contract: a node body with machine.install.image has the scalar +// swapped in-place. The rest of the YAML body — including the talm +// modeline (`# talm: ...`), the autogen banner, sibling keys under +// machine.install, and any per-key comments — must survive the +// round-trip. yaml.v3's node-based parse keeps comments attached; +// regression would surface here as either an unchanged image (write +// silently failed) or a destroyed banner / modeline. +func TestContract_WriteBackInstallImage_HappyPath(t *testing.T) { + body := `# talm: nodes=["192.0.2.10"], endpoints=["192.0.2.10"], templates=["templates/controlplane.yaml"] +# THIS FILE IS AUTOGENERATED. PREFER TEMPLATE EDITS OVER MANUAL ONES. +machine: + type: controlplane + install: + disk: /dev/sda + image: ` + testOldImage + ` +cluster: + clusterName: test +` + + path := writeTempFile(t, body) + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if !patched { + t.Fatalf("expected patched=true on happy path, got false") + } + + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-reading: %v", err) + } + gotStr := string(got) + + // New image present. + if !strings.Contains(gotStr, testNewImage) { + t.Errorf("expected new image %q in output, got:\n%s", testNewImage, gotStr) + } + + // Old image gone. + if strings.Contains(gotStr, testOldImage) { + t.Errorf("old image %q still present after writeback:\n%s", testOldImage, gotStr) + } + + // Modeline preserved (the first-line comment yaml.v3 keeps as the + // document's HeadComment). + if !strings.Contains(gotStr, `# talm: nodes=["192.0.2.10"]`) { + t.Errorf("modeline missing after writeback:\n%s", gotStr) + } + + // Sibling key preserved. + if !strings.Contains(gotStr, "disk: /dev/sda") { + t.Errorf("sibling key machine.install.disk missing after writeback:\n%s", gotStr) + } +} + +// Contract: when the body's machine.install.image already equals the +// target, writeback is a no-op — it does NOT rewrite the file on +// disk. Pinned via mtime: an operator who runs `talm template -I` to +// sync, then `talm upgrade`, then `talm upgrade` again must not see +// the file's mtime bump on the redundant pass (touching the file +// would re-trigger git status, file watchers, etc.). +func TestContract_WriteBackInstallImage_NoOpWhenAlreadyTarget(t *testing.T) { + body := `machine: + install: + image: ` + testNewImage + ` +` + path := writeTempFile(t, body) + + statBefore, err := os.Stat(path) + if err != nil { + t.Fatalf("stat before: %v", err) + } + + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if patched { + t.Errorf("expected patched=false when body already on target image, got true") + } + + statAfter, err := os.Stat(path) + if err != nil { + t.Fatalf("stat after: %v", err) + } + + if !statBefore.ModTime().Equal(statAfter.ModTime()) { + t.Errorf("expected no-op (mtime stable), got mtime change %v -> %v", statBefore.ModTime(), statAfter.ModTime()) + } +} + +// Contract: a file with no machine.install.image key is silently +// skipped. The upgrade handler calls writeBackInstallImageToFiles +// over every -f file; side-patches, orphan patches, and any other +// shape without machine.install.image legitimately surface here, and +// erroring would block the upgrade success path on irrelevant inputs. +func TestContract_WriteBackInstallImage_SkipsFileWithoutKey(t *testing.T) { + cases := []struct { + name string + body string + }{ + { + "side-patch with only network", + "machine:\n network:\n hostname: foo\n", + }, + { + "no machine key at all", + "cluster:\n clusterName: foo\n", + }, + { + "machine.install without image", + "machine:\n install:\n disk: /dev/sda\n", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + path := writeTempFile(t, tc.body) + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("expected silent skip, got error: %v", err) + } + if patched { + t.Errorf("expected patched=false on skip path, got true") + } + + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-reading: %v", err) + } + if string(got) != tc.body { + t.Errorf("expected file unchanged on skip path, got:\n%s\nwant:\n%s", string(got), tc.body) + } + }) + } +} + +// Contract: a structurally wrong shape at the machine.install.image +// path surfaces as an error, not a silent skip. An operator who +// edited the body into an invalid form must see the failure rather +// than have it masked by the silent-skip branch (which is reserved +// for the "key absent" shape). +func TestContract_WriteBackInstallImage_StructuralErrors(t *testing.T) { + cases := []struct { + name string + body string + }{ + { + "machine.install.image is a mapping", + "machine:\n install:\n image:\n ref: foo\n", + }, + { + "machine.install is a scalar", + "machine:\n install: not-a-mapping\n", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + path := writeTempFile(t, tc.body) + if _, err := writeBackInstallImageToNodeBody(path, testNewImage); err == nil { + t.Errorf("expected structural error for %q, got nil", tc.body) + } + }) + } +} + +// Contract: the writeback re-emits the body with the same indent +// shape it was seeded with. talm-rendered node bodies use 2-space +// indent throughout (see the `nindent 2/4/6` ladder in the chart +// helpers); yaml.v3's default Marshal emits 4-space indent which +// would re-indent every line under machine.* on round-trip and +// flood `git diff` with cosmetic noise — making the operator-visible +// claim "git diff shows a single-line image change" (manual-test-plan +// E4) false. +// +// Strategy: seed a body, capture it line-by-line, run the writeback, +// then assert every line is byte-identical EXCEPT the single +// ` image: ` line whose right-hand side changed. A regression +// that re-indented the whole file would fail every nested line. +func TestContract_WriteBackInstallImage_PreservesIndentShape(t *testing.T) { + body := `# talm: nodes=["192.0.2.10"], endpoints=["192.0.2.10"], templates=["templates/controlplane.yaml"] +# THIS FILE IS AUTOGENERATED. PREFER TEMPLATE EDITS OVER MANUAL ONES. +machine: + type: controlplane + kubelet: + nodeIP: + validSubnets: + - 192.0.2.0/24 + install: + disk: /dev/sda + image: ` + testOldImage + ` +cluster: + clusterName: test +` + path := writeTempFile(t, body) + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if !patched { + t.Fatalf("expected patched=true, got false") + } + + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-reading: %v", err) + } + + beforeLines := strings.Split(body, "\n") + afterLines := strings.Split(string(got), "\n") + + if len(beforeLines) != len(afterLines) { + t.Fatalf("expected line count to match (writeback must not add/remove lines), got before=%d after=%d\nafter:\n%s", len(beforeLines), len(afterLines), string(got)) + } + + for i := range beforeLines { + before := beforeLines[i] + after := afterLines[i] + + if strings.HasPrefix(strings.TrimSpace(before), "image:") { + // The image scalar line is allowed to differ — but its + // indent prefix must be unchanged. + beforeIndent := before[:len(before)-len(strings.TrimLeft(before, " "))] + afterIndent := after[:len(after)-len(strings.TrimLeft(after, " "))] + if beforeIndent != afterIndent { + t.Errorf("image line indent changed: %q -> %q", beforeIndent, afterIndent) + } + if !strings.Contains(after, testNewImage) { + t.Errorf("image line should now contain %q, got %q", testNewImage, after) + } + + continue + } + + if before != after { + t.Errorf("line %d unexpectedly changed:\n before: %q\n after: %q", i+1, before, after) + } + } +} + +// Contract: the writeback preserves the source file's permission +// bits. An operator who hardened node-body permissions (chmod 0600 +// for some reason) must not get them silently relaxed to the +// presetFileMode 0o644 by the writeback. Only when Stat fails does +// the writeback fall back to presetFileMode. +// +// Windows is skipped: NTFS does not honour Unix permission bits, so +// os.FileInfo.Mode().Perm() always returns 0o666 there (see the +// project memory entry on cross-platform Go testing). +func TestContract_WriteBackInstallImage_PreservesFileMode(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("Unix permission bits not meaningful on NTFS") + } + + body := "machine:\n install:\n image: " + testOldImage + "\n" + path := writeTempFile(t, body) + + if err := os.Chmod(path, 0o600); err != nil { + t.Fatalf("chmod seed: %v", err) + } + + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if !patched { + t.Fatalf("expected patched=true, got false") + } + + info, err := os.Stat(path) + if err != nil { + t.Fatalf("stat after: %v", err) + } + if got, want := info.Mode().Perm(), os.FileMode(0o600); got != want { + t.Errorf("file mode after writeback: got %v, want %v (operator-hardened mode must be preserved)", got, want) + } +} + +// Contract: a multi-document node body (cozystack v1.12+ render +// shape: machine + cluster + RegistryMirrorConfig + LinkConfig + +// Layer2VIPConfig + VLANConfig separated by `---`) preserves every +// non-machine document on round-trip. yaml.Unmarshal reads only the +// first document — using it on a multi-doc body would silently erase +// the registry / link / VIP / VLAN config from the file on disk. +// The writeback uses yaml.NewDecoder in a loop and re-emits every +// document through a single yaml.NewEncoder; this test pins that +// shape against the exact failure mode. +func TestContract_WriteBackInstallImage_PreservesAllDocuments(t *testing.T) { + body := `# talm: nodes=["192.0.2.10"], endpoints=["192.0.2.10"], templates=["templates/controlplane.yaml"] +machine: + type: controlplane + install: + disk: /dev/sda + image: ` + testOldImage + ` +cluster: + clusterName: test +--- +apiVersion: v1alpha1 +kind: RegistryMirrorConfig +name: docker.io +endpoints: + - url: https://mirror.gcr.io +--- +apiVersion: v1alpha1 +kind: LinkConfig +name: eth0 +up: true +--- +apiVersion: v1alpha1 +kind: Layer2VIPConfig +name: eth0 +vip: 192.0.2.1 +` + path := writeTempFile(t, body) + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if !patched { + t.Fatalf("expected patched=true on multi-doc body, got false") + } + + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-reading: %v", err) + } + gotStr := string(got) + + // Image swap landed. + if !strings.Contains(gotStr, testNewImage) { + t.Errorf("new image %q missing from output:\n%s", testNewImage, gotStr) + } + if strings.Contains(gotStr, testOldImage) { + t.Errorf("old image %q still present after writeback:\n%s", testOldImage, gotStr) + } + + // Every non-machine document survived. Assert by content + // fingerprints — the kind names are unique enough to pin the + // presence of each document. + for _, kind := range []string{"RegistryMirrorConfig", "LinkConfig", "Layer2VIPConfig"} { + if !strings.Contains(gotStr, "kind: "+kind) { + t.Errorf("document with kind %s missing after writeback (multi-doc truncated?):\n%s", kind, gotStr) + } + } + + // Pin the document count via the `---` separator. The input has + // 4 documents (3 separators); the output must have the same. + wantSeparators := strings.Count(body, "\n---\n") + gotSeparators := strings.Count(gotStr, "\n---\n") + if wantSeparators != gotSeparators { + t.Errorf("document-separator count changed: want %d, got %d\noutput:\n%s", wantSeparators, gotSeparators, gotStr) + } + + // Content of preserved documents is byte-for-byte the same. Walk + // the registry-doc body and confirm each line appears verbatim. + for _, marker := range []string{ + "name: docker.io", + "url: https://mirror.gcr.io", + "vip: 192.0.2.1", + } { + if !strings.Contains(gotStr, marker) { + t.Errorf("expected marker %q from a preserved document, got:\n%s", marker, gotStr) + } + } +} + +// Contract: writeback locates machine.install.image even when it is +// NOT in the first document. The cozystack preset emits machine +// first, but a hand-authored body that puts it elsewhere (e.g. +// network docs first, then machine) must still patch correctly. +// Defensive coverage against a future chart re-ordering. +func TestContract_WriteBackInstallImage_FindsImageInAnyDocument(t *testing.T) { + body := `apiVersion: v1alpha1 +kind: RegistryMirrorConfig +name: docker.io +endpoints: + - url: https://mirror.gcr.io +--- +machine: + install: + image: ` + testOldImage + ` +` + path := writeTempFile(t, body) + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if !patched { + t.Fatalf("expected patched=true when machine doc is second, got false") + } + + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("re-reading: %v", err) + } + if !strings.Contains(string(got), testNewImage) { + t.Errorf("new image missing; image not patched when not in first doc:\n%s", string(got)) + } + if !strings.Contains(string(got), "kind: RegistryMirrorConfig") { + t.Errorf("first doc lost on writeback:\n%s", string(got)) + } +} + +// Contract: a multi-document body that is already on the target +// image is a no-op — mtime stays stable so the redundant pass +// doesn't trigger git-status churn or file watchers, AND every +// non-machine document is preserved unchanged (the no-op path must +// not silently truncate the body even though no rewrite is needed). +func TestContract_WriteBackInstallImage_NoOpOnMultiDocAlreadyTarget(t *testing.T) { + body := `machine: + install: + image: ` + testNewImage + ` +--- +apiVersion: v1alpha1 +kind: RegistryMirrorConfig +name: docker.io +` + path := writeTempFile(t, body) + before, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read before: %v", err) + } + statBefore, err := os.Stat(path) + if err != nil { + t.Fatalf("stat before: %v", err) + } + + patched, err := writeBackInstallImageToNodeBody(path, testNewImage) + if err != nil { + t.Fatalf("writeBackInstallImageToNodeBody: %v", err) + } + if patched { + t.Errorf("expected patched=false (already on target), got true") + } + + statAfter, err := os.Stat(path) + if err != nil { + t.Fatalf("stat after: %v", err) + } + if !statBefore.ModTime().Equal(statAfter.ModTime()) { + t.Errorf("expected mtime stable on no-op multi-doc path, got %v -> %v", statBefore.ModTime(), statAfter.ModTime()) + } + + after, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read after: %v", err) + } + if string(before) != string(after) { + t.Errorf("expected file unchanged on no-op multi-doc path; before:\n%s\nafter:\n%s", before, after) + } +} + +// Contract: writeBackInstallImageToFiles iterates the file list, +// processing each via writeBackInstallImageToNodeBody. An empty image +// argument short-circuits to no-op (the upgrade handler reaches this +// with targetImage="" when --image was unset AND the values.yaml +// resolution was bypassed — there is nothing to write back). A list +// with files of mixed shapes (one with image, one without) processes +// both — the no-key file is silently skipped, the keyed file is +// patched. +func TestContract_WriteBackInstallImage_FilesFanOut(t *testing.T) { + keyed := writeTempFile(t, "machine:\n install:\n image: "+testOldImage+"\n") + noKey := writeTempFile(t, "machine:\n network:\n hostname: foo\n") + + if err := writeBackInstallImageToFiles([]string{keyed, noKey}, testNewImage); err != nil { + t.Fatalf("writeBackInstallImageToFiles: %v", err) + } + + keyedOut, err := os.ReadFile(keyed) + if err != nil { + t.Fatalf("re-read keyed: %v", err) + } + if !strings.Contains(string(keyedOut), testNewImage) { + t.Errorf("keyed file not patched: %s", string(keyedOut)) + } + + // Empty image: no-op even on a keyed file. + keyedRetry := writeTempFile(t, "machine:\n install:\n image: "+testOldImage+"\n") + if err := writeBackInstallImageToFiles([]string{keyedRetry}, ""); err != nil { + t.Fatalf("writeBackInstallImageToFiles with empty image: %v", err) + } + retryOut, err := os.ReadFile(keyedRetry) + if err != nil { + t.Fatalf("re-read retry: %v", err) + } + if !strings.Contains(string(retryOut), testOldImage) { + t.Errorf("empty image arg should be a no-op; file changed: %s", string(retryOut)) + } +} diff --git a/pkg/commands/upgrade_handler.go b/pkg/commands/upgrade_handler.go index b722207c..d6a06046 100644 --- a/pkg/commands/upgrade_handler.go +++ b/pkg/commands/upgrade_handler.go @@ -91,7 +91,21 @@ Image resolution (when -f is provided): The first -f file anchors the project root (Chart.yaml + secrets.yaml); its modeline supplies the nodes / endpoints. The node body's machine.install.image is no longer consulted by the -upgrade flow.` +upgrade flow. + +Post-upgrade sync (when the upgrade succeeds): + - talm point-patches machine.install.image in every -f node body + to the image that was applied. Keeps the body consistent with + the running node — without this a follow-up ` + "`talm apply`" + ` would + merge the stale install.image over the chart-rendered new value + and silently pin the cluster back to the pre-upgrade image on + the next A/B boot. Comments, modeline, and unrelated keys are + preserved (yaml.v3 node round-trip); files without an + install.image key (orphans / side-patches) are silently skipped. + - The patch fires after post-upgrade verify confirms the running + version matches the target. A failed verify (auto-rollback) + intentionally leaves the body untouched, so it still reflects + what the node actually runs.` wrappedCmd.Flags().BoolVar(&upgradeCmdFlags.skipPostUpgradeVerify, "skip-post-upgrade-verify", false, "skip the post-upgrade check that compares running Talos version against the target image's tag (detects silent A/B rollback after the RPC acks success)") @@ -212,7 +226,13 @@ upgrade flow.` // Skip predicate documents the cases where this gate cannot // produce a meaningful result. if !shouldRunPostUpgradeVerify(insecure, staged, upgradeCmdFlags.skipPostUpgradeVerify) { - return nil + // Verify skipped (operator opt-out / insecure / staged). + // Still sync node bodies: the RPC was acked, and skipping + // the verify is an explicit operator choice — the body + // must track what talosctl was asked to install so the + // next `talm apply` does not silently revert install.image + // over the chart-rendered new value. + return writeBackInstallImageToFiles(filesToProcess, targetImage) } if targetImage == "" { @@ -221,7 +241,24 @@ upgrade flow.` return nil } - return runPostUpgradeVersionVerify(cmd.Context(), targetImage) + if err := runPostUpgradeVersionVerify(cmd.Context(), targetImage); err != nil { + return err + } + + // Verify did not block — patch the node body. The verify + // helper returns nil on three shapes: (a) running version + // matches the target (the common case), (b) zero nodes were + // resolved (rare — the talosctl upgrade RPC above would have + // also no-op'd, so the patch is vacuously consistent with + // what hit the cluster), (c) the version reader surrendered + // silently (reserved future contract). A failed verify + // (auto-rollback detected) returns non-nil and was already + // returned above, intentionally leaving the body untouched: + // the body still points at the pre-upgrade image, which + // matches what the node ended up running. An operator who + // fixes the rollback cause and re-runs upgrade will sync the + // body on the next pass that clears verify. + return writeBackInstallImageToFiles(filesToProcess, targetImage) } } diff --git a/pkg/commands/upgrade_image_writeback.go b/pkg/commands/upgrade_image_writeback.go new file mode 100644 index 00000000..812ed790 --- /dev/null +++ b/pkg/commands/upgrade_image_writeback.go @@ -0,0 +1,314 @@ +// Copyright Cozystack Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package commands + +import ( + "bytes" + "fmt" + "io" + "os" + + "github.com/cockroachdb/errors" + "gopkg.in/yaml.v3" +) + +// nodeBodyYAMLIndent pins the YAML indent the writeback emits at. +// talm-rendered node bodies use 2-space indent throughout (see the +// `nindent 2/4/6` ladder in charts/cozystack/templates/_helpers.tpl). +// yaml.v3's default Marshal uses 4-space indent, which would +// re-indent every line under `machine:` on round-trip and flood +// `git diff` with cosmetic noise on every upgrade. Pinning to 2 keeps +// the writeback diff to the single image scalar line. +const nodeBodyYAMLIndent = 2 + +// writeBackInstallImageToNodeBody patches machine.install.image inside +// the YAML at filePath, replacing whatever value sits there with newImage. +// Preserves the file's comments, indentation, AND multi-document shape +// via yaml.v3 streaming round-trip; the encoder is pinned to +// nodeBodyYAMLIndent (2) so a follow-up `git diff` shows only the +// single image scalar line. +// +// Multi-document support is load-bearing for the cozystack v1.12+ +// render path: talos.config.multidoc emits machine.config + cluster +// alongside RegistryMirrorConfig, LinkConfig, Layer2VIPConfig and +// VLANConfig documents separated by `---`. A naive yaml.Unmarshal + +// yaml.Marshal would drop every document after the first one, +// silently erasing the registry / network config from the body file +// on disk. Read with a streaming decoder, patch the first document +// that carries machine.install.image, re-emit all documents in order. +// +// Returns (true, nil) on a successful patch, (false, nil) when no +// document has machine.install.image OR the located document is +// already on newImage (both shapes are silent-skip — the caller +// distinguishes the two via the stderr line written here, but the +// return shape is uniform). Returns an error only when the file is +// unreadable, unparseable, the YAML structure conflicts with the +// expected shape (e.g. machine: is a scalar), or the write-back +// itself fails. +// +// Called by talm upgrade after a successful upgrade RPC + post-upgrade +// verify so the cluster's nodes/*.yaml stays consistent with the image +// the node now runs. +func writeBackInstallImageToNodeBody(filePath, newImage string) (bool, error) { + data, err := os.ReadFile(filePath) + if err != nil { + return false, errors.Wrapf(err, "reading node body %s", filePath) + } + + docs, err := decodeAllYAMLDocs(data) + if err != nil { + return false, errors.Wrapf(err, "parsing node body %s", filePath) + } + + imageNode, found, err := locateInstallImageNode(docs) + if err != nil { + return false, errors.Wrapf(err, "locating machine.install.image in %s", filePath) + } + + if !found { + // No document carries the key — orphan / side-patch / + // truncated file. Surface the skip explicitly so an operator + // who passed a file by mistake sees the no-op. + fmt.Fprintf(os.Stderr, "Skipped %s: no machine.install.image key (orphan or side-patch shape)\n", filePath) + + return false, nil + } + + if imageNode.Value == newImage { + // Already on the target image. Skip the disk write so file + // mtime stays stable. + return false, nil + } + + imageNode.Value = newImage + + out, err := encodeAllYAMLDocs(docs) + if err != nil { + return false, errors.Wrapf(err, "re-marshalling node body %s", filePath) + } + + // Resolve the file's mode bits. os.WriteFile applies its mode + // argument ONLY when it creates the file; on the truncate-and- + // rewrite path (the common case here — the file already exists) + // the OS preserves the existing permissions regardless of what + // we pass. So this Stat-and-pass dance covers the rare case + // where the file was deleted between the read above and the + // write below: a fresh file would otherwise pick up the process + // umask. presetFileMode (0o644) is the fallback when Stat + // itself fails (the node body would be unreadable in that case, + // so further hardening would serve no purpose). + mode := presetFileMode + if info, err := os.Stat(filePath); err == nil { + mode = info.Mode().Perm() + } + + //nolint:gosec // filePath is an operator-supplied -f argument; we must write to exactly that path + if err := os.WriteFile(filePath, out, mode); err != nil { + return false, errors.Wrapf(err, "writing back node body %s", filePath) + } + + return true, nil +} + +// decodeAllYAMLDocs streams every document in the buffer through +// yaml.NewDecoder. yaml.Unmarshal reads only the first document, so +// using it would silently truncate multi-doc inputs — exactly the +// shape the cozystack v1.12+ render emits. Returns the documents in +// source order; an empty input returns an empty slice. +func decodeAllYAMLDocs(data []byte) ([]*yaml.Node, error) { + var docs []*yaml.Node + + dec := yaml.NewDecoder(bytes.NewReader(data)) + + for { + var doc yaml.Node + if err := dec.Decode(&doc); err != nil { + if errors.Is(err, io.EOF) { + return docs, nil + } + + //nolint:wrapcheck // caller wraps with the file path + return nil, err + } + + docs = append(docs, &doc) + } +} + +// locateInstallImageNode scans docs in order and returns the first +// scalar node behind machine.install.image. Returns (node, true, nil) +// when found, (nil, false, nil) when no document has the key (the +// silent-skip path), and an error when a document's structural shape +// conflicts with the expected mapping chain (e.g. machine.install is +// a scalar). +func locateInstallImageNode(docs []*yaml.Node) (*yaml.Node, bool, error) { + for _, doc := range docs { + node, found, err := findMachineInstallImageNode(doc) + if err != nil { + return nil, false, err + } + + if found { + return node, true, nil + } + } + + return nil, false, nil +} + +// encodeAllYAMLDocs re-emits docs through a single yaml.NewEncoder +// pinned to nodeBodyYAMLIndent. The encoder writes `---\n` between +// successive Encode calls, reproducing the source's multi-doc shape. +func encodeAllYAMLDocs(docs []*yaml.Node) ([]byte, error) { + var buf bytes.Buffer + + enc := yaml.NewEncoder(&buf) + enc.SetIndent(nodeBodyYAMLIndent) + + for _, doc := range docs { + if err := enc.Encode(doc); err != nil { + _ = enc.Close() + + //nolint:wrapcheck // caller wraps with the file path + return nil, err + } + } + + if err := enc.Close(); err != nil { + //nolint:wrapcheck // caller wraps with the file path + return nil, err + } + + return buf.Bytes(), nil +} + +// findMachineInstallImageNode walks a parsed YAML document looking +// for the scalar node behind .machine.install.image. Returns +// (node, true, nil) when found, (nil, false, nil) when any link in +// the chain is missing, and an error when the chain exists but the +// node shape is wrong (e.g. machine.install is a scalar). +// +// An empty document (zero content) is treated as "key absent" and +// silent-skipped. The caller logs the skip via a stderr line so an +// operator who passed a truncated body sees the no-op explicitly +// instead of wondering why the file was left alone. +func findMachineInstallImageNode(doc *yaml.Node) (*yaml.Node, bool, error) { + if doc.Kind != yaml.DocumentNode || len(doc.Content) == 0 { + return nil, false, nil + } + + root := doc.Content[0] + if root.Kind != yaml.MappingNode { + return nil, false, nil + } + + machine, ok, err := childByKey(root, "machine") + if err != nil || !ok { + return nil, false, err + } + + install, ok, err := childByKey(machine, "install") + if err != nil || !ok { + return nil, false, err + } + + image, ok, err := childByKey(install, "image") + if err != nil || !ok { + return nil, false, err + } + + if image.Kind != yaml.ScalarNode { + //nolint:wrapcheck // sentinel constructed in-place; caller wraps with the file path + return nil, false, errors.Newf("expected scalar at machine.install.image, got %s", yamlKindName(image.Kind)) + } + + return image, true, nil +} + +// yamlKindName renders a yaml.Kind constant as the human-readable +// node-shape name used in Talos / YAML docs. The yaml.v3 constants +// are an iota enum whose numeric values mean nothing to an operator +// reading a structural error — "got kind 4" is opaque, "got mapping" +// is actionable. +func yamlKindName(kind yaml.Kind) string { + switch kind { + case yaml.DocumentNode: + return "document" + case yaml.SequenceNode: + return "sequence" + case yaml.MappingNode: + return "mapping" + case yaml.ScalarNode: + return "scalar" + case yaml.AliasNode: + return "alias" + default: + return fmt.Sprintf("unknown (kind %d)", kind) + } +} + +// childByKey returns the value node for `key` inside a mapping node. +// Returns (nil, false, nil) when the key is absent (legitimate skip +// path), and an error when the surrounding node is not a mapping at +// all (the path the caller is walking is structurally wrong). +func childByKey(parent *yaml.Node, key string) (*yaml.Node, bool, error) { + if parent.Kind != yaml.MappingNode { + //nolint:wrapcheck // sentinel constructed in-place; caller wraps with the file path + return nil, false, errors.Newf("expected mapping while walking to %q, got %s", key, yamlKindName(parent.Kind)) + } + + for i := 0; i < len(parent.Content)-1; i += 2 { + k := parent.Content[i] + if k.Kind == yaml.ScalarNode && k.Value == key { + return parent.Content[i+1], true, nil + } + } + + return nil, false, nil +} + +// writeBackInstallImageToFiles applies writeBackInstallImageToNodeBody +// to every entry in files and prints a short progress line per file +// (matching the rest of the upgrade-handler output style). Returns +// the first error encountered; on success every file is patched (or +// silently skipped if it has no install.image key). +// +// Called from the upgrade handler after the talosctl RPC + post-upgrade +// verify return success — see upgrade_handler.go. +func writeBackInstallImageToFiles(files []string, newImage string) error { + if newImage == "" { + return nil + } + + for _, filePath := range files { + patched, err := writeBackInstallImageToNodeBody(filePath, newImage) + if err != nil { + return err + } + + // Print "Synced" only when the body was actually rewritten. + // The skip path (orphan / side-patch shape) already wrote its + // own "Skipped %s: no machine.install.image key …" line from + // writeBackInstallImageToNodeBody; the no-op-target-match path + // is silent (nothing changed on disk, no operator-facing + // signal needed). A "Skipped … Synced …" pair on the same + // file would be misleading. + if patched { + fmt.Fprintf(os.Stderr, "Synced machine.install.image in %s to %s\n", filePath, newImage) + } + } + + return nil +}