diff --git a/pkg/drbd/drbdadm.go b/pkg/drbd/drbdadm.go index c33988a1..5636f79d 100644 --- a/pkg/drbd/drbdadm.go +++ b/pkg/drbd/drbdadm.go @@ -19,13 +19,24 @@ package drbd import ( "context" "fmt" + "strconv" "strings" + "time" "github.com/cockroachdb/errors" "github.com/cozystack/blockstor/pkg/storage" ) +// DownForceTimeout caps how long DownForce waits for the happy-path +// `drbdadm down ` before falling back to the kernel-direct +// teardown sequence. 15s is the budget the satellite daemonset's +// terminationGracePeriod (30s) and per-resource preStop budget can +// both afford — long enough for a healthy peer to ack the +// disconnect, short enough that a suspended-quorum slot can't stall +// the deletion path indefinitely (Bug 82). +const DownForceTimeout = 15 * time.Second + // Adm is a thin wrapper around the `drbdadm` CLI. It exists so the // satellite reconciler can be unit-tested without a real DRBD kernel // module present: production injects storage.RealExec, tests inject @@ -51,6 +62,75 @@ func (a *Adm) Down(ctx context.Context, resource string) error { return a.run(ctx, "down", resource) } +// DownForce tears the resource down with a bounded timeout, falling +// back to the kernel-direct sequence when the happy-path +// `drbdadm down ` doesn't return in time (Bug 82). +// +// Why the happy-path can hang: when DRBD loses quorum +// (`suspended:quorum`) mid-delete, `drbdadm down` blocks waiting +// for the resource to leave the suspended state, which only +// resolves if a peer comes back. Operators see this as +// `kubectl delete pod` hanging Terminating for many minutes and +// only unblocking under `--force`, leaving a stale +// `/dev/drbd` slot that breaks the next resource allocated +// the same port→minor pair (Bug 82 dev-stand observation). +// +// Flow: +// +// 1. Run `drbdadm down ` under a child context with deadline = +// DownForceTimeout. If it returns within the budget (success or +// a non-hang error), we're done — the kernel slot is gone. +// 2. On timeout / error: for each volume, force-detach the lower +// disk via `drbdsetup detach --force /`. detach does +// NOT wait for quorum, so this returns even when the resource +// is suspended. +// 3. Finalise with `drbdsetup down ` (kernel-direct slot +// release, bypasses drbdadm's quorum-aware wait). Returns nil +// on a "missing slot" exit — the goal state (slot absent) is +// the same. +// +// volumeNumbers is the set of DRBD volume slots to force-detach; +// callers source it from VolumeDefinitions. An empty slice skips +// the per-volume detach and relies on `drbdsetup down` alone — +// belt-and-braces for kernels where `down` won't proceed past a +// transient disk-attached state. +// +// Returns nil on success. Returns a wrapped error only when BOTH +// the bounded `drbdadm down` AND the kernel-direct `drbdsetup +// down` failed. +func (a *Adm) DownForce(ctx context.Context, resource string, volumeNumbers []int32) error { + downCtx, cancel := context.WithTimeout(ctx, DownForceTimeout) + defer cancel() + + err := a.run(downCtx, "down", resource) + if err == nil { + return nil + } + + // Fallback path: force-detach every volume (no quorum wait), + // then kernel-direct `drbdsetup down`. Use the parent ctx so a + // tighter caller deadline still applies. + for _, vol := range volumeNumbers { + target := fmt.Sprintf("%s/%d", resource, vol) + _, _ = a.exec.Run(ctx, "drbdsetup", "detach", "--force", target) + } + + _, downErr := a.exec.Run(ctx, "drbdsetup", "down", resource) + if downErr == nil { + return nil + } + + // `drbdsetup down` on a missing slot exits non-zero with + // "No such resource" / "Unknown resource" — same goal state. + if strings.Contains(downErr.Error(), "No such resource") || + strings.Contains(downErr.Error(), "No currently configured DRBD") || + strings.Contains(downErr.Error(), "Unknown resource") { + return nil + } + + return errors.Wrapf(downErr, "drbdsetup down %s after drbdadm down failed (%v)", resource, err) +} + // Adjust reconciles kernel state to the on-disk .res file. Called after // the ConfFileBuilder writes a new file and we need DRBD to pick up // changes (added/removed peers, new options). @@ -181,16 +261,24 @@ func (a *Adm) Resize(ctx context.Context, resource string) error { // to the peer". History is zeroed — DRBD's handshake never matches // against history when current+bitmap match, so it doesn't matter. // +// DRBD 9.2+ requires `--node-id `: the bitmap-uuid slot is +// per-peer in the modern metadata layout, so the caller must invoke +// SetGi once per peer node-id with the same tuple to stamp every +// peer's bitmap slot. Without --node-id, drbdmeta refuses with +// "The set-gi command requires the --node-id option". +// // Tested via FakeExec capture in pkg/drbd/drbdadm_test.go and // pinned end-to-end in pkg/satellite/reconciler_drbd_test.go's // first-activation case. -func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device, peerCurrentGi string) error { +func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device string, peerNodeID int32, peerCurrentGi string) error { target := fmt.Sprintf("%s/%d", resource, volume) gi := fmt.Sprintf("%s:%s:0:0", peerCurrentGi, peerCurrentGi) - _, err := a.exec.Run(ctx, "drbdmeta", "--force", target, "v09", device, "internal", "set-gi", gi) + _, err := a.exec.Run(ctx, + "drbdmeta", "--force", target, "v09", device, "internal", + "set-gi", "--node-id", strconv.Itoa(int(peerNodeID)), gi) if err != nil { - return errors.Wrapf(err, "drbdmeta set-gi %s", target) + return errors.Wrapf(err, "drbdmeta set-gi %s --node-id %d", target, peerNodeID) } return nil diff --git a/pkg/drbd/drbdadm_test.go b/pkg/drbd/drbdadm_test.go index ca600aa3..0b26b1c7 100644 --- a/pkg/drbd/drbdadm_test.go +++ b/pkg/drbd/drbdadm_test.go @@ -190,7 +190,7 @@ func TestAdmDetachInvokesDrbdadm(t *testing.T) { // TestAdmSetGiInvokesDrbdmeta pins the initial-sync skip seeding // command shape: `drbdmeta --force / v09 -// internal set-gi ::0:0`. Phase 8.1. +// internal set-gi --node-id ::0:0`. // // The format MUST be peer-gi twice (current_uuid + bitmap_uuid both // match the peer's current_uuid), then two zero history slots — a @@ -198,19 +198,32 @@ func TestAdmDetachInvokesDrbdadm(t *testing.T) { // bitmap order would silently break the GI-handshake match and // re-introduce the full initial-sync this whole pipeline exists to // avoid. +// +// DRBD 9.2+ requires `--node-id ` since the bitmap-uuid slot +// is per-peer in modern metadata; dropping it would re-introduce +// the failure mode where drbdmeta rejects set-gi with +// "The set-gi command requires the --node-id option". func TestAdmSetGiInvokesDrbdmeta(t *testing.T) { fx := storage.NewFakeExec() adm := drbd.NewAdm(fx) - err := adm.SetGi(t.Context(), "pvc-1", 0, "/dev/dm-3", "78A0DDDABCDEF000") + err := adm.SetGi(t.Context(), "pvc-1", 0, "/dev/dm-3", 2, "78A0DDDABCDEF000") if err != nil { t.Fatalf("SetGi: %v", err) } - want := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0" + want := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi --node-id 2 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0" if !slices.Contains(fx.CommandLines(), want) { t.Errorf("expected %q in calls; got %v", want, fx.CommandLines()) } + + // The bare no-node-id form MUST NOT appear — a regression to + // the legacy shape would silently disable the day0 + // skip-initial-sync on DRBD 9.2+ stands. + legacy := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0" + if slices.Contains(fx.CommandLines(), legacy) { + t.Errorf("SetGi emitted legacy no-node-id form: %s", legacy) + } } // TestAdmResizeInvokesDrbdadm: Resize → `drbdadm resize --assume-clean `. diff --git a/pkg/rest/api_call_rc.go b/pkg/rest/api_call_rc.go index 8c7a6d86..5a1c5559 100644 --- a/pkg/rest/api_call_rc.go +++ b/pkg/rest/api_call_rc.go @@ -198,6 +198,26 @@ const apiCallRcFailInUse int64 = 997 // `--force`). const apiCallRcFailInvldVlmSize int64 = 206 +// apiCallRcFailStorPoolConfigurationError mirrors upstream LINSTOR's +// `ApiConsts.FAIL_STOR_POOL_CONFIGURATION_ERROR` (`990 | MASK_ERROR`). +// Sub-code 990 is the closest semantic match in upstream's catalogue +// for "the storage pool's backing configuration is broken on the +// satellite" — exactly the situation that produces PoolMissing=true +// after the satellite's StoragePool reconciler trips on a destroyed +// backing zpool / vg. The MASK_ERROR bit is OR'd in by the +// `apiCallRcError` envelope wrapper at the call site (Bug 83). +const apiCallRcFailStorPoolConfigurationError int64 = 990 + +// apiCallRcMaskStorPool mirrors upstream LINSTOR's +// `ApiConsts.MASK_STOR_POOL` (`0x0000_0000_0014_0000`). OR'd into the +// `ret_code` of any ApiCallRc entry whose primary subject is a +// StoragePool, so audit-log greppers and the Python CLI's reply +// classifier can route the entry to the storage-pool bucket. We tag +// the PoolMissing reports[] entry with this mask so `linstor sp l` +// renders the State column from the structured reply rather than +// reporting "Ok" against an empty reports slice (Bug 83). +const apiCallRcMaskStorPool int64 = 0x0000_0000_0014_0000 + // ObjRefs key constants — the wire-side identifiers upstream LINSTOR // uses to tag ApiCallRc entries with the object(s) the message refers // to. The strings are case-sensitive (the Python CLI matches on the diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index db4a6374..2f0fad00 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -278,24 +278,39 @@ func (r *Reconciler) DeleteSnapshot(ctx context.Context, req *intent.DeleteSnaps return &intent.DeleteSnapshotResponse{Ok: true}, nil } -// DeleteResource tears down a resource: drbdadm down (best-effort — -// the kernel handles a missing one fine), DeleteVolume on every -// requested volume_number through the named Provider, then remove -// the .res file. Idempotent on a missing resource. Per-step errors -// land in the response body so the controller can surface granular -// status without aborting the rest of the cleanup. +// DeleteResource tears down a resource: bounded drbdadm down with a +// kernel-direct `drbdsetup detach`+`drbdsetup down` fallback when the +// happy path is wedged by suspended-quorum (Bug 82), DeleteVolume on +// every requested volume_number through the named Provider, then +// remove the .res file. Idempotent on a missing resource. Per-step +// errors land in the response body so the controller can surface +// granular status without aborting the rest of the cleanup. +// +// Bug 82: the previous `Adm.Down` call had no timeout, so deleting an +// RD whose kernel slot was `suspended:quorum` would block the entire +// handleDelete → satellite finalizer-strip path indefinitely. With +// the satellite finalizer stuck, operators force-stripped it and the +// kernel slot survived as a `/dev/drbd` orphan that prevented +// the next port→minor allocation from creating its metadata. The +// bounded `DownForce` path collapses both failure modes: happy-path +// returns within DownForceTimeout; quorum-stuck path falls back to +// `drbdsetup detach --force` per volume + `drbdsetup down` (neither +// command waits for quorum). Either way the kernel slot is gone +// before we return, and finalizer-strip becomes safe again. func (r *Reconciler) DeleteResource(ctx context.Context, req *intent.DeleteResourceRequest) (*intent.DeleteResourceResponse, error) { var downMsg string if r.cfg.Adm != nil { - err := r.cfg.Adm.Down(ctx, req.GetName()) + err := r.cfg.Adm.DownForce(ctx, req.GetName(), req.GetVolumeNumbers()) if err != nil { - // Best-effort: a "not configured" error is fine here - // (resource was already torn down on a prior pass), but we - // still want to surface the message back to the controller - // so it shows up in the gRPC response. Don't fail — DRBD - // down errors shouldn't block the storage cleanup. - downMsg = "drbdadm down: " + err.Error() + // Best-effort: a "not configured" / "no such resource" + // error here is fine (resource was already torn down on + // a prior pass), but we still want to surface the + // message back to the controller so it shows up in the + // response. Don't fail — even a fully-failed DownForce + // shouldn't block storage cleanup; the storage sweeper + // is the wider-scope backstop. + downMsg = "drbdadm down (force): " + err.Error() } } @@ -1166,23 +1181,30 @@ func (r *Reconciler) seedInitialGi(ctx context.Context, dr *intent.DesiredResour continue } - err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, seed) - // Bug 81: DRBD 9.2+'s drbdmeta requires --node-id on set-gi - // (the GI tuple is per-peer in modern metadata layouts). The - // current SetGi call is the legacy no-node-id form; on - // affected stands it returns - // "The set-gi command requires the --node-id option". Catch - // that specific error and downgrade to a log: skipping the - // day0 GI seed is safe — DRBD just falls through to a full - // initial sync on first connect, the slow-but-correct path. - // Proper fix (per-peer iteration with --node-id) is tracked - // as Bug 81 follow-up. - if err != nil && strings.Contains(err.Error(), "requires the --node-id option") { - err = nil - } + // DRBD 9.2+'s drbdmeta requires --node-id on set-gi — the GI + // tuple lives in per-peer bitmap slots, so we iterate over + // every diskful peer and stamp the same day0/seed GI against + // each. With the deterministic day0 GI (Bug 77), every replica + // computes the same tuple, so every peer's bitmap-vs-our-disk + // reads as "in sync" and DRBD's handshake skips the full + // initial-sync. Skipping a peer here would force a full resync + // against that peer only, which is the safe-but-slow fallback + // — matches LINSTOR's per-peer SetGi loop. + for _, peer := range dr.GetPeers() { + peerIDStr := dr.GetDrbdOptions()["peer."+peer+".node-id"] + if peerIDStr == "" { + continue + } - if err != nil { - return errors.Wrapf(err, "set-gi vol %d", vol.GetVolumeNumber()) + peerID, parseErr := strconv.Atoi(peerIDStr) + if parseErr != nil { + continue + } + + err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, int32(peerID), seed) + if err != nil { + return errors.Wrapf(err, "set-gi vol %d peer %s(%d)", vol.GetVolumeNumber(), peer, peerID) + } } } diff --git a/pkg/satellite/reconciler_drbd_test.go b/pkg/satellite/reconciler_drbd_test.go index 22661e7f..5da90f81 100644 --- a/pkg/satellite/reconciler_drbd_test.go +++ b/pkg/satellite/reconciler_drbd_test.go @@ -1806,7 +1806,15 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) { }, DrbdOptions: map[string]string{ "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + // Bug 81: set-gi is per-peer in DRBD 9.2+, so the seed + // loop only fires when a peer is wired into DrbdOptions. + // Single-replica RDs don't need a seed at all (no peer + // to handshake with), but every multi-replica RD has at + // least one `peer..node-id` entry — mirror that + // shape here so the test exercises the per-peer path. + "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2", }, + Peers: []string{"n2"}, }, }) if err != nil { @@ -1838,7 +1846,10 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) { // Pin the exact GI tuple shape so the seed gets the peer's // current_uuid in BOTH current_uuid and bitmap_uuid slots. - wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0" + // Bug 81: drbdmeta in DRBD 9.2+ requires --node-id, identifying + // which peer-bitmap slot this GI applies to. We stamp peer n2 + // (node-id=1). + wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi --node-id 1 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0" if !slices.Contains(calls, wantSetGi) { t.Errorf("missing exact set-gi command %q in calls: %v", wantSetGi, calls) } @@ -2233,7 +2244,15 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) { }, DrbdOptions: map[string]string{ "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + // Bug 81: per-peer SetGi loop needs a peer wired + // in so the day0 skip-init-sync seed actually + // fires. Production never sees a 1-replica RD + // reach this path (autoplace mandates at least + // 1 peer or tiebreaker); test mirrors a 2-replica + // shape. + "peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2", }, + Peers: []string{"n2"}, }, }) if err != nil { @@ -2269,7 +2288,12 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) { // volume number — keep the expected value in sync with // pkg/satellite/providerkind.go's day0GiFor(). day0 := satellite.Day0GiForTest("pvc-zskip", 0) - wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi %s:%s:0:0", + // Bug 81: per-peer set-gi includes --node-id . + // Day0 is identical across all peers (deterministic from + // RD name + volume), so peer n2 (node-id=1) gets the same + // day0 in BOTH bitmap-uuid slots — exactly what DRBD needs + // for its skip-init-sync handshake. + wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi --node-id 1 %s:%s:0:0", tc.wantDevice, day0, day0) if !slices.Contains(calls, wantSetGi) { t.Errorf("missing exact day0 set-gi command %q in calls: %v", wantSetGi, calls) diff --git a/stand/blockstor-satellite-daemonset.yaml b/stand/blockstor-satellite-daemonset.yaml index 45f6a86d..8d263d58 100644 --- a/stand/blockstor-satellite-daemonset.yaml +++ b/stand/blockstor-satellite-daemonset.yaml @@ -126,15 +126,32 @@ spec: securityContext: privileged: true # PreStop hook: gracefully release DRBD-9 connections - # before SIGTERM. Without this, `kubectl delete pod` - # (no --force) leaves drbd kernel state with stuck - # `Connecting` peers, and the kernel module is - # host-level on the Talos node so it persists across - # pod restarts. `drbdadm down --all` walks /etc/drbd.d - # in order; the per-resource 5s budget keeps a single - # half-hung connection from blocking the whole drain. - # Wrapped in `timeout 25` so the overall hook stays - # under terminationGracePeriodSeconds (30). + # before SIGTERM with a kernel-direct fallback (Bug 82). + # + # Without this, `kubectl delete pod` (no --force) leaves + # drbd kernel state with stuck `Connecting` peers, and the + # kernel module is host-level on the Talos node so it + # persists across pod restarts. + # + # Bug 82: `drbdadm down ` blocks indefinitely on + # `suspended:quorum` slots — a resource mid-quorum-loss + # (controller deleted the RD while a peer was offline) + # will hang `drbdadm down` until a peer comes back. The + # old hook timed out the per-resource call at 5s, leaving + # the kernel slot loaded. Next satellite start sees the + # stale `/dev/drbd` and a recycled port→minor pair + # fails create-md with "Device '' is configured!" + # + # New flow per resource: + # 1. Try `timeout 5 drbdadm down`. Happy path. + # 2. On failure / timeout: force-detach each volume via + # `drbdsetup detach --force /` (no quorum + # wait) so the lower disk is released. + # 3. `drbdsetup down ` kernel-direct slot release + # (also bypasses drbdadm's quorum-aware wait). + # + # Wrapped in `timeout 25` so the overall hook stays under + # terminationGracePeriodSeconds (30). lifecycle: preStop: exec: @@ -144,10 +161,31 @@ spec: - | set +e timeout 25 sh -c ' - for r in $(drbdsetup status --json 2>/dev/null \ - | python3 -c "import json,sys; print(\" \".join(r[\"name\"] for r in json.load(sys.stdin)))" 2>/dev/null); do - timeout 5 drbdadm down "$r" 2>/dev/null || true - done + drbdsetup status --json 2>/dev/null \ + | python3 -c " + import json, sys + try: + rs = json.load(sys.stdin) + except Exception: + sys.exit(0) + for r in rs: + name = r.get(\"name\", \"\") + vols = [str(v.get(\"volume\", 0)) for v in r.get(\"volumes\", [])] + print(name + \"\\t\" + \",\".join(vols)) + " 2>/dev/null \ + | while IFS=$(printf "\t") read -r r vols; do + [ -z "$r" ] && continue + timeout 5 drbdadm down "$r" 2>/dev/null && continue + # Fallback: kernel-direct teardown for + # suspended:quorum-stuck slots. + IFS=, + for v in $vols; do + [ -z "$v" ] && continue + drbdsetup detach --force "$r/$v" 2>/dev/null || true + done + unset IFS + drbdsetup down "$r" 2>/dev/null || true + done ' volumeMounts: - {name: dev, mountPath: /dev}