Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 91 additions & 3 deletions pkg/drbd/drbdadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,24 @@ package drbd
import (
"context"
"fmt"
"strconv"
"strings"
"time"

"github.com/cockroachdb/errors"

"github.com/cozystack/blockstor/pkg/storage"
)

// DownForceTimeout caps how long DownForce waits for the happy-path
// `drbdadm down <res>` before falling back to the kernel-direct
// teardown sequence. 15s is the budget the satellite daemonset's
// terminationGracePeriod (30s) and per-resource preStop budget can
// both afford — long enough for a healthy peer to ack the
// disconnect, short enough that a suspended-quorum slot can't stall
// the deletion path indefinitely (Bug 82).
const DownForceTimeout = 15 * time.Second

// Adm is a thin wrapper around the `drbdadm` CLI. It exists so the
// satellite reconciler can be unit-tested without a real DRBD kernel
// module present: production injects storage.RealExec, tests inject
Expand All @@ -51,6 +62,75 @@ func (a *Adm) Down(ctx context.Context, resource string) error {
return a.run(ctx, "down", resource)
}

// DownForce tears the resource down with a bounded timeout, falling
// back to the kernel-direct sequence when the happy-path
// `drbdadm down <res>` doesn't return in time (Bug 82).
//
// Why the happy-path can hang: when DRBD loses quorum
// (`suspended:quorum`) mid-delete, `drbdadm down` blocks waiting
// for the resource to leave the suspended state, which only
// resolves if a peer comes back. Operators see this as
// `kubectl delete pod` hanging Terminating for many minutes and
// only unblocking under `--force`, leaving a stale
// `/dev/drbd<minor>` slot that breaks the next resource allocated
// the same port→minor pair (Bug 82 dev-stand observation).
//
// Flow:
//
// 1. Run `drbdadm down <res>` under a child context with deadline =
// DownForceTimeout. If it returns within the budget (success or
// a non-hang error), we're done — the kernel slot is gone.
// 2. On timeout / error: for each volume, force-detach the lower
// disk via `drbdsetup detach --force <res>/<vol>`. detach does
// NOT wait for quorum, so this returns even when the resource
// is suspended.
// 3. Finalise with `drbdsetup down <res>` (kernel-direct slot
// release, bypasses drbdadm's quorum-aware wait). Returns nil
// on a "missing slot" exit — the goal state (slot absent) is
// the same.
//
// volumeNumbers is the set of DRBD volume slots to force-detach;
// callers source it from VolumeDefinitions. An empty slice skips
// the per-volume detach and relies on `drbdsetup down` alone —
// belt-and-braces for kernels where `down` won't proceed past a
// transient disk-attached state.
//
// Returns nil on success. Returns a wrapped error only when BOTH
// the bounded `drbdadm down` AND the kernel-direct `drbdsetup
// down` failed.
func (a *Adm) DownForce(ctx context.Context, resource string, volumeNumbers []int32) error {
downCtx, cancel := context.WithTimeout(ctx, DownForceTimeout)
defer cancel()

err := a.run(downCtx, "down", resource)
if err == nil {
return nil
}

// Fallback path: force-detach every volume (no quorum wait),
// then kernel-direct `drbdsetup down`. Use the parent ctx so a
// tighter caller deadline still applies.
for _, vol := range volumeNumbers {
target := fmt.Sprintf("%s/%d", resource, vol)
_, _ = a.exec.Run(ctx, "drbdsetup", "detach", "--force", target)
}

_, downErr := a.exec.Run(ctx, "drbdsetup", "down", resource)
if downErr == nil {
return nil
}

// `drbdsetup down` on a missing slot exits non-zero with
// "No such resource" / "Unknown resource" — same goal state.
if strings.Contains(downErr.Error(), "No such resource") ||
strings.Contains(downErr.Error(), "No currently configured DRBD") ||
strings.Contains(downErr.Error(), "Unknown resource") {
return nil
}
Comment on lines +125 to +129
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The condition to check for expected errors from drbdsetup down is a bit long and relies on hardcoded strings. This is fragile and can be hard to maintain if the error messages from drbdsetup change in the future. To improve readability and maintainability, you could iterate over a slice of expected error substrings. This makes it easier to add or remove expected error messages in the future.

	for _, msg := range []string{
		"No such resource",
		"No currently configured DRBD",
		"Unknown resource",
	} {
		if strings.Contains(downErr.Error(), msg) {
			return nil
		}
	}


return errors.Wrapf(downErr, "drbdsetup down %s after drbdadm down failed (%v)", resource, err)
}

// Adjust reconciles kernel state to the on-disk .res file. Called after
// the ConfFileBuilder writes a new file and we need DRBD to pick up
// changes (added/removed peers, new options).
Expand Down Expand Up @@ -181,16 +261,24 @@ func (a *Adm) Resize(ctx context.Context, resource string) error {
// to the peer". History is zeroed — DRBD's handshake never matches
// against history when current+bitmap match, so it doesn't matter.
//
// DRBD 9.2+ requires `--node-id <peer>`: the bitmap-uuid slot is
// per-peer in the modern metadata layout, so the caller must invoke
// SetGi once per peer node-id with the same tuple to stamp every
// peer's bitmap slot. Without --node-id, drbdmeta refuses with
// "The set-gi command requires the --node-id option".
//
// Tested via FakeExec capture in pkg/drbd/drbdadm_test.go and
// pinned end-to-end in pkg/satellite/reconciler_drbd_test.go's
// first-activation case.
func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device, peerCurrentGi string) error {
func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device string, peerNodeID int32, peerCurrentGi string) error {
target := fmt.Sprintf("%s/%d", resource, volume)
gi := fmt.Sprintf("%s:%s:0:0", peerCurrentGi, peerCurrentGi)

_, err := a.exec.Run(ctx, "drbdmeta", "--force", target, "v09", device, "internal", "set-gi", gi)
_, err := a.exec.Run(ctx,
"drbdmeta", "--force", target, "v09", device, "internal",
"set-gi", "--node-id", strconv.Itoa(int(peerNodeID)), gi)
if err != nil {
return errors.Wrapf(err, "drbdmeta set-gi %s", target)
return errors.Wrapf(err, "drbdmeta set-gi %s --node-id %d", target, peerNodeID)
}

return nil
Expand Down
19 changes: 16 additions & 3 deletions pkg/drbd/drbdadm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,27 +190,40 @@ func TestAdmDetachInvokesDrbdadm(t *testing.T) {

// TestAdmSetGiInvokesDrbdmeta pins the initial-sync skip seeding
// command shape: `drbdmeta --force <res>/<vol> v09 <device>
// internal set-gi <peer_gi>:<peer_gi>:0:0`. Phase 8.1.
// internal set-gi --node-id <peer> <peer_gi>:<peer_gi>:0:0`.
//
// The format MUST be peer-gi twice (current_uuid + bitmap_uuid both
// match the peer's current_uuid), then two zero history slots — a
// regression that emits just the bare GI or that swaps current/
// bitmap order would silently break the GI-handshake match and
// re-introduce the full initial-sync this whole pipeline exists to
// avoid.
//
// DRBD 9.2+ requires `--node-id <peer>` since the bitmap-uuid slot
// is per-peer in modern metadata; dropping it would re-introduce
// the failure mode where drbdmeta rejects set-gi with
// "The set-gi command requires the --node-id option".
func TestAdmSetGiInvokesDrbdmeta(t *testing.T) {
	fake := storage.NewFakeExec()
	adm := drbd.NewAdm(fake)

	if err := adm.SetGi(t.Context(), "pvc-1", 0, "/dev/dm-3", 2, "78A0DDDABCDEF000"); err != nil {
		t.Fatalf("SetGi: %v", err)
	}

	// Capture the recorded command lines once and assert against
	// the snapshot for both the positive and negative checks.
	lines := fake.CommandLines()

	// Positive pin: the modern per-peer form with --node-id and the
	// peer-gi duplicated into current_uuid + bitmap_uuid, followed
	// by two zeroed history slots.
	want := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi --node-id 2 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
	if !slices.Contains(lines, want) {
		t.Errorf("expected %q in calls; got %v", want, lines)
	}

	// Negative pin: the bare no-node-id form MUST NOT appear — a
	// regression to the legacy shape would silently disable the
	// day0 skip-initial-sync on DRBD 9.2+ stands.
	legacy := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
	if slices.Contains(lines, legacy) {
		t.Errorf("SetGi emitted legacy no-node-id form: %s", legacy)
	}
}

// TestAdmResizeInvokesDrbdadm: Resize → `drbdadm resize --assume-clean <res>`.
Expand Down
20 changes: 20 additions & 0 deletions pkg/rest/api_call_rc.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,26 @@ const apiCallRcFailInUse int64 = 997
// `--force`).
const apiCallRcFailInvldVlmSize int64 = 206

// apiCallRcFailStorPoolConfigurationError mirrors upstream LINSTOR's
// `ApiConsts.FAIL_STOR_POOL_CONFIGURATION_ERROR` (`990 | MASK_ERROR`).
// Sub-code 990 is the closest semantic match in upstream's catalogue
// for "the storage pool's backing configuration is broken on the
// satellite" — exactly the situation that produces PoolMissing=true
// after the satellite's StoragePool reconciler trips on a destroyed
// backing zpool / vg. The MASK_ERROR bit is OR'd in by the
// `apiCallRcError` envelope wrapper at the call site (Bug 83).
const apiCallRcFailStorPoolConfigurationError int64 = 990

// apiCallRcMaskStorPool mirrors upstream LINSTOR's
// `ApiConsts.MASK_STOR_POOL` (`0x0000_0000_0014_0000`). OR'd into the
// `ret_code` of any ApiCallRc entry whose primary subject is a
// StoragePool, so audit-log greppers and the Python CLI's reply
// classifier can route the entry to the storage-pool bucket. We tag
// the PoolMissing reports[] entry with this mask so `linstor sp l`
// renders the State column from the structured reply rather than
// reporting "Ok" against an empty reports slice (Bug 83).
const apiCallRcMaskStorPool int64 = 0x0000_0000_0014_0000

// ObjRefs key constants — the wire-side identifiers upstream LINSTOR
// uses to tag ApiCallRc entries with the object(s) the message refers
// to. The strings are case-sensitive (the Python CLI matches on the
Expand Down
80 changes: 51 additions & 29 deletions pkg/satellite/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,24 +278,39 @@ func (r *Reconciler) DeleteSnapshot(ctx context.Context, req *intent.DeleteSnaps
return &intent.DeleteSnapshotResponse{Ok: true}, nil
}

// DeleteResource tears down a resource: drbdadm down (best-effort —
// the kernel handles a missing one fine), DeleteVolume on every
// requested volume_number through the named Provider, then remove
// the .res file. Idempotent on a missing resource. Per-step errors
// land in the response body so the controller can surface granular
// status without aborting the rest of the cleanup.
// DeleteResource tears down a resource: bounded drbdadm down with a
// kernel-direct `drbdsetup detach`+`drbdsetup down` fallback when the
// happy path is wedged by suspended-quorum (Bug 82), DeleteVolume on
// every requested volume_number through the named Provider, then
// remove the .res file. Idempotent on a missing resource. Per-step
// errors land in the response body so the controller can surface
// granular status without aborting the rest of the cleanup.
//
// Bug 82: the previous `Adm.Down` call had no timeout, so deleting an
// RD whose kernel slot was `suspended:quorum` would block the entire
// handleDelete → satellite finalizer-strip path indefinitely. With
// the satellite finalizer stuck, operators force-stripped it and the
// kernel slot survived as a `/dev/drbd<minor>` orphan that prevented
// the next port→minor allocation from creating its metadata. The
// bounded `DownForce` path collapses both failure modes: happy-path
// returns within DownForceTimeout; quorum-stuck path falls back to
// `drbdsetup detach --force` per volume + `drbdsetup down` (neither
// command waits for quorum). Either way the kernel slot is gone
// before we return, and finalizer-strip becomes safe again.
func (r *Reconciler) DeleteResource(ctx context.Context, req *intent.DeleteResourceRequest) (*intent.DeleteResourceResponse, error) {
var downMsg string

if r.cfg.Adm != nil {
err := r.cfg.Adm.Down(ctx, req.GetName())
err := r.cfg.Adm.DownForce(ctx, req.GetName(), req.GetVolumeNumbers())
if err != nil {
// Best-effort: a "not configured" error is fine here
// (resource was already torn down on a prior pass), but we
// still want to surface the message back to the controller
// so it shows up in the gRPC response. Don't fail — DRBD
// down errors shouldn't block the storage cleanup.
downMsg = "drbdadm down: " + err.Error()
// Best-effort: a "not configured" / "no such resource"
// error here is fine (resource was already torn down on
// a prior pass), but we still want to surface the
// message back to the controller so it shows up in the
// response. Don't fail — even a fully-failed DownForce
// shouldn't block storage cleanup; the storage sweeper
// is the wider-scope backstop.
downMsg = "drbdadm down (force): " + err.Error()
}
}

Expand Down Expand Up @@ -1166,23 +1181,30 @@ func (r *Reconciler) seedInitialGi(ctx context.Context, dr *intent.DesiredResour
continue
}

err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, seed)
// Bug 81: DRBD 9.2+'s drbdmeta requires --node-id on set-gi
// (the GI tuple is per-peer in modern metadata layouts). The
// current SetGi call is the legacy no-node-id form; on
// affected stands it returns
// "The set-gi command requires the --node-id option". Catch
// that specific error and downgrade to a log: skipping the
// day0 GI seed is safe — DRBD just falls through to a full
// initial sync on first connect, the slow-but-correct path.
// Proper fix (per-peer iteration with --node-id) is tracked
// as Bug 81 follow-up.
if err != nil && strings.Contains(err.Error(), "requires the --node-id option") {
err = nil
}
// DRBD 9.2+'s drbdmeta requires --node-id on set-gi — the GI
// tuple lives in per-peer bitmap slots, so we iterate over
// every diskful peer and stamp the same day0/seed GI against
// each. With the deterministic day0 GI (Bug 77), every replica
// computes the same tuple, so every peer's bitmap-vs-our-disk
// reads as "in sync" and DRBD's handshake skips the full
// initial-sync. Skipping a peer here would force a full resync
// against that peer only, which is the safe-but-slow fallback
// — matches LINSTOR's per-peer SetGi loop.
for _, peer := range dr.GetPeers() {
peerIDStr := dr.GetDrbdOptions()["peer."+peer+".node-id"]
if peerIDStr == "" {
continue
}

if err != nil {
return errors.Wrapf(err, "set-gi vol %d", vol.GetVolumeNumber())
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
continue
}
Comment on lines +1199 to +1202
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The code silently ignores an error when parsing peerIDStr using strconv.Atoi. If peer.node-id from DrbdOptions is present but not a valid integer, it's a configuration error that should be surfaced. Silently continuing could lead to a peer being skipped for GI seeding without any warning, causing an unexpected full resync for that peer. This contradicts the PR's goal of making failures "real errors". I suggest returning the parsing error to make misconfigurations visible.

Suggested change
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
continue
}
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
return errors.Wrapf(parseErr, "invalid node-id for peer %s: %q", peer, peerIDStr)
}


err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, int32(peerID), seed)
if err != nil {
return errors.Wrapf(err, "set-gi vol %d peer %s(%d)", vol.GetVolumeNumber(), peer, peerID)
}
}
}

Expand Down
28 changes: 26 additions & 2 deletions pkg/satellite/reconciler_drbd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1806,7 +1806,15 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) {
},
DrbdOptions: map[string]string{
"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000",
// Bug 81: set-gi is per-peer in DRBD 9.2+, so the seed
// loop only fires when a peer is wired into DrbdOptions.
// Single-replica RDs don't need a seed at all (no peer
// to handshake with), but every multi-replica RD has at
// least one `peer.<name>.node-id` entry — mirror that
// shape here so the test exercises the per-peer path.
"peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
},
Peers: []string{"n2"},
},
})
if err != nil {
Expand Down Expand Up @@ -1838,7 +1846,10 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) {

// Pin the exact GI tuple shape so the seed gets the peer's
// current_uuid in BOTH current_uuid and bitmap_uuid slots.
wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
// Bug 81: drbdmeta in DRBD 9.2+ requires --node-id, identifying
// which peer-bitmap slot this GI applies to. We stamp peer n2
// (node-id=1).
wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi --node-id 1 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
if !slices.Contains(calls, wantSetGi) {
t.Errorf("missing exact set-gi command %q in calls: %v", wantSetGi, calls)
}
Expand Down Expand Up @@ -2233,7 +2244,15 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) {
},
DrbdOptions: map[string]string{
"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000",
// Bug 81: per-peer SetGi loop needs a peer wired
// in so the day0 skip-init-sync seed actually
// fires. Production never sees a 1-replica RD
// reach this path (autoplace mandates at least
// 1 peer or tiebreaker); test mirrors a 2-replica
// shape.
"peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
},
Peers: []string{"n2"},
},
})
if err != nil {
Expand Down Expand Up @@ -2269,7 +2288,12 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) {
// volume number — keep the expected value in sync with
// pkg/satellite/providerkind.go's day0GiFor().
day0 := satellite.Day0GiForTest("pvc-zskip", 0)
wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi %s:%s:0:0",
// Bug 81: per-peer set-gi includes --node-id <peer>.
// Day0 is identical across all peers (deterministic from
// RD name + volume), so peer n2 (node-id=1) gets the same
// day0 in BOTH bitmap-uuid slots — exactly what DRBD needs
// for its skip-init-sync handshake.
wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi --node-id 1 %s:%s:0:0",
tc.wantDevice, day0, day0)
if !slices.Contains(calls, wantSetGi) {
t.Errorf("missing exact day0 set-gi command %q in calls: %v", wantSetGi, calls)
Expand Down
Loading
Loading