Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 91 additions & 3 deletions pkg/drbd/drbdadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,24 @@ package drbd
import (
"context"
"fmt"
"strconv"
"strings"
"time"

"github.com/cockroachdb/errors"

"github.com/cozystack/blockstor/pkg/storage"
)

// DownForceTimeout caps how long DownForce waits for the happy-path
// `drbdadm down <res>` before falling back to the kernel-direct
// teardown sequence. 15s is the budget the satellite daemonset's
// terminationGracePeriod (30s) and per-resource preStop budget can
// both afford — long enough for a healthy peer to ack the
// disconnect, short enough that a suspended-quorum slot can't stall
// the deletion path indefinitely (Bug 82).
const DownForceTimeout = 15 * time.Second

// Adm is a thin wrapper around the `drbdadm` CLI. It exists so the
// satellite reconciler can be unit-tested without a real DRBD kernel
// module present: production injects storage.RealExec, tests inject
Expand All @@ -51,6 +62,75 @@ func (a *Adm) Down(ctx context.Context, resource string) error {
return a.run(ctx, "down", resource)
}

// DownForce tears the resource down with a bounded timeout, falling
// back to the kernel-direct sequence when the happy-path
// `drbdadm down <res>` doesn't return in time (Bug 82).
//
// Why the happy-path can hang: when DRBD loses quorum
// (`suspended:quorum`) mid-delete, `drbdadm down` blocks waiting
// for the resource to leave the suspended state, which only
// resolves if a peer comes back. Operators see this as
// `kubectl delete pod` hanging Terminating for many minutes and
// only unblocking under `--force`, leaving a stale
// `/dev/drbd<minor>` slot that breaks the next resource allocated
// the same port→minor pair (Bug 82 dev-stand observation).
//
// Flow:
//
// 1. Run `drbdadm down <res>` under a child context with deadline =
// DownForceTimeout. If it returns within the budget (success or
// a non-hang error), we're done — the kernel slot is gone.
// 2. On timeout / error: for each volume, force-detach the lower
// disk via `drbdsetup detach --force <res>/<vol>`. detach does
// NOT wait for quorum, so this returns even when the resource
// is suspended.
// 3. Finalise with `drbdsetup down <res>` (kernel-direct slot
// release, bypasses drbdadm's quorum-aware wait). Returns nil
// on a "missing slot" exit — the goal state (slot absent) is
// the same.
//
// volumeNumbers is the set of DRBD volume slots to force-detach;
// callers source it from VolumeDefinitions. An empty slice skips
// the per-volume detach and relies on `drbdsetup down` alone —
// belt-and-braces for kernels where `down` won't proceed past a
// transient disk-attached state.
//
// Returns nil on success. Returns a wrapped error only when BOTH
// the bounded `drbdadm down` AND the kernel-direct `drbdsetup
// down` failed.
func (a *Adm) DownForce(ctx context.Context, resource string, volumeNumbers []int32) error {
downCtx, cancel := context.WithTimeout(ctx, DownForceTimeout)
defer cancel()

err := a.run(downCtx, "down", resource)
if err == nil {
return nil
}

// Fallback path: force-detach every volume (no quorum wait),
// then kernel-direct `drbdsetup down`. Use the parent ctx so a
// tighter caller deadline still applies.
for _, vol := range volumeNumbers {
target := fmt.Sprintf("%s/%d", resource, vol)
_, _ = a.exec.Run(ctx, "drbdsetup", "detach", "--force", target)
}

_, downErr := a.exec.Run(ctx, "drbdsetup", "down", resource)
if downErr == nil {
return nil
}

// `drbdsetup down` on a missing slot exits non-zero with
// "No such resource" / "Unknown resource" — same goal state.
if strings.Contains(downErr.Error(), "No such resource") ||
strings.Contains(downErr.Error(), "No currently configured DRBD") ||
strings.Contains(downErr.Error(), "Unknown resource") {
return nil
}
Comment on lines +125 to +129
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The condition to check for expected errors from drbdsetup down is a bit long and relies on hardcoded strings. This is fragile and can be hard to maintain if the error messages from drbdsetup change in the future. To improve readability and maintainability, you could iterate over a slice of expected error substrings. This makes it easier to add or remove expected error messages in the future.

	for _, msg := range []string{
		"No such resource",
		"No currently configured DRBD",
		"Unknown resource",
	} {
		if strings.Contains(downErr.Error(), msg) {
			return nil
		}
	}


return errors.Wrapf(downErr, "drbdsetup down %s after drbdadm down failed (%v)", resource, err)
}

// Adjust reconciles kernel state to the on-disk .res file. Called after
// the ConfFileBuilder writes a new file and we need DRBD to pick up
// changes (added/removed peers, new options).
Expand Down Expand Up @@ -181,16 +261,24 @@ func (a *Adm) Resize(ctx context.Context, resource string) error {
// to the peer". History is zeroed — DRBD's handshake never matches
// against history when current+bitmap match, so it doesn't matter.
//
// DRBD 9.2+ requires `--node-id <peer>`: the bitmap-uuid slot is
// per-peer in the modern metadata layout, so the caller must invoke
// SetGi once per peer node-id with the same tuple to stamp every
// peer's bitmap slot. Without --node-id, drbdmeta refuses with
// "The set-gi command requires the --node-id option".
//
// Tested via FakeExec capture in pkg/drbd/drbdadm_test.go and
// pinned end-to-end in pkg/satellite/reconciler_drbd_test.go's
// first-activation case.
func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device, peerCurrentGi string) error {
func (a *Adm) SetGi(ctx context.Context, resource string, volume int32, device string, peerNodeID int32, peerCurrentGi string) error {
target := fmt.Sprintf("%s/%d", resource, volume)
gi := fmt.Sprintf("%s:%s:0:0", peerCurrentGi, peerCurrentGi)

_, err := a.exec.Run(ctx, "drbdmeta", "--force", target, "v09", device, "internal", "set-gi", gi)
_, err := a.exec.Run(ctx,
"drbdmeta", "--force", target, "v09", device, "internal",
"set-gi", "--node-id", strconv.Itoa(int(peerNodeID)), gi)
if err != nil {
return errors.Wrapf(err, "drbdmeta set-gi %s", target)
return errors.Wrapf(err, "drbdmeta set-gi %s --node-id %d", target, peerNodeID)
}

return nil
Expand Down
19 changes: 16 additions & 3 deletions pkg/drbd/drbdadm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,27 +190,40 @@ func TestAdmDetachInvokesDrbdadm(t *testing.T) {

// TestAdmSetGiInvokesDrbdmeta pins the initial-sync skip seeding
// command shape: `drbdmeta --force <res>/<vol> v09 <device>
// internal set-gi <peer_gi>:<peer_gi>:0:0`. Phase 8.1.
// internal set-gi --node-id <peer> <peer_gi>:<peer_gi>:0:0`.
//
// The format MUST be peer-gi twice (current_uuid + bitmap_uuid both
// match the peer's current_uuid), then two zero history slots — a
// regression that emits just the bare GI or that swaps current/
// bitmap order would silently break the GI-handshake match and
// re-introduce the full initial-sync this whole pipeline exists to
// avoid.
//
// DRBD 9.2+ requires `--node-id <peer>` since the bitmap-uuid slot
// is per-peer in modern metadata; dropping it would re-introduce
// the failure mode where drbdmeta rejects set-gi with
// "The set-gi command requires the --node-id option".
func TestAdmSetGiInvokesDrbdmeta(t *testing.T) {
	fake := storage.NewFakeExec()
	adm := drbd.NewAdm(fake)

	if err := adm.SetGi(t.Context(), "pvc-1", 0, "/dev/dm-3", 2, "78A0DDDABCDEF000"); err != nil {
		t.Fatalf("SetGi: %v", err)
	}

	// Capture the recorded command lines once and assert against
	// the snapshot for both the positive and negative checks.
	lines := fake.CommandLines()

	// Positive pin: the modern per-peer form with --node-id and the
	// peer-gi duplicated into current_uuid + bitmap_uuid, followed
	// by two zeroed history slots.
	want := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi --node-id 2 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
	if !slices.Contains(lines, want) {
		t.Errorf("expected %q in calls; got %v", want, lines)
	}

	// Negative pin: the bare no-node-id form MUST NOT appear — a
	// regression to the legacy shape would silently disable the
	// day0 skip-initial-sync on DRBD 9.2+ stands.
	legacy := "drbdmeta --force pvc-1/0 v09 /dev/dm-3 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
	if slices.Contains(lines, legacy) {
		t.Errorf("SetGi emitted legacy no-node-id form: %s", legacy)
	}
}

// TestAdmResizeInvokesDrbdadm: Resize → `drbdadm resize --assume-clean <res>`.
Expand Down
20 changes: 20 additions & 0 deletions pkg/rest/api_call_rc.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,26 @@ const apiCallRcFailInUse int64 = 997
// `--force`).
const apiCallRcFailInvldVlmSize int64 = 206

// apiCallRcFailStorPoolConfigurationError mirrors upstream LINSTOR's
// `ApiConsts.FAIL_STOR_POOL_CONFIGURATION_ERROR` (`990 | MASK_ERROR`).
// Sub-code 990 is the closest semantic match in upstream's catalogue
// for "the storage pool's backing configuration is broken on the
// satellite" — exactly the situation that produces PoolMissing=true
// after the satellite's StoragePool reconciler trips on a destroyed
// backing zpool / vg. The MASK_ERROR bit is OR'd in by the
// `apiCallRcError` envelope wrapper at the call site (Bug 83).
const apiCallRcFailStorPoolConfigurationError int64 = 990

// apiCallRcMaskStorPool mirrors upstream LINSTOR's
// `ApiConsts.MASK_STOR_POOL` (`0x0000_0000_0014_0000`). OR'd into the
// `ret_code` of any ApiCallRc entry whose primary subject is a
// StoragePool, so audit-log greppers and the Python CLI's reply
// classifier can route the entry to the storage-pool bucket. We tag
// the PoolMissing reports[] entry with this mask so `linstor sp l`
// renders the State column from the structured reply rather than
// reporting "Ok" against an empty reports slice (Bug 83).
const apiCallRcMaskStorPool int64 = 0x0000_0000_0014_0000

// ObjRefs key constants — the wire-side identifiers upstream LINSTOR
// uses to tag ApiCallRc entries with the object(s) the message refers
// to. The strings are case-sensitive (the Python CLI matches on the
Expand Down
80 changes: 51 additions & 29 deletions pkg/satellite/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,24 +278,39 @@ func (r *Reconciler) DeleteSnapshot(ctx context.Context, req *intent.DeleteSnaps
return &intent.DeleteSnapshotResponse{Ok: true}, nil
}

// DeleteResource tears down a resource: drbdadm down (best-effort —
// the kernel handles a missing one fine), DeleteVolume on every
// requested volume_number through the named Provider, then remove
// the .res file. Idempotent on a missing resource. Per-step errors
// land in the response body so the controller can surface granular
// status without aborting the rest of the cleanup.
// DeleteResource tears down a resource: bounded drbdadm down with a
// kernel-direct `drbdsetup detach`+`drbdsetup down` fallback when the
// happy path is wedged by suspended-quorum (Bug 82), DeleteVolume on
// every requested volume_number through the named Provider, then
// remove the .res file. Idempotent on a missing resource. Per-step
// errors land in the response body so the controller can surface
// granular status without aborting the rest of the cleanup.
//
// Bug 82: the previous `Adm.Down` call had no timeout, so deleting an
// RD whose kernel slot was `suspended:quorum` would block the entire
// handleDelete → satellite finalizer-strip path indefinitely. With
// the satellite finalizer stuck, operators force-stripped it and the
// kernel slot survived as a `/dev/drbd<minor>` orphan that prevented
// the next port→minor allocation from creating its metadata. The
// bounded `DownForce` path collapses both failure modes: happy-path
// returns within DownForceTimeout; quorum-stuck path falls back to
// `drbdsetup detach --force` per volume + `drbdsetup down` (neither
// command waits for quorum). Either way the kernel slot is gone
// before we return, and finalizer-strip becomes safe again.
func (r *Reconciler) DeleteResource(ctx context.Context, req *intent.DeleteResourceRequest) (*intent.DeleteResourceResponse, error) {
var downMsg string

if r.cfg.Adm != nil {
err := r.cfg.Adm.Down(ctx, req.GetName())
err := r.cfg.Adm.DownForce(ctx, req.GetName(), req.GetVolumeNumbers())
if err != nil {
// Best-effort: a "not configured" error is fine here
// (resource was already torn down on a prior pass), but we
// still want to surface the message back to the controller
// so it shows up in the gRPC response. Don't fail — DRBD
// down errors shouldn't block the storage cleanup.
downMsg = "drbdadm down: " + err.Error()
// Best-effort: a "not configured" / "no such resource"
// error here is fine (resource was already torn down on
// a prior pass), but we still want to surface the
// message back to the controller so it shows up in the
// response. Don't fail — even a fully-failed DownForce
// shouldn't block storage cleanup; the storage sweeper
// is the wider-scope backstop.
downMsg = "drbdadm down (force): " + err.Error()
}
}

Expand Down Expand Up @@ -1166,23 +1181,30 @@ func (r *Reconciler) seedInitialGi(ctx context.Context, dr *intent.DesiredResour
continue
}

err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, seed)
// Bug 81: DRBD 9.2+'s drbdmeta requires --node-id on set-gi
// (the GI tuple is per-peer in modern metadata layouts). The
// current SetGi call is the legacy no-node-id form; on
// affected stands it returns
// "The set-gi command requires the --node-id option". Catch
// that specific error and downgrade to a log: skipping the
// day0 GI seed is safe — DRBD just falls through to a full
// initial sync on first connect, the slow-but-correct path.
// Proper fix (per-peer iteration with --node-id) is tracked
// as Bug 81 follow-up.
if err != nil && strings.Contains(err.Error(), "requires the --node-id option") {
err = nil
}
// DRBD 9.2+'s drbdmeta requires --node-id on set-gi — the GI
// tuple lives in per-peer bitmap slots, so we iterate over
// every diskful peer and stamp the same day0/seed GI against
// each. With the deterministic day0 GI (Bug 77), every replica
// computes the same tuple, so every peer's bitmap-vs-our-disk
// reads as "in sync" and DRBD's handshake skips the full
// initial-sync. Skipping a peer here would force a full resync
// against that peer only, which is the safe-but-slow fallback
// — matches LINSTOR's per-peer SetGi loop.
for _, peer := range dr.GetPeers() {
peerIDStr := dr.GetDrbdOptions()["peer."+peer+".node-id"]
if peerIDStr == "" {
continue
}

if err != nil {
return errors.Wrapf(err, "set-gi vol %d", vol.GetVolumeNumber())
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
continue
}
Comment on lines +1199 to +1202
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The code silently ignores an error when parsing peerIDStr using strconv.Atoi. If peer.node-id from DrbdOptions is present but not a valid integer, it's a configuration error that should be surfaced. Silently continuing could lead to a peer being skipped for GI seeding without any warning, causing an unexpected full resync for that peer. This contradicts the PR's goal of making failures "real errors". I suggest returning the parsing error to make misconfigurations visible.

Suggested change
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
continue
}
peerID, parseErr := strconv.Atoi(peerIDStr)
if parseErr != nil {
return errors.Wrapf(parseErr, "invalid node-id for peer %s: %q", peer, peerIDStr)
}


err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, int32(peerID), seed)
if err != nil {
return errors.Wrapf(err, "set-gi vol %d peer %s(%d)", vol.GetVolumeNumber(), peer, peerID)
}
}
}

Expand Down
28 changes: 26 additions & 2 deletions pkg/satellite/reconciler_drbd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1806,7 +1806,15 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) {
},
DrbdOptions: map[string]string{
"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000",
// Bug 81: set-gi is per-peer in DRBD 9.2+, so the seed
// loop only fires when a peer is wired into DrbdOptions.
// Single-replica RDs don't need a seed at all (no peer
// to handshake with), but every multi-replica RD has at
// least one `peer.<name>.node-id` entry — mirror that
// shape here so the test exercises the per-peer path.
"peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
},
Peers: []string{"n2"},
},
})
if err != nil {
Expand Down Expand Up @@ -1838,7 +1846,10 @@ func TestApplyFirstActivationSeedsGiBeforeAdjust(t *testing.T) {

// Pin the exact GI tuple shape so the seed gets the peer's
// current_uuid in BOTH current_uuid and bitmap_uuid slots.
wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
// Bug 81: drbdmeta in DRBD 9.2+ requires --node-id, identifying
// which peer-bitmap slot this GI applies to. We stamp peer n2
// (node-id=1).
wantSetGi := "drbdmeta --force pvc-seed/0 v09 /dev/vg/pvc-seed_00000 internal set-gi --node-id 1 78A0DDDABCDEF000:78A0DDDABCDEF000:0:0"
if !slices.Contains(calls, wantSetGi) {
t.Errorf("missing exact set-gi command %q in calls: %v", wantSetGi, calls)
}
Expand Down Expand Up @@ -2233,7 +2244,15 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) {
},
DrbdOptions: map[string]string{
"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000",
// Bug 81: per-peer SetGi loop needs a peer wired
// in so the day0 skip-init-sync seed actually
// fires. Production never sees a 1-replica RD
// reach this path (autoplace mandates at least
// 1 peer or tiebreaker); test mirrors a 2-replica
// shape.
"peer.n2.node-id": "1", "peer.n2.address": "10.0.0.2",
},
Peers: []string{"n2"},
},
})
if err != nil {
Expand Down Expand Up @@ -2269,7 +2288,12 @@ func TestApplyFirstActivationSkipsInitialSyncOnThinOrZFS(t *testing.T) {
// volume number — keep the expected value in sync with
// pkg/satellite/providerkind.go's day0GiFor().
day0 := satellite.Day0GiForTest("pvc-zskip", 0)
wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi %s:%s:0:0",
// Bug 81: per-peer set-gi includes --node-id <peer>.
// Day0 is identical across all peers (deterministic from
// RD name + volume), so peer n2 (node-id=1) gets the same
// day0 in BOTH bitmap-uuid slots — exactly what DRBD needs
// for its skip-init-sync handshake.
wantSetGi := fmt.Sprintf("drbdmeta --force pvc-zskip/0 v09 %s internal set-gi --node-id 1 %s:%s:0:0",
tc.wantDevice, day0, day0)
if !slices.Contains(calls, wantSetGi) {
t.Errorf("missing exact day0 set-gi command %q in calls: %v", wantSetGi, calls)
Expand Down
Loading
Loading