-
Notifications
You must be signed in to change notification settings - Fork 0
fix(satellite+drbd): per-peer SetGi --node-id + bounded DRBD teardown (Bug 81 + Bug 82) #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -278,24 +278,39 @@ func (r *Reconciler) DeleteSnapshot(ctx context.Context, req *intent.DeleteSnaps | |||||||||||||||||
| return &intent.DeleteSnapshotResponse{Ok: true}, nil | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| // DeleteResource tears down a resource: drbdadm down (best-effort — | ||||||||||||||||||
| // the kernel handles a missing one fine), DeleteVolume on every | ||||||||||||||||||
| // requested volume_number through the named Provider, then remove | ||||||||||||||||||
| // the .res file. Idempotent on a missing resource. Per-step errors | ||||||||||||||||||
| // land in the response body so the controller can surface granular | ||||||||||||||||||
| // status without aborting the rest of the cleanup. | ||||||||||||||||||
| // DeleteResource tears down a resource: bounded drbdadm down with a | ||||||||||||||||||
| // kernel-direct `drbdsetup detach`+`drbdsetup down` fallback when the | ||||||||||||||||||
| // happy path is wedged by suspended-quorum (Bug 82), DeleteVolume on | ||||||||||||||||||
| // every requested volume_number through the named Provider, then | ||||||||||||||||||
| // remove the .res file. Idempotent on a missing resource. Per-step | ||||||||||||||||||
| // errors land in the response body so the controller can surface | ||||||||||||||||||
| // granular status without aborting the rest of the cleanup. | ||||||||||||||||||
| // | ||||||||||||||||||
| // Bug 82: the previous `Adm.Down` call had no timeout, so deleting an | ||||||||||||||||||
| // RD whose kernel slot was `suspended:quorum` would block the entire | ||||||||||||||||||
| // handleDelete → satellite finalizer-strip path indefinitely. With | ||||||||||||||||||
| // the satellite finalizer stuck, operators force-stripped it and the | ||||||||||||||||||
| // kernel slot survived as a `/dev/drbd<minor>` orphan that prevented | ||||||||||||||||||
| // the next port→minor allocation from creating its metadata. The | ||||||||||||||||||
| // bounded `DownForce` path collapses both failure modes: happy-path | ||||||||||||||||||
| // returns within DownForceTimeout; quorum-stuck path falls back to | ||||||||||||||||||
| // `drbdsetup detach --force` per volume + `drbdsetup down` (neither | ||||||||||||||||||
| // command waits for quorum). Either way the kernel slot is gone | ||||||||||||||||||
| // before we return, and finalizer-strip becomes safe again. | ||||||||||||||||||
| func (r *Reconciler) DeleteResource(ctx context.Context, req *intent.DeleteResourceRequest) (*intent.DeleteResourceResponse, error) { | ||||||||||||||||||
| var downMsg string | ||||||||||||||||||
|
|
||||||||||||||||||
| if r.cfg.Adm != nil { | ||||||||||||||||||
| err := r.cfg.Adm.Down(ctx, req.GetName()) | ||||||||||||||||||
| err := r.cfg.Adm.DownForce(ctx, req.GetName(), req.GetVolumeNumbers()) | ||||||||||||||||||
| if err != nil { | ||||||||||||||||||
| // Best-effort: a "not configured" error is fine here | ||||||||||||||||||
| // (resource was already torn down on a prior pass), but we | ||||||||||||||||||
| // still want to surface the message back to the controller | ||||||||||||||||||
| // so it shows up in the gRPC response. Don't fail — DRBD | ||||||||||||||||||
| // down errors shouldn't block the storage cleanup. | ||||||||||||||||||
| downMsg = "drbdadm down: " + err.Error() | ||||||||||||||||||
| // Best-effort: a "not configured" / "no such resource" | ||||||||||||||||||
| // error here is fine (resource was already torn down on | ||||||||||||||||||
| // a prior pass), but we still want to surface the | ||||||||||||||||||
| // message back to the controller so it shows up in the | ||||||||||||||||||
| // response. Don't fail — even a fully-failed DownForce | ||||||||||||||||||
| // shouldn't block storage cleanup; the storage sweeper | ||||||||||||||||||
| // is the wider-scope backstop. | ||||||||||||||||||
| downMsg = "drbdadm down (force): " + err.Error() | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
@@ -1166,23 +1181,30 @@ func (r *Reconciler) seedInitialGi(ctx context.Context, dr *intent.DesiredResour | |||||||||||||||||
| continue | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, seed) | ||||||||||||||||||
| // Bug 81: DRBD 9.2+'s drbdmeta requires --node-id on set-gi | ||||||||||||||||||
| // (the GI tuple is per-peer in modern metadata layouts). The | ||||||||||||||||||
| // current SetGi call is the legacy no-node-id form; on | ||||||||||||||||||
| // affected stands it returns | ||||||||||||||||||
| // "The set-gi command requires the --node-id option". Catch | ||||||||||||||||||
| // that specific error and downgrade to a log: skipping the | ||||||||||||||||||
| // day0 GI seed is safe — DRBD just falls through to a full | ||||||||||||||||||
| // initial sync on first connect, the slow-but-correct path. | ||||||||||||||||||
| // Proper fix (per-peer iteration with --node-id) is tracked | ||||||||||||||||||
| // as Bug 81 follow-up. | ||||||||||||||||||
| if err != nil && strings.Contains(err.Error(), "requires the --node-id option") { | ||||||||||||||||||
| err = nil | ||||||||||||||||||
| } | ||||||||||||||||||
| // DRBD 9.2+'s drbdmeta requires --node-id on set-gi — the GI | ||||||||||||||||||
| // tuple lives in per-peer bitmap slots, so we iterate over | ||||||||||||||||||
| // every diskful peer and stamp the same day0/seed GI against | ||||||||||||||||||
| // each. With the deterministic day0 GI (Bug 77), every replica | ||||||||||||||||||
| // computes the same tuple, so every peer's bitmap-vs-our-disk | ||||||||||||||||||
| // reads as "in sync" and DRBD's handshake skips the full | ||||||||||||||||||
| // initial-sync. Skipping a peer here would force a full resync | ||||||||||||||||||
| // against that peer only, which is the safe-but-slow fallback | ||||||||||||||||||
| // — matches LINSTOR's per-peer SetGi loop. | ||||||||||||||||||
| for _, peer := range dr.GetPeers() { | ||||||||||||||||||
| peerIDStr := dr.GetDrbdOptions()["peer."+peer+".node-id"] | ||||||||||||||||||
| if peerIDStr == "" { | ||||||||||||||||||
| continue | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| if err != nil { | ||||||||||||||||||
| return errors.Wrapf(err, "set-gi vol %d", vol.GetVolumeNumber()) | ||||||||||||||||||
| peerID, parseErr := strconv.Atoi(peerIDStr) | ||||||||||||||||||
| if parseErr != nil { | ||||||||||||||||||
| continue | ||||||||||||||||||
| } | ||||||||||||||||||
|
Comment on lines
+1199
to
+1202
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code silently ignores an error when parsing
Suggested change
|
||||||||||||||||||
|
|
||||||||||||||||||
| err := r.cfg.Adm.SetGi(ctx, dr.GetName(), vol.GetVolumeNumber(), device, int32(peerID), seed) | ||||||||||||||||||
| if err != nil { | ||||||||||||||||||
| return errors.Wrapf(err, "set-gi vol %d peer %s(%d)", vol.GetVolumeNumber(), peer, peerID) | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The condition to check for expected errors from
drbdsetup downis a bit long and relies on hardcoded strings. This is fragile and can be hard to maintain if the error messages fromdrbdsetupchange in the future. To improve readability and maintainability, you could iterate over a slice of expected error substrings. This makes it easier to add or remove expected error messages in the future.