diff --git a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go index a4e823ac14..0aefd93764 100644 --- a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go @@ -1026,6 +1026,23 @@ const ( ConditionReasonIdentitySynthesizedInactive = "AllUpstreamsHaveUserInfo" ) +// Condition reasons for ConditionTypeValid on OBO-typed configs. These +// literals are part of the user-facing contract — external consumers and +// downstream tooling pattern-match on them. +const ( + // ConditionReasonEnterpriseRequired: an obo-typed MCPExternalAuthConfig + // requires an enterprise build that has registered an OBO handler via + // controllerutil.RegisterOBOHandler. Upstream-only builds surface this + // reason for every obo-typed config. + ConditionReasonEnterpriseRequired = "EnterpriseRequired" + + // ConditionReasonInvalidConfig: an obo-typed MCPExternalAuthConfig is + // well-formed at the CRD level but fails the registered OBO handler's + // Validate() with an error other than the enterprise-required sentinel. + // Used by out-of-tree handlers; unreachable in upstream-only builds. + ConditionReasonInvalidConfig = "InvalidConfig" +) + // MCPExternalAuthConfigStatus defines the observed state of MCPExternalAuthConfig type MCPExternalAuthConfigStatus struct { // Conditions represent the latest available observations of the MCPExternalAuthConfig's state @@ -1115,14 +1132,14 @@ func (r *MCPExternalAuthConfig) Validate() error { // No complex validation needed for these types return nil case ExternalAuthTypeOBO: - // TODO(#5328): no body change is planned here — OBO validation is - // delegated to the registered OBO handler at the controllerutil layer - // (via controllerutil.OBOValidate -> obo.ErrEnterpriseRequired in - // upstream-only builds), invoked from the reconcile loop. The - // CRD-level Validate() stays a no-op for OBO and exists only to keep - // the `exhaustive` linter happy now that ExternalAuthTypeOBO is - // defined. The CRD enum currently rejects "obo" at the apiserver - // layer, so this arm is unreachable in upstream-only builds. + // OBO validation is delegated to the registered OBO handler at the + // controllerutil layer (via controllerutil.OBOValidate -> + // obo.ErrEnterpriseRequired in upstream-only builds), invoked from + // the reconcile loop. The CRD-level Validate() stays a no-op for OBO + // and exists only to keep the `exhaustive` linter happy now that + // ExternalAuthTypeOBO is defined. The CRD enum currently rejects + // "obo" at the apiserver layer, so this arm is unreachable in + // upstream-only builds. return nil default: // Unknown type - should be caught by enum validation, but handle defensively diff --git a/cmd/thv-operator/controllers/mcpexternalauthconfig_controller.go b/cmd/thv-operator/controllers/mcpexternalauthconfig_controller.go index 13e812bef2..7e0a653527 100644 --- a/cmd/thv-operator/controllers/mcpexternalauthconfig_controller.go +++ b/cmd/thv-operator/controllers/mcpexternalauthconfig_controller.go @@ -5,6 +5,7 @@ package controllers import ( "context" + stderrors "errors" "fmt" "time" @@ -22,6 +23,7 @@ import ( mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + "github.com/stacklok/toolhive/pkg/auth/obo" ) const ( @@ -90,6 +92,15 @@ func (r *MCPExternalAuthConfigReconciler) Reconcile(ctx context.Context, req ctr // it before validation ensures the advisory tracks the current spec on // every reconcile — including the validation-failure path — so a broken // edit cannot leave a stale True/upstream-name dangling. + // + // Note: the OBO failure path routes through setInvalid, which discards + // this in-memory mutation (its MutateAndPatchStatus call re-fetches and + // re-applies the advisory inside its patch closure). The in-memory + // mutation here is therefore load-bearing only on the validation-failure + // path (the r.Status().Update on the ValidationFailed return) and the + // Valid=True path (the r.Status().Update on the conditionChanged write). + // The function is idempotent on the same spec, so the double computation + // on the OBO path is benign. syntheticChanged := r.applyIdentitySynthesizedCondition(externalAuthConfig) // Validate spec configuration early @@ -111,6 +122,16 @@ func (r *MCPExternalAuthConfigReconciler) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil // Don't requeue on validation errors - user must fix spec } + // Dispatch OBO-typed configs through the registered handler. The default + // handler returns obo.ErrEnterpriseRequired so upstream-only builds surface + // Valid=False / Reason=EnterpriseRequired here rather than failing later + // inside a consumer reconciler with a generic "unsupported" error. + if externalAuthConfig.Spec.Type == mcpv1beta1.ExternalAuthTypeOBO { + if handled, err := r.triageOBOValidation(ctx, externalAuthConfig); handled { + return ctrl.Result{}, err + } + } + // Validation succeeded - set Valid=True condition conditionChanged := meta.SetStatusCondition(&externalAuthConfig.Status.Conditions, metav1.Condition{ Type: mcpv1beta1.ConditionTypeValid, @@ -187,6 +208,98 @@ func (*MCPExternalAuthConfigReconciler) applyIdentitySynthesizedCondition( }) } +// triageOBOValidation runs the registered OBO handler's Validate and routes +// the result through the three-bucket contract documented on OBOHandler: +// +// - ErrEnterpriseRequired → permanent, Reason=EnterpriseRequired +// - *obo.ValidationError → permanent, Reason=InvalidConfig (the typed +// error's Message is written verbatim into condition.Message) +// - anything else → transient; return the error wrapped so +// controller-runtime requeues with backoff and self-heals once the +// upstream dependency (Secret/JWKS/webhook) recovers +// +// Returns handled=true to signal the caller to return immediately (with a +// zero ctrl.Result and the returned error). handled=false means Validate +// succeeded and the reconciler should continue with the Valid=True path. +// +// Extracted from Reconcile so the parent stays under the gocyclo threshold. +func (r *MCPExternalAuthConfigReconciler) triageOBOValidation( + ctx context.Context, + cfg *mcpv1beta1.MCPExternalAuthConfig, +) (handled bool, err error) { + validateErr := ctrlutil.OBOValidate(cfg) + if validateErr == nil { + return false, nil + } + switch { + case stderrors.Is(validateErr, obo.ErrEnterpriseRequired): + return true, r.setInvalid(ctx, cfg, validateErr, mcpv1beta1.ConditionReasonEnterpriseRequired) + default: + var valErr *obo.ValidationError + if stderrors.As(validateErr, &valErr) { + return true, r.setInvalid(ctx, cfg, valErr, mcpv1beta1.ConditionReasonInvalidConfig) + } + // Transient: return the error so controller-runtime requeues with + // backoff. Locking the resource into a permanent InvalidConfig on a + // transient I/O blip would require the user to touch the spec for + // the failure to clear, defeating self-healing. + return true, fmt.Errorf("OBO handler validation failed: %w", validateErr) + } +} + +// setInvalid writes a Valid=False condition through MutateAndPatchStatus, +// using the supplied reason string and the error's message as the condition +// message. Returns an empty result with no requeue: the spec must change (or +// an out-of-tree handler must be registered) for this branch to clear, so +// requeuing buys nothing. +// +// Callers in Reconcile() may have already mutated cfg.Status.Conditions in +// memory (notably applyIdentitySynthesizedCondition). MutateAndPatchStatus +// diffs the post-mutate object against the snapshot it takes at the start of +// the call, so any mutation present in cfg before the helper runs lands in +// both halves of the diff and silently disappears from the merge patch. To +// avoid losing the IdentitySynthesized advisory transition (e.g., when a user +// switches a config from embeddedAuthServer to obo), this helper re-fetches +// the object from the apiserver and re-applies the synthesized-condition +// computation inside the patch closure so both the advisory transition and +// the Valid=False condition land in the same patch. +func (r *MCPExternalAuthConfigReconciler) setInvalid( + ctx context.Context, + cfg *mcpv1beta1.MCPExternalAuthConfig, + err error, + reason string, +) error { + fresh := &mcpv1beta1.MCPExternalAuthConfig{} + if getErr := r.Get(ctx, client.ObjectKeyFromObject(cfg), fresh); getErr != nil { + if errors.IsNotFound(getErr) { + // Deleted between the reconciler's initial Get and this re-fetch; + // nothing to patch, and the reconciler's own NotFound handling + // already returned cleanly the next time around. + return nil + } + return getErr + } + return ctrlutil.MutateAndPatchStatus(ctx, r.Client, fresh, func(c *mcpv1beta1.MCPExternalAuthConfig) { + // applyIdentitySynthesizedCondition is idempotent on the same spec; + // re-applying it inside the closure folds the advisory transition + // into the same patch as the Valid=False write below. This + // re-invocation is load-bearing: removing it would cause + // MutateAndPatchStatus to silently drop the IdentitySynthesized + // transition because the pre-mutate snapshot already contains the + // in-memory mutation from line 95. See + // TestMCPExternalAuthConfigReconciler_OBO_ClearsStaleIdentitySynthesized + // for the regression guard. + r.applyIdentitySynthesizedCondition(c) + meta.SetStatusCondition(&c.Status.Conditions, metav1.Condition{ + Type: mcpv1beta1.ConditionTypeValid, + Status: metav1.ConditionFalse, + Reason: reason, + Message: err.Error(), + ObservedGeneration: c.Generation, + }) + }) +} + // handleConfigHashChange handles the logic when the config hash changes func (r *MCPExternalAuthConfigReconciler) handleConfigHashChange( ctx context.Context, diff --git a/cmd/thv-operator/controllers/mcpexternalauthconfig_controller_test.go b/cmd/thv-operator/controllers/mcpexternalauthconfig_controller_test.go index 30d35ed9b3..8f8a619198 100644 --- a/cmd/thv-operator/controllers/mcpexternalauthconfig_controller_test.go +++ b/cmd/thv-operator/controllers/mcpexternalauthconfig_controller_test.go @@ -4,6 +4,9 @@ package controllers import ( + "context" + stderrors "errors" + "fmt" "testing" "time" @@ -18,6 +21,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" + ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + "github.com/stacklok/toolhive/pkg/auth/obo" + "github.com/stacklok/toolhive/pkg/runner" ) func TestMCPExternalAuthConfigReconciler_calculateConfigHash(t *testing.T) { @@ -1476,3 +1482,291 @@ func findCondition(conditions []metav1.Condition, t string) *metav1.Condition { } return nil } + +// TestMCPExternalAuthConfigReconciler_OBO_DefaultHandler_SetsEnterpriseRequired +// proves the dispatch wiring: with the default OBO handler installed (the +// upstream-only state), reconciling an obo-typed MCPExternalAuthConfig surfaces +// Valid=False / Reason=EnterpriseRequired rather than the generic "unsupported +// external auth type" path. Drives the reconciler directly with a fake client +// to bypass the CRD enum (which does not admit "obo" until #5329 lands). +func TestMCPExternalAuthConfigReconciler_OBO_DefaultHandler_SetsEnterpriseRequired(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + cfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cfg). + WithStatusSubresource(&mcpv1beta1.MCPExternalAuthConfig{}). + Build() + + r := &MCPExternalAuthConfigReconciler{Client: fakeClient, Scheme: scheme} + req := reconcile.Request{NamespacedName: types.NamespacedName{ + Name: cfg.Name, + Namespace: cfg.Namespace, + }} + + // First reconcile adds the finalizer; the requeued reconcile runs the body. + result, err := r.Reconcile(t.Context(), req) + require.NoError(t, err) + if result.RequeueAfter > 0 { + _, err = r.Reconcile(t.Context(), req) + require.NoError(t, err) + } + + var updated mcpv1beta1.MCPExternalAuthConfig + require.NoError(t, fakeClient.Get(t.Context(), req.NamespacedName, &updated)) + + validCond := findCondition(updated.Status.Conditions, mcpv1beta1.ConditionTypeValid) + require.NotNil(t, validCond, "Valid condition must be set for OBO-typed config") + assert.Equal(t, metav1.ConditionFalse, validCond.Status) + assert.Equal(t, mcpv1beta1.ConditionReasonEnterpriseRequired, validCond.Reason, + "the exact reason string is part of the user-facing contract — external consumers pattern-match on it") + + // Generic-error guard: the dispatch path must NOT leak a generic + // "unsupported external auth type" or "unknown middleware type" message. + assert.NotContains(t, validCond.Message, "unsupported external auth type") + assert.NotContains(t, validCond.Message, "unknown middleware type") + + // Positive assertion: condition.Message must surface the sentinel's + // user-facing text. Without this check a refactor that emptied or + // rewrote the message would still pass the reason-string assertion. + assert.Equal(t, obo.ErrEnterpriseRequired.Error(), validCond.Message, + "condition.Message must surface the sentinel's user-facing text") +} + +// TestMCPExternalAuthConfigReconciler_OBO_ClearsStaleIdentitySynthesized +// proves that when a user switches an existing config from embeddedAuthServer +// (which set IdentitySynthesized=True) to obo, the stale IdentitySynthesized +// condition is removed alongside the new Valid=False/EnterpriseRequired +// condition. Regression for the bug where setInvalid's MutateAndPatchStatus +// diffed against an already-mutated snapshot and silently dropped the +// IdentitySynthesized removal. +func TestMCPExternalAuthConfigReconciler_OBO_ClearsStaleIdentitySynthesized(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + // Construct a config that is already in the obo type but has a stale + // IdentitySynthesized condition left over from a prior embeddedAuthServer + // configuration. The reconciler must remove that condition on its next + // pass even though the failure path now routes through MutateAndPatchStatus. + cfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + Finalizers: []string{ExternalAuthConfigFinalizerName}, + Generation: 2, + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + Status: mcpv1beta1.MCPExternalAuthConfigStatus{ + Conditions: []metav1.Condition{ + { + Type: mcpv1beta1.ConditionTypeIdentitySynthesized, + Status: metav1.ConditionTrue, + Reason: mcpv1beta1.ConditionReasonIdentitySynthesizedActive, + Message: "stale message from the embeddedAuthServer days", + ObservedGeneration: 1, + LastTransitionTime: metav1.Now(), + }, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cfg). + WithStatusSubresource(&mcpv1beta1.MCPExternalAuthConfig{}). + Build() + + r := &MCPExternalAuthConfigReconciler{Client: fakeClient, Scheme: scheme} + req := reconcile.Request{NamespacedName: types.NamespacedName{ + Name: cfg.Name, + Namespace: cfg.Namespace, + }} + + _, err := r.Reconcile(t.Context(), req) + require.NoError(t, err) + + var updated mcpv1beta1.MCPExternalAuthConfig + require.NoError(t, fakeClient.Get(t.Context(), req.NamespacedName, &updated)) + + // IdentitySynthesized must be removed (the spec is no longer embeddedAuthServer). + assert.Nil(t, findCondition(updated.Status.Conditions, mcpv1beta1.ConditionTypeIdentitySynthesized), + "stale IdentitySynthesized condition must be removed once the spec leaves embeddedAuthServer") + + // Valid=False/EnterpriseRequired must be set in the same reconcile pass. + validCond := findCondition(updated.Status.Conditions, mcpv1beta1.ConditionTypeValid) + require.NotNil(t, validCond) + assert.Equal(t, metav1.ConditionFalse, validCond.Status) + assert.Equal(t, mcpv1beta1.ConditionReasonEnterpriseRequired, validCond.Reason) +} + +// defaultOBOHandlerStub returns a handler whose every method returns +// obo.ErrEnterpriseRequired — the same shape as the package default. Used to +// restore the registered handler after a test that overrode it. +func defaultOBOHandlerStub() ctrlutil.OBOHandler { + return ctrlutil.OBOHandler{ + Validate: func(*mcpv1beta1.MCPExternalAuthConfig) error { return obo.ErrEnterpriseRequired }, + ApplyRunConfig: func( + context.Context, client.Client, string, + *mcpv1beta1.MCPExternalAuthConfig, *[]runner.RunConfigBuilderOption, + ) error { + return obo.ErrEnterpriseRequired + }, + SecretEnvVars: func(*mcpv1beta1.MCPExternalAuthConfig) ([]corev1.EnvVar, error) { + return nil, obo.ErrEnterpriseRequired + }, + } +} + +// TestMCPExternalAuthConfigReconciler_OBO_ErrorTriageInReconcile drives the +// production three-way error triage in the OBO branch of Reconcile by +// registering a stub OBO handler whose Validate returns each test's error, +// then calling Reconcile and asserting on the resulting condition (or the +// returned error, for the transient case). The three buckets are: +// +// - errors.Is(err, ErrEnterpriseRequired) → permanent, EnterpriseRequired +// - errors.As(err, &*obo.ValidationError) → permanent, InvalidConfig +// - anything else → transient; Reconcile returns +// the error, no condition write +// +// This exercises the production errors.Is decision (so a regression that +// swaps stderrors.Is for == would be caught — that would break the vMCP +// "failed to convert to strategy: %w" wrap path) AND the new errors.As +// decision for *obo.ValidationError. +// +//nolint:paralleltest // Mutates package-level oboHandler via RegisterOBOHandler. +func TestMCPExternalAuthConfigReconciler_OBO_ErrorTriageInReconcile(t *testing.T) { + tests := []struct { + name string + validateErr error + // Exactly one of these two paths is exercised per test case: + // either the reconciler writes a permanent Valid=False condition with + // the expected reason/message, or it returns a transient error and + // writes no Valid condition. + wantTransient bool + wantReason string + wantMessage string + }{ + { + name: "bare sentinel produces EnterpriseRequired (permanent)", + validateErr: obo.ErrEnterpriseRequired, + wantReason: mcpv1beta1.ConditionReasonEnterpriseRequired, + wantMessage: obo.ErrEnterpriseRequired.Error(), + }, + { + name: "wrapped sentinel still produces EnterpriseRequired via errors.Is", + validateErr: fmt.Errorf("outer: %w", obo.ErrEnterpriseRequired), + wantReason: mcpv1beta1.ConditionReasonEnterpriseRequired, + wantMessage: "outer: " + obo.ErrEnterpriseRequired.Error(), + }, + { + name: "ValidationError produces InvalidConfig (permanent)", + validateErr: &obo.ValidationError{Message: "audience must be a non-empty URL"}, + wantReason: mcpv1beta1.ConditionReasonInvalidConfig, + wantMessage: "audience must be a non-empty URL", + }, + { + name: "wrapped ValidationError still produces InvalidConfig via errors.As", + validateErr: fmt.Errorf("validating obo spec: %w", &obo.ValidationError{Message: "missing tokenURL"}), + wantReason: mcpv1beta1.ConditionReasonInvalidConfig, + // setInvalid is called with the unwrapped ValidationError, so its + // Message lands verbatim — the wrap prefix is discarded on purpose + // so handler-author-visible text controls what kubectl describe shows. + wantMessage: "missing tokenURL", + }, + { + name: "unclassified error is transient — Reconcile returns it, no condition write", + validateErr: stderrors.New("transient JWKS fetch failed"), + wantTransient: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Always restore the default after the subtest so we don't leak + // the stub into other tests that share this package. + t.Cleanup(func() { ctrlutil.RegisterOBOHandler(defaultOBOHandlerStub()) }) + + // Install the per-case stub: Validate returns tt.validateErr; + // the other two methods keep the default behavior. + stub := defaultOBOHandlerStub() + stub.Validate = func(*mcpv1beta1.MCPExternalAuthConfig) error { return tt.validateErr } + ctrlutil.RegisterOBOHandler(stub) + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + cfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cfg). + WithStatusSubresource(&mcpv1beta1.MCPExternalAuthConfig{}). + Build() + + r := &MCPExternalAuthConfigReconciler{Client: fakeClient, Scheme: scheme} + req := reconcile.Request{NamespacedName: types.NamespacedName{ + Name: cfg.Name, + Namespace: cfg.Namespace, + }} + + // First reconcile may just add the finalizer; the requeued + // reconcile runs the body that exercises the triage. + result, err := r.Reconcile(t.Context(), req) + if result.RequeueAfter > 0 { + require.NoError(t, err) + result, err = r.Reconcile(t.Context(), req) + } + + var updated mcpv1beta1.MCPExternalAuthConfig + require.NoError(t, fakeClient.Get(t.Context(), req.NamespacedName, &updated)) + validCond := findCondition(updated.Status.Conditions, mcpv1beta1.ConditionTypeValid) + + if tt.wantTransient { + // Transient path: Reconcile must return an error that + // preserves the underlying cause (errors.Is must still match + // it through the wrap), and no Valid condition is written. + require.Error(t, err, "transient errors must propagate so controller-runtime requeues") + assert.ErrorIs(t, err, tt.validateErr, + "the transient error must remain inspectable via errors.Is") + assert.Nil(t, validCond, + "transient errors must not write a Valid condition; "+ + "locking the resource into a permanent state would block self-healing") + return + } + + require.NoError(t, err) + require.NotNil(t, validCond) + assert.Equal(t, metav1.ConditionFalse, validCond.Status) + assert.Equal(t, tt.wantReason, validCond.Reason) + assert.Equal(t, tt.wantMessage, validCond.Message) + }) + } +} diff --git a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go index 8eecd65d5e..d42e6a7e63 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_deployment.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_deployment.go @@ -832,12 +832,7 @@ func (r *VirtualMCPServerReconciler) getExternalAuthConfigSecretEnvVar( return nil, nil case mcpv1beta1.ExternalAuthTypeOBO: - // TODO(#5328): replace this body with a call into - // controllerutil.OBOSecretEnvVars as part of the dispatch-wiring task. - // The arm exists today to satisfy the `exhaustive` linter. The CRD - // enum currently rejects "obo" at the apiserver layer, so this arm is - // unreachable in upstream-only builds. - return nil, nil + return firstOBOSecretEnvVar(externalAuthConfig) default: return nil, nil // Not applicable @@ -856,6 +851,20 @@ func (r *VirtualMCPServerReconciler) getExternalAuthConfigSecretEnvVar( }, nil } +// firstOBOSecretEnvVar dispatches through the registered OBO handler and +// returns the first env var (if any) or the dispatch error. In upstream-only +// builds the default handler returns obo.ErrEnterpriseRequired; an out-of-tree +// build registers a real handler via controllerutil.RegisterOBOHandler. +// Extracted from getExternalAuthConfigSecretEnvVar to keep its cyclomatic +// complexity below the project threshold. +func firstOBOSecretEnvVar(cfg *mcpv1beta1.MCPExternalAuthConfig) (*corev1.EnvVar, error) { + envVars, err := ctrlutil.OBOSecretEnvVars(cfg) + if err != nil || len(envVars) == 0 { + return nil, err + } + return &envVars[0], nil +} + // buildDeploymentMetadataForVmcp builds deployment-level labels and annotations func (r *VirtualMCPServerReconciler) buildDeploymentMetadataForVmcp( baseLabels map[string]string, diff --git a/cmd/thv-operator/controllers/virtualmcpserver_externalauth_test.go b/cmd/thv-operator/controllers/virtualmcpserver_externalauth_test.go index e4ce126fc7..7e3b8945ea 100644 --- a/cmd/thv-operator/controllers/virtualmcpserver_externalauth_test.go +++ b/cmd/thv-operator/controllers/virtualmcpserver_externalauth_test.go @@ -19,6 +19,7 @@ import ( mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + "github.com/stacklok/toolhive/pkg/auth/obo" "github.com/stacklok/toolhive/pkg/authserver" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config" @@ -1649,3 +1650,48 @@ func TestBuildOutgoingAuthConfig_InlineBackendSubjectProviderInjection(t *testin assert.Equal(t, "corporate-idp", config.Backends["inline-backend"].TokenExchange.SubjectProviderName, "inline backend SubjectProviderName should be injected from first upstream") } + +// TestGetExternalAuthConfigSecretEnvVar_OBO proves the obo arm of the +// getExternalAuthConfigSecretEnvVar switch dispatches through the registered +// OBO handler. With the default handler the method must return an error +// wrapping obo.ErrEnterpriseRequired AND must not silently fall through to +// nil, nil — that would mask the wired-but-disabled state behind a no-op. +func TestGetExternalAuthConfigSecretEnvVar_OBO(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + authCfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(authCfg). + Build() + + r := &VirtualMCPServerReconciler{ + Client: fakeClient, + Scheme: scheme, + PlatformDetector: ctrlutil.NewSharedPlatformDetector(), + } + + envVar, err := r.getExternalAuthConfigSecretEnvVar(t.Context(), "default", authCfg.Name) + require.Error(t, err) + assert.ErrorIs(t, err, obo.ErrEnterpriseRequired, + "the default OBO handler returns obo.ErrEnterpriseRequired; the dispatch arm must propagate it") + assert.Nil(t, envVar, "no env var should be returned on the error path") + + // Generic-error guard: per issue #5328 AC, neither generic substring may + // leak from any of the three consumer dispatch paths. + assert.NotContains(t, err.Error(), "unsupported external auth type") + assert.NotContains(t, err.Error(), "unknown middleware type") +} diff --git a/cmd/thv-operator/pkg/controllerutil/tokenexchange.go b/cmd/thv-operator/pkg/controllerutil/tokenexchange.go index e0dc6680e4..5141f5eba4 100644 --- a/cmd/thv-operator/pkg/controllerutil/tokenexchange.go +++ b/cmd/thv-operator/pkg/controllerutil/tokenexchange.go @@ -25,14 +25,39 @@ import ( // MCPExternalAuthConfig resources. An out-of-tree build replaces the default // instance (which returns obo.ErrEnterpriseRequired from every method) by // calling RegisterOBOHandler once during init(). +// +// # Error contract +// +// Every method below must return one of three error categories so callers can +// triage failures consistently: +// +// - errors.Is(err, obo.ErrEnterpriseRequired) — the build is not licensed +// to run OBO. Callers treat this as permanent until an out-of-tree handler +// is registered. +// - errors.As(err, &*obo.ValidationError) — the user-supplied spec is +// malformed (missing field, schema violation, invalid URL). Callers treat +// this as permanent until the user edits the spec. The ValidationError's +// Message field is written verbatim into the surfaced condition, so +// handler authors must ensure it is safe to expose (no Secret names, no +// internal addressing, no credential fragments). +// - anything else — treated as a transient failure (Secret not yet +// available, JWKS unreachable, webhook 5xx). Callers requeue with backoff +// rather than locking the resource into a permanent state. +// +// Returning a non-ValidationError for what is genuinely a user-fix condition +// causes the reconciler to spin on backoff. Returning a ValidationError for +// what is genuinely transient locks the resource into InvalidConfig until the +// user edits the spec. Handler authors are responsible for placing each +// failure in the right bucket. type OBOHandler struct { // Validate is called from MCPExternalAuthConfig validation to verify the - // resource's obo-typed config is well-formed. + // resource's obo-typed config is well-formed. See the type-level "Error + // contract" doc for the three-bucket triage callers apply to its return. Validate func(*mcpv1beta1.MCPExternalAuthConfig) error // ApplyRunConfig is called from AddExternalAuthConfigOptions to apply // OBO-specific runner configuration options for consuming MCPServer/ - // MCPRemoteProxy resources. + // MCPRemoteProxy resources. See the type-level "Error contract" doc. ApplyRunConfig func( ctx context.Context, c client.Client, namespace string, cfg *mcpv1beta1.MCPExternalAuthConfig, @@ -40,7 +65,8 @@ type OBOHandler struct { ) error // SecretEnvVars is called when computing the consuming resource's pod - // environment, to inject any secrets the OBO flow needs at runtime. + // environment, to inject any secrets the OBO flow needs at runtime. See + // the type-level "Error contract" doc. SecretEnvVars func(*mcpv1beta1.MCPExternalAuthConfig) ([]corev1.EnvVar, error) } @@ -115,9 +141,9 @@ func OBOSecretEnvVars(cfg *mcpv1beta1.MCPExternalAuthConfig) ([]corev1.EnvVar, e // OBOApplyRunConfig runs the registered OBO handler's ApplyRunConfig function. // With the default handler it returns obo.ErrEnterpriseRequired without -// mutating the supplied options slice. The follow-up dispatch-wiring task -// (#5328) is the first caller of this wrapper; exporting it now keeps the -// three-method surface symmetric with OBOValidate / OBOSecretEnvVars. +// mutating the supplied options slice. Exported so that the three-method +// surface stays symmetric with OBOValidate / OBOSecretEnvVars; routes through +// the package-level OBO handler registered via RegisterOBOHandler. func OBOApplyRunConfig( ctx context.Context, c client.Client, @@ -225,14 +251,12 @@ func AddExternalAuthConfigOptions( // Upstream inject is handled by the vMCP converter at runtime return nil case mcpv1beta1.ExternalAuthTypeOBO: - // TODO(#5328): replace this body with a call into - // oboHandler.ApplyRunConfig as part of the dispatch-wiring task. The - // arm exists today to satisfy the `exhaustive` linter; until #5328 - // lands we surface obo.ErrEnterpriseRequired so callers can use - // errors.Is to distinguish "type known but not wired" from a genuinely - // unknown type. The CRD enum currently rejects "obo" at the apiserver - // layer, so this arm is unreachable in upstream-only builds. - return fmt.Errorf("obo dispatch not yet wired: %w", obo.ErrEnterpriseRequired) + // Dispatch through the registered handler. In upstream-only builds the + // default handler returns obo.ErrEnterpriseRequired; an out-of-tree + // build registers a real handler via RegisterOBOHandler. Bypass the + // default's "unsupported external auth type" path so callers can + // distinguish via errors.Is(err, obo.ErrEnterpriseRequired). + return OBOApplyRunConfig(ctx, c, namespace, externalAuthConfig, options) default: return fmt.Errorf("unsupported external auth type: %s", externalAuthConfig.Spec.Type) } diff --git a/cmd/thv-operator/pkg/controllerutil/tokenexchange_test.go b/cmd/thv-operator/pkg/controllerutil/tokenexchange_test.go index cc43af8b69..4a04a543ba 100644 --- a/cmd/thv-operator/pkg/controllerutil/tokenexchange_test.go +++ b/cmd/thv-operator/pkg/controllerutil/tokenexchange_test.go @@ -11,7 +11,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" "github.com/stacklok/toolhive/pkg/auth/obo" @@ -234,3 +237,54 @@ func TestRegisterOBOHandler_PanicsOnNilField(t *testing.T) { }) } } + +// TestAddExternalAuthConfigOptions_OBO proves the obo arm of the +// AddExternalAuthConfigOptions switch dispatches through the registered OBO +// handler. With the default handler the function must return an error wrapping +// obo.ErrEnterpriseRequired AND must not fall through to the default arm's +// "unsupported external auth type" message — external consumers pattern-match +// on errors.Is, and the dispatch arm's purpose is to keep the wired-but- +// disabled state distinguishable from a genuinely unknown type. +func TestAddExternalAuthConfigOptions_OBO(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + authCfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(authCfg). + Build() + + var options []runner.RunConfigBuilderOption + err := AddExternalAuthConfigOptions( + t.Context(), + fakeClient, + "default", + "server-name", + &mcpv1beta1.ExternalAuthConfigRef{Name: authCfg.Name}, + nil, // oidcConfig — not required for OBO + &options, + ) + require.Error(t, err) + assert.ErrorIs(t, err, obo.ErrEnterpriseRequired, + "the default OBO handler returns obo.ErrEnterpriseRequired; the dispatch arm must propagate it") + + // Generic-error guard: the dispatch arm must short-circuit the default + // arm's "unsupported external auth type: ..." path and must not leak the + // middleware-map lookup's "unknown middleware type" path either. + assert.NotContains(t, err.Error(), "unsupported external auth type") + assert.NotContains(t, err.Error(), "unknown middleware type") + assert.Empty(t, options, "default OBO handler must not append to the options slice") +} diff --git a/pkg/auth/obo/errors.go b/pkg/auth/obo/errors.go index f1b2ab72f5..8b51056028 100644 --- a/pkg/auth/obo/errors.go +++ b/pkg/auth/obo/errors.go @@ -18,3 +18,35 @@ import "errors" // obo.RegisterFactory (for the proxy middleware factory). var ErrEnterpriseRequired = errors.New( "on-behalf-of (OBO) external auth type requires an enterprise build") + +// ValidationError is the typed error an OBO handler returns when its input +// is genuinely malformed and the user must fix the spec for the failure to +// clear. It is the contract for the "permanent, user-fix" bucket in the +// OBOHandler error triage: +// +// - errors.Is(err, ErrEnterpriseRequired) → not licensed; permanent until an +// out-of-tree handler is registered. +// - errors.As(err, &*ValidationError) → permanent until the user edits the +// spec; the operator writes condition.Reason=InvalidConfig and does not +// requeue. +// - anything else → treated as transient by the reconciler, which returns +// the error so controller-runtime requeues with backoff. +// +// Handler authors must use ValidationError for any condition the user can +// fix by editing the spec (missing field, malformed URL, schema violation) +// and must return a non-ValidationError for transient I/O failures (Secret +// not yet available, JWKS unreachable, webhook 5xx) so the reconciler +// retries instead of locking the resource into a permanent InvalidConfig +// state. +// +// The Message field is written verbatim into condition.Message — handler +// authors are responsible for ensuring it is safe to expose (no Secret +// names, no internal addressing, no credential fragments). +type ValidationError struct { + Message string +} + +// Error returns the user-facing message verbatim. The OBO branch in the +// MCPExternalAuthConfig reconciler writes this string into the Valid +// condition's Message field. +func (e *ValidationError) Error() string { return e.Message } diff --git a/pkg/auth/obo/errors_test.go b/pkg/auth/obo/errors_test.go index c95e2b2892..261787ee04 100644 --- a/pkg/auth/obo/errors_test.go +++ b/pkg/auth/obo/errors_test.go @@ -4,10 +4,12 @@ package obo import ( + "errors" "fmt" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestErrEnterpriseRequired_IsSentinel(t *testing.T) { @@ -18,3 +20,31 @@ func TestErrEnterpriseRequired_IsSentinel(t *testing.T) { wrapped := fmt.Errorf("outer wrap: %w", ErrEnterpriseRequired) assert.ErrorIs(t, wrapped, ErrEnterpriseRequired) } + +func TestValidationError_ErrorAndAsThroughWrap(t *testing.T) { + t.Parallel() + + original := &ValidationError{Message: "audience must be a non-empty URL"} + + // Error() returns the Message verbatim — the reconciler writes this + // string into condition.Message, so handler authors control the + // kubectl-describe output through this field. + assert.Equal(t, "audience must be a non-empty URL", original.Error()) + + // errors.As must match the typed error through a fmt.Errorf wrap so + // handler authors can wrap with extra context without breaking the + // reconciler's triage. + wrapped := fmt.Errorf("validating obo spec: %w", original) + var got *ValidationError + require.True(t, errors.As(wrapped, &got), + "errors.As must match *ValidationError through a fmt.Errorf wrap") + assert.Same(t, original, got, + "errors.As must return the same pointer that was wrapped") + + // errors.Is(err, ErrEnterpriseRequired) must NOT match a ValidationError + // — they live in different buckets of the OBOHandler error contract. + assert.False(t, errors.Is(original, ErrEnterpriseRequired), + "ValidationError must not be confused with the EnterpriseRequired sentinel") + assert.False(t, errors.Is(wrapped, ErrEnterpriseRequired), + "wrapped ValidationError must not be confused with the EnterpriseRequired sentinel") +} diff --git a/pkg/vmcp/auth/converters/obo_test.go b/pkg/vmcp/auth/converters/obo_test.go index d411084936..403f03d764 100644 --- a/pkg/vmcp/auth/converters/obo_test.go +++ b/pkg/vmcp/auth/converters/obo_test.go @@ -9,6 +9,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" "github.com/stacklok/toolhive/pkg/auth/obo" @@ -98,3 +102,52 @@ func TestOBOConverter_RegisteredInDefaultRegistry(t *testing.T) { require.NoError(t, err) assert.IsType(t, &OBOConverter{}, converter) } + +// TestDiscoverAndResolveAuth_OBO_SentinelSurvivesWrap proves that the +// OBO sentinel propagates through DiscoverAndResolveAuth's +// `"failed to convert to strategy: %w"` wrap at interface.go:191 and +// remains recognizable via errors.Is. The vMCP integration path is +// structurally fragile: matching by the wrap-prefix string would silently +// degrade if the message ever changed. This test is the regression guard. +func TestDiscoverAndResolveAuth_OBO_SentinelSurvivesWrap(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + cfg := &mcpv1beta1.MCPExternalAuthConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "obo-config", + Namespace: "default", + }, + Spec: mcpv1beta1.MCPExternalAuthConfigSpec{ + Type: mcpv1beta1.ExternalAuthTypeOBO, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(cfg). + Build() + + strategy, err := DiscoverAndResolveAuth( + t.Context(), + &mcpv1beta1.ExternalAuthConfigRef{Name: cfg.Name}, + cfg.Namespace, + fakeClient, + ) + + require.Error(t, err) + assert.Nil(t, strategy) + assert.ErrorIs(t, err, obo.ErrEnterpriseRequired, + "errors.Is must match the sentinel through the convert-to-strategy wrap") + + // Sanity: the wrap prefix is present, but recognition above is via + // errors.Is, not by string matching. + assert.Contains(t, err.Error(), "failed to convert to strategy") + + // Generic-error guards: the dispatch path must not leak generic strings. + assert.NotContains(t, err.Error(), "unsupported external auth type") + assert.NotContains(t, err.Error(), "unknown middleware type") +}