diff --git a/api/v1alpha1/hyperbytedbcluster_types.go b/api/v1alpha1/hyperbytedbcluster_types.go index 4c3bdd9..a89e6f0 100644 --- a/api/v1alpha1/hyperbytedbcluster_types.go +++ b/api/v1alpha1/hyperbytedbcluster_types.go @@ -94,6 +94,12 @@ type HyperbytedbClusterSpec struct { // +optional Failover *FailoverSpec `json:"failover,omitempty"` + // Proxy deploys a stateless `hyperbytedb-proxy` Deployment in front of the + // StatefulSet to absorb rolling restarts and drain events without + // returning errors to clients. + // +optional + Proxy *ProxySpec `json:"proxy,omitempty"` + // +optional PodAnnotations map[string]string `json:"podAnnotations,omitempty"` @@ -470,6 +476,93 @@ type FailoverSpec struct { FailoverTimeoutSecs int32 `json:"failoverTimeoutSecs,omitempty"` } +// ProxySpec configures the optional `hyperbytedb-proxy` reverse proxy that +// sits in front of the StatefulSet. The proxy is health-aware: it routes +// only to Active backends and holds requests briefly while a rolling +// restart cycles through pods, so clients (Grafana, Telegraf, etc.) never +// observe transient 503s. +type ProxySpec struct { + // When false, the operator does not create or reconcile any proxy + // resources. Existing proxy Deployment/Service (if any) are left alone + // so they can be cleaned up out-of-band. + // +kubebuilder:default=false + Enabled bool `json:"enabled"` + + // +kubebuilder:default="hyperbytedb-proxy:latest" + Image string `json:"image,omitempty"` + + // +optional + ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + + // +optional + ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"` + + // +kubebuilder:default=2 + // +kubebuilder:validation:Minimum=1 + Replicas *int32 `json:"replicas,omitempty"` + + // Port the proxy Service exposes. Defaults to the cluster server port so + // existing clients can re-target the Service name with no port change. + // +optional + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + Port int32 `json:"port,omitempty"` + + // HTTP path used for backend health probes. Defaults to `/health`. + // Set to `/health/ready` for the deeper chDB-aware readiness check. + // +optional + HealthPath string `json:"healthPath,omitempty"` + + // How long the proxy waits for a backend to come back before failing a + // request with 503. Bigger values mean rolling restarts are smoother but + // individual stuck requests sit longer. + // +optional + // +kubebuilder:default=10 + // +kubebuilder:validation:Minimum=0 + HoldTimeoutSecs int32 `json:"holdTimeoutSecs,omitempty"` + + // Cap on per-backend retries for one request. 0 disables retries. + // +optional + // +kubebuilder:default=2 + // +kubebuilder:validation:Minimum=0 + MaxRetries int32 `json:"maxRetries,omitempty"` + + // How long the proxy keeps serving in-flight requests after SIGTERM + // before exiting. Should comfortably exceed the longest expected query. + // +optional + // +kubebuilder:default=30 + // +kubebuilder:validation:Minimum=1 + ShutdownGraceSecs int32 `json:"shutdownGraceSecs,omitempty"` + + // Per-request budget the proxy allows for the upstream call. Defaults + // to ~ServerSpec.RequestTimeoutSecs. + // +optional + // +kubebuilder:validation:Minimum=1 + RequestTimeoutSecs int32 `json:"requestTimeoutSecs,omitempty"` + + // +optional + Resources corev1.ResourceRequirements `json:"resources,omitempty"` + + // Type of the proxy Service. Defaults to ClusterIP. Set NodePort/ + // LoadBalancer to expose externally. + // +optional + // +kubebuilder:validation:Enum=ClusterIP;NodePort;LoadBalancer + ServiceType corev1.ServiceType `json:"serviceType,omitempty"` + + // Explicit nodePort when ServiceType=NodePort. Required for kind clusters + // that pre-map a host port to a fixed nodePort. + // +optional + // +kubebuilder:validation:Minimum=30000 + // +kubebuilder:validation:Maximum=32767 + NodePort int32 `json:"nodePort,omitempty"` + + // +optional + PodAnnotations map[string]string `json:"podAnnotations,omitempty"` + + // +optional + PodLabels map[string]string `json:"podLabels,omitempty"` +} + // ClusterPhase represents the lifecycle phase of the cluster. // +kubebuilder:validation:Enum=Pending;Initializing;Running;Scaling;Upgrading;Failed type ClusterPhase string diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 593467a..f534186 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -433,6 +433,11 @@ func (in *HyperbytedbClusterSpec) DeepCopyInto(out *HyperbytedbClusterSpec) { *out = new(FailoverSpec) **out = **in } + if in.Proxy != nil { + in, out := &in.Proxy, &out.Proxy + *out = new(ProxySpec) + (*in).DeepCopyInto(*out) + } if in.PodAnnotations != nil { in, out := &in.PodAnnotations, &out.PodAnnotations *out = make(map[string]string, len(*in)) @@ -716,6 +721,46 @@ func (in *PersistentVolumeClaimSpec) DeepCopy() *PersistentVolumeClaimSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProxySpec) DeepCopyInto(out *ProxySpec) { + *out = *in + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]corev1.LocalObjectReference, len(*in)) + copy(*out, *in) + } + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } + in.Resources.DeepCopyInto(&out.Resources) + if in.PodAnnotations != nil { + in, out := &in.PodAnnotations, &out.PodAnnotations + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.PodLabels != nil { + in, out := &in.PodLabels, &out.PodLabels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProxySpec. +func (in *ProxySpec) DeepCopy() *ProxySpec { + if in == nil { + return nil + } + out := new(ProxySpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RateLimitSpec) DeepCopyInto(out *RateLimitSpec) { *out = *in diff --git a/config/crd/bases/hyperbytedb.hyperbytedb.io_hyperbytedbclusters.yaml b/config/crd/bases/hyperbytedb.hyperbytedb.io_hyperbytedbclusters.yaml index 899769b..45cc3b0 100644 --- a/config/crd/bases/hyperbytedb.hyperbytedb.io_hyperbytedbclusters.yaml +++ b/config/crd/bases/hyperbytedb.hyperbytedb.io_hyperbytedbclusters.yaml @@ -3309,6 +3309,181 @@ spec: additionalProperties: type: string type: object + proxy: + description: |- + Proxy deploys a stateless `hyperbytedb-proxy` Deployment in front of the + StatefulSet to absorb rolling restarts and drain events without + returning errors to clients. + properties: + enabled: + default: false + description: |- + When false, the operator does not create or reconcile any proxy + resources. Existing proxy Deployment/Service (if any) are left alone + so they can be cleaned up out-of-band. + type: boolean + healthPath: + description: |- + HTTP path used for backend health probes. Defaults to `/health`. + Set to `/health/ready` for the deeper chDB-aware readiness check. + type: string + holdTimeoutSecs: + default: 10 + description: |- + How long the proxy waits for a backend to come back before failing a + request with 503. Bigger values mean rolling restarts are smoother but + individual stuck requests sit longer. + format: int32 + minimum: 0 + type: integer + image: + default: hyperbytedb-proxy:latest + type: string + imagePullPolicy: + description: PullPolicy describes a policy for if/when to pull + a container image + type: string + imagePullSecrets: + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + maxRetries: + default: 2 + description: Cap on per-backend retries for one request. 0 disables + retries. + format: int32 + minimum: 0 + type: integer + nodePort: + description: |- + Explicit nodePort when ServiceType=NodePort. Required for kind clusters + that pre-map a host port to a fixed nodePort. + format: int32 + maximum: 32767 + minimum: 30000 + type: integer + podAnnotations: + additionalProperties: + type: string + type: object + podLabels: + additionalProperties: + type: string + type: object + port: + description: |- + Port the proxy Service exposes. Defaults to the cluster server port so + existing clients can re-target the Service name with no port change. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + replicas: + default: 2 + format: int32 + minimum: 1 + type: integer + requestTimeoutSecs: + description: |- + Per-request budget the proxy allows for the upstream call. Defaults + to ~ServerSpec.RequestTimeoutSecs. + format: int32 + minimum: 1 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + serviceType: + description: |- + Type of the proxy Service. Defaults to ClusterIP. Set NodePort/ + LoadBalancer to expose externally. + enum: + - ClusterIP + - NodePort + - LoadBalancer + type: string + shutdownGraceSecs: + default: 30 + description: |- + How long the proxy keeps serving in-flight requests after SIGTERM + before exiting. Should comfortably exceed the longest expected query. + format: int32 + minimum: 1 + type: integer + required: + - enabled + type: object rateLimit: description: RateLimitSpec controls per-endpoint request rate limiting. properties: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b3717c1..853d0a6 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -38,6 +38,7 @@ rules: - apiGroups: - apps resources: + - deployments - statefulsets verbs: - create diff --git a/dist/install.yaml b/dist/install.yaml index fd4f13b..66f0139 100644 --- a/dist/install.yaml +++ b/dist/install.yaml @@ -236,7 +236,8 @@ spec: name: v1alpha1 schema: openAPIV3Schema: - description: HyperbytedbCluster is the Schema for the hyperbytedbclusters API. + description: HyperbytedbCluster is the Schema for the hyperbytedbclusters + API. properties: apiVersion: description: |- @@ -3163,6 +3164,19 @@ spec: - enabled - maxReplicas type: object + cardinality: + description: CardinalitySpec configures cardinality limits enforced + by hyperbytedb. + properties: + maxMeasurementsPerDatabase: + format: int64 + minimum: 0 + type: integer + maxTagValuesPerMeasurement: + format: int64 + minimum: 0 + type: integer + type: object chdb: properties: poolSize: @@ -3201,10 +3215,79 @@ spec: default: 1000 format: int32 type: integer + replicateReceiverQueueDepth: + description: Bounded apply queue on the replicate receiver. + format: int32 + minimum: 0 + type: integer + replication: + description: Per-node, per-write replication mode and tuning. + properties: + ackTimeoutMs: + default: 5000 + description: |- + Worst-case latency budget (ms) for sync_quorum writes. On timeout the + coordinator returns 504; in-flight peer tasks keep running and unacked + peers fall back to hinted handoff. + format: int64 + minimum: 0 + type: integer + mode: + default: async + description: |- + Replication mode. "async" is fire-and-forget HTTP fan-out (default, + preserves today's behavior). "sync_quorum" awaits W-of-N peer acks + before returning to the client. + enum: + - async + - sync_quorum + type: string + syncQuorum: + description: SyncQuorumSpec configures the sync_quorum replication + mode. + properties: + minAcks: + anyOf: + - type: integer + - type: string + description: |- + Number of peer acks required for sync_quorum. Either the string + "majority" (resolved at request time against current active peers) or an + explicit integer count. The local WAL append always happens first, so + self-durability is implicit and the local node is never counted toward + the quorum. + x-kubernetes-int-or-string: true + type: object + type: object + replicationMaxCoalesceBodyBytes: + description: Max bytes for coalescing consecutive WAL batches + with the same db/rp/precision. + format: int64 + minimum: 0 + type: integer + replicationMaxInflightBatches: + description: Max concurrent outbound replication fan-out rounds + (token bucket). + format: int32 + minimum: 0 + type: integer replicationMaxRetries: default: 5 format: int32 type: integer + replicationQueueDepth: + description: Bounded outbound replication queue depth (ingest-sized + batches). + format: int32 + minimum: 0 + type: integer + replicationTruncateStalePeerMultiplier: + description: |- + When >0, peers with ack 0 and stale heartbeats (older than + heartbeatIntervalSecs * multiplier) are omitted from the WAL truncate barrier. + format: int64 + minimum: 0 + type: integer syncMaxConcurrentFiles: default: 4 format: int32 @@ -3239,6 +3322,21 @@ spec: type: object compaction: properties: + bucketDuration: + description: |- + Compaction time bucket. "1h" (hourly, default) or "1d" (daily). Daily + buckets merge all 24 hourly files, reducing file count for wide-range queries. + enum: + - 1h + - 1d + - 24h + type: string + compactAllMaxInflight: + description: Max concurrent measurement compactions when running + compact_all. + format: int32 + minimum: 0 + type: integer enabled: default: true type: boolean @@ -3246,14 +3344,33 @@ spec: default: 300 format: int32 type: integer + maxRepairChecksPerCycle: + description: |- + Max bucket-hash comparisons (and repair attempts) per compaction tick for + membership self-repair. + format: int32 + minimum: 0 + type: integer minFilesToCompact: default: 4 format: int32 type: integer + selfRepairEnabled: + description: |- + When true, periodically compare each active peer's manifest against local + bucket hashes and fetch divergent or missing slices. + type: boolean targetFileSizeMb: default: 256 format: int32 type: integer + verifiedCompactionAgeSecs: + description: |- + Minimum age (seconds) before per-node files are hash-verified and merged + into a single compacted file. + format: int64 + minimum: 0 + type: integer type: object failover: description: FailoverSpec controls automatic failure detection and @@ -3284,14 +3401,52 @@ spec: default: 10 format: int32 type: integer + maxPointsPerBatch: + description: Maximum points buffered per measurement before forcing + a flush. 0 = unlimited. + format: int32 + minimum: 0 + type: integer timeBucketDuration: default: 1h type: string + walBatchDelayUs: + description: 'WAL group-commit: max microseconds to wait for more + entries before flushing.' + format: int64 + minimum: 0 + type: integer + walBatchSize: + description: 'WAL group-commit: max entries to coalesce per write + batch. 0 = disabled.' + format: int32 + minimum: 0 + type: integer walSizeThresholdMb: default: 64 format: int32 type: integer type: object + hintedHandoff: + description: |- + HintedHandoffSpec configures the hinted-handoff queue used to retry writes + against peers that were temporarily unreachable. + properties: + enabled: + type: boolean + maxHintAgeSecs: + description: Hints older than this (seconds) are discarded on + drain. + format: int64 + minimum: 0 + type: integer + maxHintsPerPeer: + description: Maximum queued hints per unreachable peer before + oldest are dropped. + format: int64 + minimum: 0 + type: integer + type: object image: default: hyperbytedb:latest type: string @@ -3357,6 +3512,193 @@ spec: additionalProperties: type: string type: object + proxy: + description: |- + Proxy deploys a stateless `hyperbytedb-proxy` Deployment in front of the + StatefulSet to absorb rolling restarts and drain events without + returning errors to clients. + properties: + enabled: + default: false + description: |- + When false, the operator does not create or reconcile any proxy + resources. Existing proxy Deployment/Service (if any) are left alone + so they can be cleaned up out-of-band. + type: boolean + healthPath: + description: |- + HTTP path used for backend health probes. Defaults to `/health`. + Set to `/health/ready` for the deeper chDB-aware readiness check. + type: string + holdTimeoutSecs: + default: 10 + description: |- + How long the proxy waits for a backend to come back before failing a + request with 503. Bigger values mean rolling restarts are smoother but + individual stuck requests sit longer. + format: int32 + minimum: 0 + type: integer + image: + default: hyperbytedb-proxy:latest + type: string + imagePullPolicy: + description: PullPolicy describes a policy for if/when to pull + a container image + type: string + imagePullSecrets: + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + maxRetries: + default: 2 + description: Cap on per-backend retries for one request. 0 disables + retries. + format: int32 + minimum: 0 + type: integer + nodePort: + description: |- + Explicit nodePort when ServiceType=NodePort. Required for kind clusters + that pre-map a host port to a fixed nodePort. + format: int32 + maximum: 32767 + minimum: 30000 + type: integer + podAnnotations: + additionalProperties: + type: string + type: object + podLabels: + additionalProperties: + type: string + type: object + port: + description: |- + Port the proxy Service exposes. Defaults to the cluster server port so + existing clients can re-target the Service name with no port change. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + replicas: + default: 2 + format: int32 + minimum: 1 + type: integer + requestTimeoutSecs: + description: |- + Per-request budget the proxy allows for the upstream call. Defaults + to ~ServerSpec.RequestTimeoutSecs. + format: int32 + minimum: 1 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + serviceType: + description: |- + Type of the proxy Service. Defaults to ClusterIP. Set NodePort/ + LoadBalancer to expose externally. + enum: + - ClusterIP + - NodePort + - LoadBalancer + type: string + shutdownGraceSecs: + default: 30 + description: |- + How long the proxy keeps serving in-flight requests after SIGTERM + before exiting. Should comfortably exceed the longest expected query. + format: int32 + minimum: 1 + type: integer + required: + - enabled + type: object + rateLimit: + description: RateLimitSpec controls per-endpoint request rate limiting. + properties: + enabled: + type: boolean + maxRequestsPerSecond: + description: Maximum requests per second per endpoint (/write, + /query). 0 = unlimited. + format: int64 + minimum: 0 + type: integer + type: object replicas: default: 1 format: int32 @@ -3427,6 +3769,11 @@ spec: default: 26214400 format: int64 type: integer + maxConcurrentQueries: + description: Maximum concurrent /query requests. 0 = unlimited. + format: int32 + minimum: 0 + type: integer port: default: 8086 format: int32 @@ -3468,6 +3815,21 @@ spec: - enabled type: object type: object + statementSummary: + description: |- + StatementSummarySpec controls collection of per-statement execution stats + exposed via /debug/statement_summary. + properties: + enabled: + type: boolean + maxEntries: + description: |- + Maximum number of distinct statements tracked. Oldest entries are evicted + when the limit is exceeded. + format: int32 + minimum: 0 + type: integer + type: object storage: properties: backend: @@ -3905,7 +4267,8 @@ spec: name: v1alpha1 schema: openAPIV3Schema: - description: HyperbytedbRestore is the Schema for the hyperbytedbrestores API. + description: HyperbytedbRestore is the Schema for the hyperbytedbrestores + API. properties: apiVersion: description: |- @@ -4357,6 +4720,7 @@ rules: - apiGroups: - apps resources: + - deployments - statefulsets verbs: - create @@ -4423,18 +4787,6 @@ rules: - get - patch - update -- apiGroups: - - monitoring.coreos.com - resources: - - servicemonitors - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - apiGroups: - policy resources: diff --git a/internal/controller/hyperbytedbcluster_controller.go b/internal/controller/hyperbytedbcluster_controller.go index 0244316..e5f9b27 100644 --- a/internal/controller/hyperbytedbcluster_controller.go +++ b/internal/controller/hyperbytedbcluster_controller.go @@ -60,6 +60,7 @@ type HyperbytedbClusterReconciler struct { // +kubebuilder:rbac:groups=hyperbytedb.hyperbytedb.io,resources=hyperbytedbclusters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=hyperbytedb.hyperbytedb.io,resources=hyperbytedbclusters/finalizers,verbs=update // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete @@ -211,6 +212,15 @@ func (r *HyperbytedbClusterReconciler) Reconcile(ctx context.Context, req ctrl.R log.Error(err, "Failed to reconcile HPA") } + // 11b. Proxy (optional). When disabled (default) we don't reconcile — + // any pre-existing proxy resources are left intact so users can clean + // them up out-of-band. + if hyperbytedb.ProxyEnabled(cluster) { + if err := r.reconcileProxy(ctx, cluster); err != nil { + log.Error(err, "Failed to reconcile proxy") + } + } + // 12. Collect member statuses memberStatuses := r.Members.CollectMemberStatuses(ctx, cluster, cluster.Namespace, replicas) cluster.Status.Members = memberStatuses @@ -626,6 +636,84 @@ func (r *HyperbytedbClusterReconciler) reconcileHPA(ctx context.Context, cluster return r.Update(ctx, existing) } +// ---------- Proxy ---------- + +// reconcileProxy creates/updates the optional `hyperbytedb-proxy` Deployment +// + Service that fronts the StatefulSet. Only invoked when +// spec.proxy.enabled is true. The proxy itself discovers backend pods via +// the existing headless Service, so this reconciler does not need to wire +// peer addresses explicitly. +func (r *HyperbytedbClusterReconciler) reconcileProxy(ctx context.Context, cluster *hyperbytedbv1alpha1.HyperbytedbCluster) error { + desiredSvc := hyperbytedb.BuildProxyService(cluster) + if err := controllerutil.SetControllerReference(cluster, desiredSvc, r.Scheme); err != nil { + return err + } + existingSvc := &corev1.Service{} + err := r.Get(ctx, types.NamespacedName{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc) + switch { + case apierrors.IsNotFound(err): + if err := r.Create(ctx, desiredSvc); err != nil { + return fmt.Errorf("create proxy service: %w", err) + } + case err != nil: + return fmt.Errorf("get proxy service: %w", err) + default: + // Type changes (e.g. ClusterIP↔NodePort) require recreating the Service; + // we only patch in-place when type matches to avoid unnecessary churn. + existingSvc.Spec.Ports = desiredSvc.Spec.Ports + existingSvc.Spec.Selector = desiredSvc.Spec.Selector + if existingSvc.Spec.Type == desiredSvc.Spec.Type { + if err := r.Update(ctx, existingSvc); err != nil { + return fmt.Errorf("update proxy service: %w", err) + } + } else { + if err := r.Delete(ctx, existingSvc); err != nil { + return fmt.Errorf("delete proxy service for type change: %w", err) + } + if err := r.Create(ctx, desiredSvc); err != nil { + return fmt.Errorf("recreate proxy service: %w", err) + } + } + } + + desiredDep := hyperbytedb.BuildProxyDeployment(cluster) + if err := controllerutil.SetControllerReference(cluster, desiredDep, r.Scheme); err != nil { + return err + } + existingDep := &appsv1.Deployment{} + err = r.Get(ctx, types.NamespacedName{Name: desiredDep.Name, Namespace: desiredDep.Namespace}, existingDep) + if apierrors.IsNotFound(err) { + r.Recorder.Event(cluster, corev1.EventTypeNormal, "CreatingProxy", "Creating hyperbytedb-proxy Deployment") + return r.Create(ctx, desiredDep) + } + if err != nil { + return fmt.Errorf("get proxy deployment: %w", err) + } + + // Preserve template-level annotations set by external tooling (most + // notably `kubectl.kubernetes.io/restartedAt`, which `kubectl rollout + // restart` writes to force a new ReplicaSet). Without this merge our + // next reconcile would overwrite Spec.Template wholesale, drop that + // annotation, and the Deployment would scale the new ReplicaSet back + // to zero — silently undoing the rollout. + mergedTemplateAnnotations := map[string]string{} + for k, v := range existingDep.Spec.Template.Annotations { + mergedTemplateAnnotations[k] = v + } + for k, v := range desiredDep.Spec.Template.Annotations { + mergedTemplateAnnotations[k] = v + } + if len(mergedTemplateAnnotations) > 0 { + desiredDep.Spec.Template.Annotations = mergedTemplateAnnotations + } + + existingDep.Spec.Replicas = desiredDep.Spec.Replicas + existingDep.Spec.Selector = desiredDep.Spec.Selector + existingDep.Spec.Strategy = desiredDep.Spec.Strategy + existingDep.Spec.Template = desiredDep.Spec.Template + return r.Update(ctx, existingDep) +} + // ---------- Cluster Membership Reconciliation (API-driven) ---------- // podView is a per-pod snapshot used during membership reconciliation. diff --git a/internal/hyperbytedb/proxy.go b/internal/hyperbytedb/proxy.go new file mode 100644 index 0000000..68c37c7 --- /dev/null +++ b/internal/hyperbytedb/proxy.go @@ -0,0 +1,273 @@ +package hyperbytedb + +import ( + "fmt" + "maps" + "strconv" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" + + v1alpha1 "github.com/hyperbyte-cloud/hyperbytedb-operator/api/v1alpha1" +) + +// ProxyDeploymentName / ProxyServiceName follow the StatefulSet/headless-svc +// pattern: `-proxy` for the Deployment, `-proxy` for the +// stable client-facing Service. The original `` headless Service +// stays in place so the proxy can still discover backend pods via DNS. +func ProxyDeploymentName(cluster *v1alpha1.HyperbytedbCluster) string { + return cluster.Name + "-proxy" +} + +func ProxyServiceName(cluster *v1alpha1.HyperbytedbCluster) string { + return cluster.Name + "-proxy" +} + +// ProxyEnabled reports whether the user opted-in to proxy reconciliation. +func ProxyEnabled(cluster *v1alpha1.HyperbytedbCluster) bool { + return cluster.Spec.Proxy != nil && cluster.Spec.Proxy.Enabled +} + +// proxyLabels intentionally uses `name=hyperbytedb-proxy` instead of +// `name=hyperbytedb`. The headless Service selector is just CommonLabels +// (`name=hyperbytedb`, `instance=`, `managed-by=hyperbytedb-operator`) +// and a StatefulSet/Service selector is immutable, so we cannot retro-fit a +// `component=database` filter onto the existing headless selector. By +// flipping `name` on the proxy side we guarantee the headless DNS only +// resolves to database pods — without that, the proxy resolves itself, +// forwards requests to itself, and infinitely recurses until the pod OOMs. +func proxyLabels(cluster *v1alpha1.HyperbytedbCluster) map[string]string { + return map[string]string{ + "app.kubernetes.io/name": "hyperbytedb-proxy", + "app.kubernetes.io/instance": cluster.Name, + "app.kubernetes.io/managed-by": "hyperbytedb-operator", + "app.kubernetes.io/component": "proxy", + } +} + +func proxyPort(cluster *v1alpha1.HyperbytedbCluster) int32 { + if cluster.Spec.Proxy != nil && cluster.Spec.Proxy.Port > 0 { + return cluster.Spec.Proxy.Port + } + return ServerPort(cluster) +} + +// BuildProxyService is the stable client entry point for the cluster when +// proxy mode is enabled. Defaults to ClusterIP; supports NodePort/LoadBalancer +// for clusters that need external exposure (kind, on-prem, etc.). +func BuildProxyService(cluster *v1alpha1.HyperbytedbCluster) *corev1.Service { + spec := cluster.Spec.Proxy + port := proxyPort(cluster) + + svcType := corev1.ServiceTypeClusterIP + if spec != nil && spec.ServiceType != "" { + svcType = spec.ServiceType + } + + svcPort := corev1.ServicePort{ + Name: "http", + Port: port, + TargetPort: intstr.FromString("http"), + Protocol: corev1.ProtocolTCP, + } + if svcType == corev1.ServiceTypeNodePort && spec != nil && spec.NodePort > 0 { + svcPort.NodePort = spec.NodePort + } + + return &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: ProxyServiceName(cluster), + Namespace: cluster.Namespace, + Labels: proxyLabels(cluster), + }, + Spec: corev1.ServiceSpec{ + Type: svcType, + Selector: proxyLabels(cluster), + Ports: []corev1.ServicePort{svcPort}, + }, + } +} + +// BuildProxyDeployment renders the stateless proxy Deployment that fans out +// in front of the StatefulSet. The proxy discovers pods via the existing +// headless Service so we don't need any extra wiring as the cluster scales. +func BuildProxyDeployment(cluster *v1alpha1.HyperbytedbCluster) *appsv1.Deployment { + spec := cluster.Spec.Proxy + if spec == nil { + // Defensive: callers should gate on ProxyEnabled, but never panic. + spec = &v1alpha1.ProxySpec{} + } + + replicas := int32(2) + if spec.Replicas != nil { + replicas = *spec.Replicas + } + + image := spec.Image + if image == "" { + image = "hyperbytedb-proxy:latest" + } + pullPolicy := spec.ImagePullPolicy + if pullPolicy == "" { + pullPolicy = corev1.PullIfNotPresent + } + + port := proxyPort(cluster) + backendPort := ServerPort(cluster) + backendService := fmt.Sprintf("%s.%s.svc.cluster.local", + HeadlessServiceName(cluster), cluster.Namespace) + + holdSecs := int32(10) + if spec.HoldTimeoutSecs > 0 { + holdSecs = spec.HoldTimeoutSecs + } + maxRetries := int32(2) + if spec.MaxRetries >= 0 && spec.MaxRetries != 0 { + maxRetries = spec.MaxRetries + } + graceSecs := int32(30) + if spec.ShutdownGraceSecs > 0 { + graceSecs = spec.ShutdownGraceSecs + } + requestTimeout := cluster.Spec.Server.RequestTimeoutSecs + if requestTimeout <= 0 { + requestTimeout = 60 + } + if spec.RequestTimeoutSecs > 0 { + requestTimeout = spec.RequestTimeoutSecs + } + healthPath := spec.HealthPath + if healthPath == "" { + healthPath = "/health" + } + + env := []corev1.EnvVar{ + {Name: "HYPERBYTEDB_PROXY_LISTEN", Value: fmt.Sprintf("0.0.0.0:%d", port)}, + {Name: "HYPERBYTEDB_PROXY_BACKEND_SERVICE", Value: backendService}, + {Name: "HYPERBYTEDB_PROXY_BACKEND_PORT", Value: strconv.Itoa(int(backendPort))}, + {Name: "HYPERBYTEDB_PROXY_HEALTH_PATH", Value: healthPath}, + {Name: "HYPERBYTEDB_PROXY_HOLD_TIMEOUT_SECS", Value: strconv.Itoa(int(holdSecs))}, + {Name: "HYPERBYTEDB_PROXY_MAX_RETRIES", Value: strconv.Itoa(int(maxRetries))}, + {Name: "HYPERBYTEDB_PROXY_SHUTDOWN_GRACE_SECS", Value: strconv.Itoa(int(graceSecs))}, + {Name: "HYPERBYTEDB_PROXY_REQUEST_TIMEOUT_SECS", Value: strconv.Itoa(int(requestTimeout))}, + // Downward-API: defense-in-depth so the proxy never adds its own pod + // IP to the backend pool even if a future label refactor accidentally + // makes the headless Service select proxy pods again. + { + Name: "HYPERBYTEDB_PROXY_SELF_IP", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "status.podIP", + }, + }, + }, + } + if cluster.Spec.Logging.Format == "json" { + env = append(env, corev1.EnvVar{Name: "LOG_FORMAT", Value: "json"}) + } + + podLabels := make(map[string]string) + maps.Copy(podLabels, proxyLabels(cluster)) + maps.Copy(podLabels, spec.PodLabels) + + podAnnotations := map[string]string{} + maps.Copy(podAnnotations, spec.PodAnnotations) + + container := corev1.Container{ + Name: "proxy", + Image: image, + ImagePullPolicy: pullPolicy, + Env: env, + Ports: []corev1.ContainerPort{ + { + Name: "http", + ContainerPort: port, + Protocol: corev1.ProtocolTCP, + }, + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("http"), + }, + }, + InitialDelaySeconds: 2, + PeriodSeconds: 10, + TimeoutSeconds: 2, + }, + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("http"), + }, + }, + InitialDelaySeconds: 1, + PeriodSeconds: 2, + TimeoutSeconds: 2, + }, + Resources: spec.Resources, + } + + // Spread proxy replicas across nodes for HA. Cheap default — users can + // override via spec.Proxy.PodLabels + topology constraints applied + // out-of-band if they need something tighter. + podSpec := corev1.PodSpec{ + // Long enough to absorb in-flight queries during a proxy restart. + // The proxy itself enforces shutdown_grace internally; this just + // gives the kubelet headroom before SIGKILL. + TerminationGracePeriodSeconds: ptr.To(int64(graceSecs + 10)), + ImagePullSecrets: spec.ImagePullSecrets, + Containers: []corev1.Container{container}, + Affinity: &corev1.Affinity{ + PodAntiAffinity: &corev1.PodAntiAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{ + { + Weight: 100, + PodAffinityTerm: corev1.PodAffinityTerm{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: proxyLabels(cluster), + }, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + }, + }, + } + + maxUnavailable := intstr.FromInt(0) + maxSurge := intstr.FromInt(1) + + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: ProxyDeploymentName(cluster), + Namespace: cluster.Namespace, + Labels: proxyLabels(cluster), + }, + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.To(replicas), + Selector: &metav1.LabelSelector{ + MatchLabels: proxyLabels(cluster), + }, + Strategy: appsv1.DeploymentStrategy{ + Type: appsv1.RollingUpdateDeploymentStrategyType, + RollingUpdate: &appsv1.RollingUpdateDeployment{ + MaxUnavailable: &maxUnavailable, + MaxSurge: &maxSurge, + }, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: podLabels, + Annotations: podAnnotations, + }, + Spec: podSpec, + }, + }, + } +}