From 47a4526c8b5e25aa10e126e7ded2f7d0509fbabe Mon Sep 17 00:00:00 2001 From: d062260 Date: Thu, 30 Apr 2026 17:12:35 +0200 Subject: [PATCH 1/3] wip gnosis intro --- Makefile | 7 + Makefile.maker.yaml | 7 + docs/content/architecture.doc.yaml | 48 +++ docs/content/getting-started.doc.yaml | 48 +++ docs/generated/.gitkeep | 0 docs/generated/guides/architecture.md | 357 +++++++++++++++++++++++ docs/generated/guides/getting-started.md | 276 ++++++++++++++++++ docs/generated/guides/index.md | 11 + docs/generated/index.md | 15 + gnosis.yaml | 22 ++ 10 files changed, 791 insertions(+) create mode 100644 docs/content/architecture.doc.yaml create mode 100644 docs/content/getting-started.doc.yaml create mode 100644 docs/generated/.gitkeep create mode 100644 docs/generated/guides/architecture.md create mode 100644 docs/generated/guides/getting-started.md create mode 100644 docs/generated/guides/index.md create mode 100644 docs/generated/index.md create mode 100644 gnosis.yaml diff --git a/Makefile b/Makefile index 19ebcd915..c91db66b4 100644 --- a/Makefile +++ b/Makefile @@ -191,6 +191,13 @@ run-docs: @docker build -t $(DOCS_IMG) -f docs/Dockerfile docs --load @docker run --rm --init -p 5173:5173 -v $(ROOT_DIR)/docs:/workspace -v /workspace/node_modules $(DOCS_IMG) +install-gnosis: FORCE + @if ! 
hash gnosis 2>/dev/null; then printf "\e[1;36m>> Installing gnosis...\e[0m\n"; go install github.com/cobaltcore-dev/gnosis/cmd/gnosis@latest; fi + +docs-generate: install-gnosis + @printf "\e[1;36m>> gnosis generate\e[0m\n" + @gnosis generate + docs: install-crd-ref-docs crd-ref-docs --source-path=./api --config=./hack/api-reference/config.yaml --renderer=markdown --output-path=./docs/api-reference/index.md @$(SED) -i \ diff --git a/Makefile.maker.yaml b/Makefile.maker.yaml index 53143db03..aaf03664d 100644 --- a/Makefile.maker.yaml +++ b/Makefile.maker.yaml @@ -233,6 +233,13 @@ verbatim: | @docker build -t $(DOCS_IMG) -f docs/Dockerfile docs --load @docker run --rm --init -p 5173:5173 -v $(ROOT_DIR)/docs:/workspace -v /workspace/node_modules $(DOCS_IMG) + install-gnosis: FORCE + @if ! hash gnosis 2>/dev/null; then printf "\e[1;36m>> Installing gnosis...\e[0m\n"; go install github.com/cobaltcore-dev/gnosis/cmd/gnosis@latest; fi + + docs-generate: install-gnosis + @printf "\e[1;36m>> gnosis generate\e[0m\n" + @gnosis generate + docs: install-crd-ref-docs crd-ref-docs --source-path=./api --config=./hack/api-reference/config.yaml --renderer=markdown --output-path=./docs/api-reference/index.md @$(SED) -i \ diff --git a/docs/content/architecture.doc.yaml b/docs/content/architecture.doc.yaml new file mode 100644 index 000000000..82b8aba16 --- /dev/null +++ b/docs/content/architecture.doc.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +# SPDX-License-Identifier: Apache-2.0 + +kind: guide +title: "Architecture" +description: "How Network Operator reconciles declarative CRDs into device configurations" +prompt: | + Explain the network-operator architecture from a user's perspective. Cover: + 1. The reconciliation model: user applies CRD -> controller detects change -> builds device config -> pushes to device + 2. How the core CRDs (platform-agnostic) relate to platform-specific CRDs (e.g. NX-OS) + 3. 
How Device registration and credentials work + 4. The role of status conditions and finalizers + 5. How multi-device and multi-vendor support is structured + 6. EVPN/VXLAN fabric provisioning as a concrete example + Keep the tone practical. Operators should understand how their YAML manifests + translate into network device configuration. +context: | + Network Operator is a set of Kubernetes controllers that reconcile CRD + specs into network device configurations. It follows standard controller-runtime + patterns: watch CRDs, compare desired vs actual, push diffs to devices. + + Architecture layers: + - API layer (api/): CRD type definitions (core + platform-specific) + - Controller layer (internal/controller/): reconciliation logic per CRD + - Provider layer: device communication (NX-API for NX-OS, gNMI planned) + + Core vs Platform-specific: + - Core CRDs (api/core/v1alpha1/) define the abstract intent (e.g. "Interface") + - Platform CRDs (api/cisco/nx/v1alpha1/) add vendor-specific knobs + - Controllers translate core intent into platform-native configuration + + Reconciliation flow: + 1. User applies a CRD manifest (e.g. Interface spec) + 2. Controller watches for changes via controller-runtime + 3. Controller resolves the target Device reference + 4. Controller builds the platform-native payload (e.g. NX-API JSON) + 5. Controller pushes config to device and updates status conditions + 6. 
Finalizers ensure cleanup on deletion + + Key patterns: + - DeviceRef: all config CRDs reference a Device by name + - Status conditions: Ready, Degraded, Progressing + - Pausing: CRDs support a paused field to halt reconciliation + - Ownership: child resources are owned by their parent Device +sources: + - api/core/v1alpha1 + - api/cisco/nx/v1alpha1 + - internal/controller diff --git a/docs/content/getting-started.doc.yaml b/docs/content/getting-started.doc.yaml new file mode 100644 index 000000000..24e0e543b --- /dev/null +++ b/docs/content/getting-started.doc.yaml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +# SPDX-License-Identifier: Apache-2.0 + +kind: guide +title: "Getting Started" +description: "Deploy Network Operator and configure your first network device" +prompt: | + Write a step-by-step getting started guide for network-operator. Cover: + 1. Prerequisites (Kubernetes cluster, kubectl, Helm) + 2. Installing network-operator via Helm chart + 3. Registering a network device using the Device CRD + 4. Applying a basic Interface configuration + 5. Verifying the configuration was pushed to the device + 6. Next steps (BGP, VLANs, routing policies) + Keep examples realistic for a data center operator provisioning Cisco NX-OS switches. + Use the CRD field names from the context below exactly as written. +context: | + Network Operator is a Kubernetes-native platform for automating multi-vendor + data center network devices. It uses CRDs to declaratively manage device + configurations and reconciles them against the actual device state. + + Supported platforms: Cisco NX-OS (primary), OpenConfig (planned). 
+ + Core CRDs (api/core/v1alpha1): + - Device: represents a managed network device (address, credentials, platform) + - Interface: ethernet, loopback, port-channel interfaces + - BGP / BGPPeer: BGP routing configuration and neighbor peers + - VRF: virtual routing and forwarding instances + - VLAN: VLAN definitions + - ACL: access control lists + - OSPF / ISIS: IGP routing protocols + - NVE / EVPNInstance: VXLAN overlay and EVPN configuration + - PrefixSet / RoutingPolicy: route filtering and policy + - NTP / DNS / Syslog / SNMP: device services + - DHCPRelay / LLDP / Certificate / Banner / User: misc device config + + Platform-specific CRDs (api/cisco/nx/v1alpha1): + - BGPConfig: NX-OS-specific BGP address-family configuration + - InterfaceConfig: NX-OS-specific interface settings + - NVEConfig / LLDPConfig / System: NX-OS platform details + - BorderGateway / VPCDomain: NX-OS multisite and vPC + + Helm chart: deploy/helm/network-operator + Controller: manages reconciliation loops for each CRD type +sources: + - api/core/v1alpha1 + - api/cisco/nx/v1alpha1 + - deploy/helm/network-operator diff --git a/docs/generated/.gitkeep b/docs/generated/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/docs/generated/guides/architecture.md b/docs/generated/guides/architecture.md new file mode 100644 index 000000000..68720b1c3 --- /dev/null +++ b/docs/generated/guides/architecture.md @@ -0,0 +1,357 @@ +--- +title: Architecture +description: How Network Operator reconciles declarative CRDs into device configurations +gnosis_hash: 76f0627f +body_hash: a46e9e82 +--- + +# Architecture + +Network Operator is a set of Kubernetes controllers that translate CRD specifications into live network device configuration. This guide explains how the system is structured, how your YAML manifests become device commands, and how the operator handles multi-device, multi-vendor environments. 
+ +## The Reconciliation Model + +The core interaction pattern is straightforward: you describe the desired state of a network resource in a CRD manifest, apply it to Kubernetes, and the operator takes responsibility for making the device match that description. + +The reconciliation loop works as follows: + +1. **You apply a manifest.** For example, you create a `VLAN` resource describing VLAN 100 on a specific device. +2. **The controller detects the change.** Each CRD type has a dedicated controller built on controller-runtime. The controller watches for create, update, and delete events on its resource kind. +3. **The controller resolves the target device.** Every configuration CRD carries a `deviceRef` field (a `LocalObjectReference`) that names the `Device` resource in the same namespace. The controller reads the `Device` to retrieve the management endpoint and credentials. +4. **The controller builds the platform-native payload.** For NX-OS targets this is an NX-API JSON body. The controller translates the abstract CRD fields into the exact structures the device API expects. +5. **The controller pushes the configuration.** The payload is sent to the device's management address defined in `Device.spec.endpoint.address`. +6. **The controller updates status conditions.** After the push, the controller writes the outcome back to `status.conditions` on the resource. If something goes wrong, the condition reflects that; if the device accepted the configuration, the resource transitions to a ready state. + +This loop is level-triggered, not event-driven: the controller will re-reconcile on any relevant change and will retry on failure, converging toward the desired state over time. + +### Pausing Reconciliation + +The `Device` resource exposes a `spec.paused` boolean. When set to `true`, controllers stop processing the device and all resources that reference it. 
This is useful during maintenance windows or when you need to manually intervene on a device without the operator fighting your changes. + +## Core CRDs and Platform-Specific CRDs + +The API is split into two layers that work together. + +### Core CRDs (Platform-Agnostic Intent) + +Core CRDs live under `api/core/v1alpha1` and describe *what* you want without prescribing vendor-specific behavior. Examples include: + +- `Interface` — defines interface type, admin state, IP addressing, switchport mode, VRF membership, BFD, and MTU. +- `VLAN` — defines a VLAN ID, name, and admin state. +- `BGP` / `BGPPeer` — defines a BGP instance and its peers, address families, and route policies. +- `VRF` — defines a VRF name, VNI, route distinguisher, and route targets. +- `EVPNInstance` — defines an EVPN instance with VNI, type (bridged or routed), route targets, and VLAN reference. +- `NetworkVirtualizationEdge` — defines the NVE (VTEP) endpoint including source interface, anycast gateway, and host reachability method. + +These types are sufficient to express intent for most network configurations. The fields map to concepts that are consistent across vendors. + +### Platform-Specific CRDs (Vendor Knobs) + +When a vendor exposes controls that have no meaningful cross-platform equivalent, a separate platform config CRD carries those fields. Examples for NX-OS include: + +- `BGPConfig` — adds NX-OS-specific BGP options such as `advertisePIP` for EVPN and `exportGatewayIP` for symmetric IRB. +- `LLDPConfig` — adds NX-OS `initDelay` and `holdTime` timers. +- `ManagementAccessConfig` — adds NX-OS VTY console timeout and SSH ACL name. +- `NetworkVirtualizationEdgeConfig` — adds NX-OS NVE options such as `advertiseVirtualMAC`, `holdDownTime`, and `infraVLANs`. +- `InterfaceConfig` — adds NX-OS spanning-tree port type, BPDU guard/filter, buffer boost, and LACP vPC convergence settings. 
+- `VPCDomain` — configures the Cisco vPC domain including peer-link, keepalive, role priority, and auto-recovery. + +### Linking Core and Platform CRDs + +The link between a core CRD and its platform-specific extension is the `providerConfigRef` field present on every core resource spec. This is a `TypedLocalObjectReference` that carries the `apiVersion`, `kind`, and `name` of the platform config object: + +```yaml +providerConfigRef: + apiVersion: cisco.nx/v1alpha1 + kind: InterfaceConfig + name: eth1-0-config +``` + +When the controller reconciles a core resource, it checks for a `providerConfigRef`. If present, it reads the referenced platform config and merges its vendor-specific fields into the configuration payload before pushing to the device. + +## Device Registration and Credentials + +Every configuration resource in the operator is scoped to a `Device`. The `Device` CRD is the anchor for all device-level state and connectivity information. + +### Defining a Device + +A minimal `Device` looks like: + +```yaml +apiVersion: core/v1alpha1 +kind: Device +metadata: + name: leaf-01 + namespace: network +spec: + endpoint: + address: "192.0.2.10:443" + secretRef: + name: leaf-01-credentials +``` + +The `endpoint.address` is the management IP and port. The `endpoint.secretRef` points to a Kubernetes `Secret` of type `kubernetes.io/basic-auth` containing `username` and `password` keys. The secret is read by the controller at reconciliation time; credentials are never stored in the CRD itself. + +### TLS + +For gRPC-based transports, `endpoint.tls` carries the CA certificate reference and an optional client certificate for mutual TLS: + +```yaml +endpoint: + tls: + ca: + key: ca.crt + certificate: + secretRef: + name: leaf-01-mtls +``` + +### Device Status + +After the operator connects to a device, it populates `Device.status` with discovered information: `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, `lastRebootTime`, and a `ports` list. 
The `portSummary` field provides a quick human-readable count grouped by speed (e.g., `"2/4 (10g), 4/64 (100g)"`). This information is read-only and reflects what the operator observed from the device. + +### Device Provisioning + +For zero-touch provisioning, `Device.spec.provisioning` can specify a boot image URL with checksum and a `bootScript` template (inline, from a `Secret`, or from a `ConfigMap`). Provisioning history is tracked in `Device.status.provisioning`. + +## Status Conditions and Finalizers + +### Status Conditions + +Every CRD in the operator exposes a `status.conditions` field — a list of `metav1.Condition` objects. Conditions provide structured, machine-readable state that controllers and external tooling can watch. Standard condition types used across resources include: + +- **Available** — the resource is fully functional and the configuration has been successfully applied to the device. +- **Progressing** — the controller is currently creating or updating the resource. +- **Degraded** — the resource failed to reach or maintain its desired state. + +Some resources expose additional computed status fields beyond conditions. For example: +- `BGPPeer.status.sessionState` reports the operational BGP session state (e.g., Established). +- `BGPPeer.status.addressFamilies` contains per-AFI/SAFI prefix counts. +- `OSPF.status.neighbors` lists OSPF neighbor adjacency states. +- `VPCDomain.status` reports `role`, `keepaliveStatus`, `peerStatus`, and `peerLinkIfOperStatus`. +- `VLAN.status.routedBy` and `VLAN.status.bridgedBy` reflect cross-resource ownership once an `Interface` or `EVPNInstance` references that VLAN. + +### Finalizers + +Finalizers ensure that when you delete a CRD resource, the corresponding configuration is removed from the device before Kubernetes removes the object. The controller adds a finalizer to the resource when it first reconciles it. On deletion: + +1. 
Kubernetes marks the object for deletion but does not remove it (the finalizer blocks removal). +2. The controller detects the deletion timestamp, pushes the removal configuration to the device, then removes the finalizer. +3. Kubernetes garbage-collects the object. + +This prevents configuration drift where a Kubernetes object is deleted but the device retains stale configuration. + +## Multi-Device and Multi-Vendor Support + +### Multiple Devices + +Each configuration CRD is explicitly bound to one device via `deviceRef`. To configure the same feature across multiple devices, you create one resource per device: + +```yaml +# leaf-01 VLAN +apiVersion: core/v1alpha1 +kind: VLAN +metadata: + name: vlan100-leaf01 +spec: + deviceRef: + name: leaf-01 + id: 100 + +# leaf-02 VLAN +apiVersion: core/v1alpha1 +kind: VLAN +metadata: + name: vlan100-leaf02 +spec: + deviceRef: + name: leaf-02 + id: 100 +``` + +The `deviceRef` is immutable after creation. To move a configuration to a different device, you must delete the resource and create a new one targeting the new device. + +Child resources are logically owned by their parent `Device`. When a `Device` is deleted, its finalizer ensures cleanup of all device configuration before the object is removed. + +### Multiple Vendors + +Vendor differences are isolated to the provider layer and the platform-specific CRDs. The core CRD schema remains the same regardless of which vendor's device you are targeting. The controller for each core CRD implements the translation to the appropriate vendor API (NX-API for NX-OS, gNMI planned for other platforms). When you need vendor-specific settings, you attach a platform config via `providerConfigRef`. If no `providerConfigRef` is set, the controller applies the target platform's defaults. + +## EVPN/VXLAN Fabric Provisioning: A Concrete Example + +EVPN/VXLAN fabric provisioning shows how multiple CRDs compose into a coherent feature. 
Consider bringing up a new leaf switch as a VXLAN VTEP in a BGP EVPN fabric. The following resources are involved, in dependency order. + +### 1. Register the Device + +```yaml +apiVersion: core/v1alpha1 +kind: Device +metadata: + name: leaf-01 +spec: + endpoint: + address: "10.0.0.1:443" + secretRef: + name: leaf-01-creds +``` + +### 2. Configure Underlay Interfaces and Loopbacks + +Create `Interface` resources for the physical uplinks (routed, with IPv4 addresses) and the loopback used as the NVE source. The loopback will carry the VTEP IP. + +```yaml +apiVersion: core/v1alpha1 +kind: Interface +metadata: + name: lo0-leaf01 +spec: + deviceRef: + name: leaf-01 + name: loopback0 + type: Loopback + adminState: Up + ipv4: + addresses: + - 10.0.255.1/32 +``` + +### 3. Configure the VRF (L3VNI) + +```yaml +apiVersion: core/v1alpha1 +kind: VRF +metadata: + name: tenant-a-leaf01 +spec: + deviceRef: + name: leaf-01 + name: TenantA + vni: 50000 + routeDistinguisher: "65000:50000" + routeTargets: + - value: "65000:50000" + action: Both + addressFamilies: [L2vpnEvpn] +``` + +### 4. Configure the VLAN (L2VNI) + +```yaml +apiVersion: core/v1alpha1 +kind: VLAN +metadata: + name: vlan100-leaf01 +spec: + deviceRef: + name: leaf-01 + id: 100 + name: TenantA-Web + adminState: Active +``` + +The controller will set `vlan100-leaf01.status.bridgedBy` once an `EVPNInstance` references this VLAN. + +### 5. Configure the NVE (VTEP) + +`NetworkVirtualizationEdge` is the VTEP endpoint. 
It references the loopback for the source IP, sets EVPN-based host reachability, and optionally configures an anycast gateway MAC for distributed routing: + +```yaml +apiVersion: core/v1alpha1 +kind: NetworkVirtualizationEdge +metadata: + name: nve1-leaf01 +spec: + deviceRef: + name: leaf-01 + adminState: Up + sourceInterfaceRef: + name: lo0-leaf01 + hostReachability: EVPN + suppressARP: true + anycastGateway: + virtualMAC: "00:00:5E:00:01:01" +``` + +For NX-OS-specific options like `advertiseVirtualMAC` and `infraVLANs`, attach a `NetworkVirtualizationEdgeConfig` via `providerConfigRef`. + +### 6. Create the EVPN Instance (L2VNI) + +```yaml +apiVersion: core/v1alpha1 +kind: EVPNInstance +metadata: + name: evi-100-leaf01 +spec: + deviceRef: + name: leaf-01 + vni: 10100 + type: Bridged + vlanRef: + name: vlan100-leaf01 + routeDistinguisher: "65000:10100" + routeTargets: + - value: "65000:10100" + action: Both +``` + +When this resource is reconciled, the controller sets `vlan100-leaf01.status.bridgedBy` to reference `evi-100-leaf01`, establishing the cross-resource link visible in status. + +### 7. Configure BGP with EVPN Address Family + +```yaml +apiVersion: core/v1alpha1 +kind: BGP +metadata: + name: bgp-leaf01 +spec: + deviceRef: + name: leaf-01 + asNumber: 65000 + routerId: "10.0.255.1" + addressFamilies: + l2vpnEvpn: + enabled: true + routeTargetPolicy: + retainAll: true +``` + +Then create `BGPPeer` resources for each spine, with the `l2vpnEvpn` address family enabled and `routeReflectorClient: false` on leaf peers. + +### 8. Configure the SVI for Routing (RoutedVLAN Interface) + +For symmetric IRB, create a `RoutedVLAN` `Interface` that references VLAN 100 and lives in `TenantA` VRF. 
Enabling `ipv4.anycastGateway: true` on this interface causes the controller to use the virtual MAC defined in the NVE resource: + +```yaml +apiVersion: core/v1alpha1 +kind: Interface +metadata: + name: svi100-leaf01 +spec: + deviceRef: + name: leaf-01 + name: Vlan100 + type: RoutedVLAN + adminState: Up + vlanRef: + name: vlan100-leaf01 + vrfRef: + name: tenant-a-leaf01 + ipv4: + addresses: + - 10.100.0.1/24 + anycastGateway: true +``` + +The controller sets `vlan100-leaf01.status.routedBy` to reference `svi100-leaf01`. + +### What Happens End-to-End + +After all of these resources are applied, each controller independently reconciles its piece: + +- The `Interface` controller pushes loopback and physical interface configs via NX-API. +- The `VRF` controller creates the VRF with its L3VNI and route targets. +- The `VLAN` controller creates VLAN 100. +- The `NetworkVirtualizationEdge` controller creates the NVE interface. +- The `EVPNInstance` controller creates the MAC-VRF (L2VNI) under the NVE and links it to VLAN 100. +- The `BGP` and `BGPPeer` controllers configure BGP with the EVPN address family and peer sessions. +- The `Interface` controller for the SVI creates the routed VLAN interface with the anycast gateway MAC diff --git a/docs/generated/guides/getting-started.md b/docs/generated/guides/getting-started.md new file mode 100644 index 000000000..19f3c06f5 --- /dev/null +++ b/docs/generated/guides/getting-started.md @@ -0,0 +1,276 @@ +--- +title: Getting Started +description: Deploy Network Operator and configure your first network device +gnosis_hash: 4357f246 +body_hash: a7ac0394 +--- + +# Getting Started + +This guide walks a data center operator through installing network-operator, registering a Cisco NX-OS switch, pushing an initial interface configuration, and verifying the result. Subsequent sections point toward more advanced topics. 
+ +--- + +## Prerequisites + +Before you begin, ensure the following are available: + +- **Kubernetes cluster** (v1.26 or later recommended) with sufficient RBAC permissions to install CRDs and create namespaces. +- **kubectl** configured to talk to that cluster (`kubectl version` should succeed against the target API server). +- **Helm 3** installed locally (`helm version`). +- **Network reachability** from the cluster nodes (or from a dedicated egress point) to the management address of each NX-OS switch. network-operator connects to devices over gRPC/gNMI; ensure TCP port 50051 (or your device's configured gRPC port) is open between the cluster and the management network. +- **Device credentials** — a username and password that have the `network-admin` role on NX-OS, and optionally a CA certificate if the device uses TLS on its gRPC server. + +--- + +## Installing network-operator via Helm + +The Helm chart lives under `deploy/helm/network-operator` in the repository. + +### 1. Add or clone the chart + +```bash +# Clone the repository and reference the chart locally +git clone https://github.com/your-org/network-operator.git +cd network-operator +``` + +### 2. Create a dedicated namespace + +```bash +kubectl create namespace network-operator +``` + +### 3. Install the chart + +```bash +helm install network-operator deploy/helm/network-operator \ + --namespace network-operator \ + --set controller.replicaCount=2 +``` + +Verify the controller pod reaches `Running`: + +```bash +kubectl get pods -n network-operator +``` + +The controller manages a reconciliation loop for every CRD type. Once running, it watches for `Device`, `Interface`, `BGP`, `VLAN`, and all other resources in any namespace and pushes the desired state to the corresponding device. + +--- + +## Registering a Network Device + +Every managed resource references a `Device` object. The `Device` CRD (API group `core/v1alpha1`) holds the management address and authentication credentials. + +### 1. 
Store device credentials in a Secret + +The `DeviceSpec.endpoint.secretRef` field requires a secret of type `kubernetes.io/basic-auth` containing `username` and `password` keys. + +```bash +kubectl create secret generic leaf01-credentials \ + --namespace network-operator \ + --type kubernetes.io/basic-auth \ + --from-literal=username=admin \ + --from-literal=password= +``` + +### 2. Create the Device resource + +```yaml +# leaf01-device.yaml +apiVersion: core.network-operator.io/v1alpha1 +kind: Device +metadata: + name: leaf01 + namespace: network-operator +spec: + endpoint: + address: "192.0.2.10:50051" # NX-OS management IP and gRPC port + secretRef: + name: leaf01-credentials + namespace: network-operator +``` + +Apply it: + +```bash +kubectl apply -f leaf01-device.yaml +``` + +After a few seconds the controller discovers the device and populates `status`: + +```bash +kubectl get device leaf01 -n network-operator -o yaml +``` + +Look for `status.phase` moving to `Ready`, and check `status.manufacturer`, `status.model`, and `status.firmwareVersion` to confirm the operator has successfully connected. + +> **TLS:** If your NX-OS switch is configured with a self-signed or enterprise CA certificate on the gRPC server, add a `spec.endpoint.tls` block pointing to a Secret that contains the CA certificate under `spec.endpoint.tls.ca`. + +--- + +## Applying a Basic Interface Configuration + +The `Interface` CRD models ethernet, loopback, port-channel, routed-VLAN, and subinterfaces. Every `Interface` resource must reference the owning `Device` via `spec.deviceRef`. + +### Example: configure a routed Layer 3 interface + +The following manifest configures `Ethernet1/1` on `leaf01` as a routed interface with a /30 address, an MTU of 9216 bytes, and a description. 
+ +```yaml +# leaf01-eth1-1.yaml +apiVersion: core.network-operator.io/v1alpha1 +kind: Interface +metadata: + name: leaf01-eth1-1 + namespace: network-operator +spec: + deviceRef: + name: leaf01 + name: Ethernet1/1 + type: Ethernet + adminState: Up + description: "Uplink to spine01" + mtu: 9216 + ipv4: + addresses: + - 10.0.0.1/30 +``` + +Apply it: + +```bash +kubectl apply -f leaf01-eth1-1.yaml +``` + +### Example: configure a loopback interface + +Loopback interfaces are commonly used as BGP router-IDs and NVE source interfaces. + +```yaml +# leaf01-lo0.yaml +apiVersion: core.network-operator.io/v1alpha1 +kind: Interface +metadata: + name: leaf01-lo0 + namespace: network-operator +spec: + deviceRef: + name: leaf01 + name: loopback0 + type: Loopback + adminState: Up + description: "Router-ID loopback" + ipv4: + addresses: + - 10.255.0.1/32 +``` + +```bash +kubectl apply -f leaf01-lo0.yaml +``` + +### Key `InterfaceSpec` fields reference + +| Field | Purpose | +|---|---| +| `deviceRef.name` | Name of the `Device` object in the same namespace | +| `name` | Interface name exactly as it appears on the device (e.g., `Ethernet1/1`, `loopback0`) | +| `type` | `Ethernet`, `Loopback`, `Aggregate`, `RoutedVLAN`, `Subinterface` | +| `adminState` | `Up` or `Down` | +| `mtu` | Packet MTU in bytes | +| `ipv4.addresses` | List of IPv4 CIDR prefixes; first entry is the primary address | +| `switchport` | Layer 2 switchport configuration (access or trunk mode) | +| `vrfRef` | Assigns the interface to a non-default VRF | + +--- + +## Verifying the Configuration Was Pushed to the Device + +The operator follows a reconcile-then-report model: after applying a manifest it attempts to push the desired state to the device and then reflects the outcome in the resource's `status.conditions`. 
+ +### Check conditions on the Interface resource + +```bash +kubectl get interface leaf01-eth1-1 -n network-operator -o yaml +``` + +A successful push produces a condition similar to: + +```yaml +status: + conditions: + - type: Available + status: "True" + reason: ConfigurationApplied + message: "Interface configuration successfully applied to device" + lastTransitionTime: "2024-11-01T10:15:00Z" +``` + +If the push fails the `status` field is `"False"` and `message` contains the error returned by the device. + +### Useful one-liners + +```bash +# Watch all Interface resources in the namespace +kubectl get interfaces -n network-operator -w + +# Check conditions across all managed resources for leaf01 +kubectl get interfaces,bgp,vlan,vrf -n network-operator \ + -l core.network-operator.io/device=leaf01 + +# Describe a specific resource for full event and condition history +kubectl describe interface leaf01-eth1-1 -n network-operator +``` + +### Confirm on the device directly + +SSH to `leaf01` and verify the configuration was applied: + +``` +leaf01# show running-config interface Ethernet1/1 +interface Ethernet1/1 + description Uplink to spine01 + mtu 9216 + ip address 10.0.0.1/30 + no shutdown +``` + +--- + +## Next Steps + +With a device registered and a first interface configured, you are ready to build out the rest of the fabric. The sections below highlight the most commonly used CRDs for a NX-OS data center deployment. + +### BGP and BGP Peers + +Use the `BGP` CRD to configure an eBGP or iBGP instance on `leaf01`, setting `spec.asNumber` and `spec.routerId`. Add neighbors with individual `BGPPeer` resources, each referencing the parent BGP instance via `spec.bgpRef`. For address-family configuration specific to NX-OS (such as EVPN PIP advertisement), attach a `BGPConfig` resource using `spec.providerConfigRef`. + +### VLANs + +Create `VLAN` resources with `spec.id` (1–4094), `spec.name`, and `spec.deviceRef`. 
For layer 3 routing over a VLAN, add a `RoutedVLAN` type `Interface` that references the VLAN via `spec.vlanRef`. + +### VRFs + +Use the `VRF` CRD to define tenant VRFs (`spec.name`, `spec.routeDistinguisher`, `spec.routeTargets`). Assign interfaces to VRFs via `spec.vrfRef` on the `Interface` resource. + +### Routing Policies and Prefix Sets + +Define match criteria with `PrefixSet` resources, then reference them from `RoutingPolicy` statements (`spec.statements[].conditions.matchPrefixSet`). Attach policies to BGP peers via `spec.addressFamilies.ipv4Unicast.inboundRoutingPolicyRef` or `outboundRoutingPolicyRef` on `BGPPeer`. + +### VXLAN / EVPN Overlay + +Create a `NetworkVirtualizationEdge` (NVE) resource with `spec.sourceInterfaceRef` pointing to a loopback, configure `EVPNInstance` resources for each L2VNI or L3VNI, and enable the L2VPN EVPN address family in BGP. For NX-OS-specific NVE settings (hold-down time, infra VLANs), attach a `NetworkVirtualizationEdgeConfig` via `spec.providerConfigRef`. + +### Device Services + +| CRD | Purpose | +|---|---| +| `NTP` | NTP server configuration and source interface | +| `DNS` | DNS servers, default domain, and source interface | +| `Syslog` | Remote syslog servers and facility configuration | +| `SNMP` | SNMP communities, trap destinations, and notification types | +| `LLDP` | System-wide and per-interface LLDP control | + +Each service CRD follows the same pattern: set `spec.deviceRef.name` to the target device, fill in the relevant fields, and the controller reconciles the change onto the device. 
diff --git a/docs/generated/guides/index.md b/docs/generated/guides/index.md new file mode 100644 index 000000000..8cb90ce72 --- /dev/null +++ b/docs/generated/guides/index.md @@ -0,0 +1,11 @@ +--- +title: Guides +description: Architecture and usage guides +--- + +# Guides + +Architecture and usage guides + +- [Getting Started](getting-started.md) — Deploy Network Operator and configure your first network device +- [Architecture](architecture.md) — How Network Operator reconciles declarative CRDs into device configurations diff --git a/docs/generated/index.md b/docs/generated/index.md new file mode 100644 index 000000000..db8af4328 --- /dev/null +++ b/docs/generated/index.md @@ -0,0 +1,15 @@ +--- +title: Network Operator +description: Cloud Native Network Device Provisioning +--- + +# Network Operator + +Cloud Native Network Device Provisioning + +## Guides + +Architecture and usage guides + +- [Getting Started](guides/getting-started.md) — Deploy Network Operator and configure your first network device +- [Architecture](guides/architecture.md) — How Network Operator reconciles declarative CRDs into device configurations diff --git a/gnosis.yaml b/gnosis.yaml new file mode 100644 index 000000000..0c2c05a2e --- /dev/null +++ b/gnosis.yaml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +# SPDX-License-Identifier: Apache-2.0 + +# Gnosis documentation generation config for network-operator + +guides: + - docs/content/getting-started.doc.yaml + - docs/content/architecture.doc.yaml + +output: + dir: docs/generated + site: + title: "Network Operator" + description: "Cloud Native Network Device Provisioning" + categories: + guide: + label: "Guides" + description: "Architecture and usage guides" + +provider: + default: anthropic + model: claude-sonnet-4-6 From ed29b4c87d0b7b609ba500d94514025cdcdf9a3d Mon Sep 17 00:00:00 2001 From: d062260 Date: Tue, 5 May 2026 09:48:47 +0200 Subject: [PATCH 2/3] refine gnosis 
prompts --- docs/content/architecture.doc.yaml | 7 +- docs/content/getting-started.doc.yaml | 4 +- docs/generated/guides/architecture.md | 402 ++++++++--------------- docs/generated/guides/getting-started.md | 261 ++++++++------- 4 files changed, 285 insertions(+), 389 deletions(-) diff --git a/docs/content/architecture.doc.yaml b/docs/content/architecture.doc.yaml index 82b8aba16..89c62ca9e 100644 --- a/docs/content/architecture.doc.yaml +++ b/docs/content/architecture.doc.yaml @@ -11,7 +11,6 @@ prompt: | 3. How Device registration and credentials work 4. The role of status conditions and finalizers 5. How multi-device and multi-vendor support is structured - 6. EVPN/VXLAN fabric provisioning as a concrete example Keep the tone practical. Operators should understand how their YAML manifests translate into network device configuration. context: | @@ -20,7 +19,11 @@ context: | patterns: watch CRDs, compare desired vs actual, push diffs to devices. Architecture layers: - - API layer (api/): CRD type definitions (core + platform-specific) + - API layer (api/): CRD type definitions (core + platform-specific) the api itself is split into several layers: + - Physical Layer ( devices, interfaces, links) + - Bricks Layer (vendor abstract config 1 brick -> 1 device + status) + - Transit Layer ( translates network demands into brick configs) + - Intent Layer ( network, external connection, routing domain) - Controller layer (internal/controller/): reconciliation logic per CRD - Provider layer: device communication (NX-API for NX-OS, gNMI planned) diff --git a/docs/content/getting-started.doc.yaml b/docs/content/getting-started.doc.yaml index 24e0e543b..09009518d 100644 --- a/docs/content/getting-started.doc.yaml +++ b/docs/content/getting-started.doc.yaml @@ -40,9 +40,9 @@ context: | - NVEConfig / LLDPConfig / System: NX-OS platform details - BorderGateway / VPCDomain: NX-OS multisite and vPC - Helm chart: deploy/helm/network-operator + Helm chart: charts/network-operator 
Controller: manages reconciliation loops for each CRD type sources: - api/core/v1alpha1 - api/cisco/nx/v1alpha1 - - deploy/helm/network-operator + - charts/network-operator diff --git a/docs/generated/guides/architecture.md b/docs/generated/guides/architecture.md index 68720b1c3..f8fdfd3e9 100644 --- a/docs/generated/guides/architecture.md +++ b/docs/generated/guides/architecture.md @@ -1,357 +1,237 @@ --- title: Architecture description: How Network Operator reconciles declarative CRDs into device configurations -gnosis_hash: 76f0627f -body_hash: a46e9e82 +gnosis_hash: 0f39c23c +body_hash: e3d86a7b --- # Architecture -Network Operator is a set of Kubernetes controllers that translate CRD specifications into live network device configuration. This guide explains how the system is structured, how your YAML manifests become device commands, and how the operator handles multi-device, multi-vendor environments. +## Overview -## The Reconciliation Model - -The core interaction pattern is straightforward: you describe the desired state of a network resource in a CRD manifest, apply it to Kubernetes, and the operator takes responsibility for making the device match that description. - -The reconciliation loop works as follows: - -1. **You apply a manifest.** For example, you create a `VLAN` resource describing VLAN 100 on a specific device. -2. **The controller detects the change.** Each CRD type has a dedicated controller built on controller-runtime. The controller watches for create, update, and delete events on its resource kind. -3. **The controller resolves the target device.** Every configuration CRD carries a `deviceRef` field (a `LocalObjectReference`) that names the `Device` resource in the same namespace. The controller reads the `Device` to retrieve the management endpoint and credentials. -4. **The controller builds the platform-native payload.** For NX-OS targets this is an NX-API JSON body. 
The controller translates the abstract CRD fields into the exact structures the device API expects. -5. **The controller pushes the configuration.** The payload is sent to the device's management address defined in `Device.spec.endpoint.address`. -6. **The controller updates status conditions.** After the push, the controller writes the outcome back to `status.conditions` on the resource. If something goes wrong, the condition reflects that; if the device accepted the configuration, the resource transitions to a ready state. - -This loop is level-triggered, not event-driven: the controller will re-reconcile on any relevant change and will retry on failure, converging toward the desired state over time. - -### Pausing Reconciliation - -The `Device` resource exposes a `spec.paused` boolean. When set to `true`, controllers stop processing the device and all resources that reference it. This is useful during maintenance windows or when you need to manually intervene on a device without the operator fighting your changes. - -## Core CRDs and Platform-Specific CRDs - -The API is split into two layers that work together. - -### Core CRDs (Platform-Agnostic Intent) - -Core CRDs live under `api/core/v1alpha1` and describe *what* you want without prescribing vendor-specific behavior. Examples include: - -- `Interface` — defines interface type, admin state, IP addressing, switchport mode, VRF membership, BFD, and MTU. -- `VLAN` — defines a VLAN ID, name, and admin state. -- `BGP` / `BGPPeer` — defines a BGP instance and its peers, address families, and route policies. -- `VRF` — defines a VRF name, VNI, route distinguisher, and route targets. -- `EVPNInstance` — defines an EVPN instance with VNI, type (bridged or routed), route targets, and VLAN reference. -- `NetworkVirtualizationEdge` — defines the NVE (VTEP) endpoint including source interface, anycast gateway, and host reachability method. - -These types are sufficient to express intent for most network configurations. 
The fields map to concepts that are consistent across vendors. +Network Operator is a set of Kubernetes controllers that reconcile CRD specifications into live network device configurations. The core idea is simple: you describe the desired state of a network device in a YAML manifest, apply it to Kubernetes, and the operator pushes the corresponding configuration to the device. No scripting, no manual CLI sessions — the operator handles translation and delivery. -### Platform-Specific CRDs (Vendor Knobs) +The system follows standard controller-runtime patterns: watch CRDs for changes, compare desired state against actual device state, compute a diff, and push updates. This makes it composable with standard Kubernetes tooling — GitOps pipelines, admission webhooks, RBAC, and status monitoring all work as expected. -When a vendor exposes controls that have no meaningful cross-platform equivalent, a separate platform config CRD carries those fields. Examples for NX-OS include: - -- `BGPConfig` — adds NX-OS-specific BGP options such as `advertisePIP` for EVPN and `exportGatewayIP` for symmetric IRB. -- `LLDPConfig` — adds NX-OS `initDelay` and `holdTime` timers. -- `ManagementAccessConfig` — adds NX-OS VTY console timeout and SSH ACL name. -- `NetworkVirtualizationEdgeConfig` — adds NX-OS NVE options such as `advertiseVirtualMAC`, `holdDownTime`, and `infraVLANs`. -- `InterfaceConfig` — adds NX-OS spanning-tree port type, BPDU guard/filter, buffer boost, and LACP vPC convergence settings. -- `VPCDomain` — configures the Cisco vPC domain including peer-link, keepalive, role priority, and auto-recovery. - -### Linking Core and Platform CRDs - -The link between a core CRD and its platform-specific extension is the `providerConfigRef` field present on every core resource spec. 
This is a `TypedLocalObjectReference` that carries the `apiVersion`, `kind`, and `name` of the platform config object: - -```yaml -providerConfigRef: - apiVersion: cisco.nx/v1alpha1 - kind: InterfaceConfig - name: eth1-0-config -``` - -When the controller reconciles a core resource, it checks for a `providerConfigRef`. If present, it reads the referenced platform config and merges its vendor-specific fields into the configuration payload before pushing to the device. +--- -## Device Registration and Credentials +## The Reconciliation Model -Every configuration resource in the operator is scoped to a `Device`. The `Device` CRD is the anchor for all device-level state and connectivity information. +When you apply a manifest to Kubernetes, the following sequence takes place: -### Defining a Device +1. **You apply a CRD manifest.** For example, an `Interface` spec describing a routed interface with an IPv4 address, or a `BGP` spec describing a BGP router instance. -A minimal `Device` looks like: +2. **The controller detects the change.** controller-runtime watches the relevant CRD type and enqueues a reconciliation request whenever the object is created, updated, or deleted. -```yaml -apiVersion: core/v1alpha1 -kind: Device -metadata: - name: leaf-01 - namespace: network -spec: - endpoint: - address: "192.0.2.10:443" - secretRef: - name: leaf-01-credentials -``` +3. **The controller resolves the target Device.** Every configuration CRD carries a `deviceRef` field (of type `LocalObjectReference`) that names the `Device` object in the same namespace. The controller fetches that `Device` to obtain connection details. -The `endpoint.address` is the management IP and port. The `endpoint.secretRef` points to a Kubernetes `Secret` of type `kubernetes.io/basic-auth` containing `username` and `password` keys. The secret is read by the controller at reconciliation time; credentials are never stored in the CRD itself. +4. 
**The controller builds the platform-native payload.** Using the spec fields, the controller constructs the vendor-specific API call — for example, an NX-API JSON payload for Cisco NX-OS. -### TLS +5. **The controller pushes the config to the device and updates status.** After a successful push, the controller writes status conditions back to the object. On failure, it sets a `Degraded` condition and requeues for retry. -For gRPC-based transports, `endpoint.tls` carries the CA certificate reference and an optional client certificate for mutual TLS: +6. **Finalizers ensure cleanup on deletion.** When you delete a CRD object, the finalizer prevents immediate removal until the controller has removed the corresponding configuration from the device. -```yaml -endpoint: - tls: - ca: - key: ca.crt - certificate: - secretRef: - name: leaf-01-mtls -``` +A concrete example: applying a `VRF` manifest with `deviceRef.name: leaf-01` causes the VRF controller to look up the `Device` named `leaf-01`, connect to it, and configure the VRF with the specified name, VNI, route distinguisher, and route targets. -### Device Status +--- -After the operator connects to a device, it populates `Device.status` with discovered information: `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, `lastRebootTime`, and a `ports` list. The `portSummary` field provides a quick human-readable count grouped by speed (e.g., `"2/4 (10g), 4/64 (100g)"`). This information is read-only and reflects what the operator observed from the device. +## API Layers -### Device Provisioning +The API is structured in four conceptual layers, from physical to intent: -For zero-touch provisioning, `Device.spec.provisioning` can specify a boot image URL with checksum and a `bootScript` template (inline, from a `Secret`, or from a `ConfigMap`). Provisioning history is tracked in `Device.status.provisioning`. 
+| Layer | Description | +|---|---| +| **Physical** | Devices, interfaces, links — the raw hardware representation | +| **Bricks** | Vendor-abstract configuration; one brick maps to one device with status | +| **Transit** | Translates network demands into brick configurations | +| **Intent** | High-level constructs: networks, external connections, routing domains | -## Status Conditions and Finalizers +Most operators interact with the Physical and Bricks layers directly. The higher layers compose those primitives into fabric-wide constructs. -### Status Conditions +--- -Every CRD in the operator exposes a `status.conditions` field — a list of `metav1.Condition` objects. Conditions provide structured, machine-readable state that controllers and external tooling can watch. Standard condition types used across resources include: +## Core CRDs and Platform-Specific CRDs -- **Available** — the resource is fully functional and the configuration has been successfully applied to the device. -- **Progressing** — the controller is currently creating or updating the resource. -- **Degraded** — the resource failed to reach or maintain its desired state. +### Core CRDs -Some resources expose additional computed status fields beyond conditions. For example: -- `BGPPeer.status.sessionState` reports the operational BGP session state (e.g., Established). -- `BGPPeer.status.addressFamilies` contains per-AFI/SAFI prefix counts. -- `OSPF.status.neighbors` lists OSPF neighbor adjacency states. -- `VPCDomain.status` reports `role`, `keepaliveStatus`, `peerStatus`, and `peerLinkIfOperStatus`. -- `VLAN.status.routedBy` and `VLAN.status.bridgedBy` reflect cross-resource ownership once an `Interface` or `EVPNInstance` references that VLAN. +Core CRDs, defined under `api/core/v1alpha1`, express platform-agnostic intent. 
They cover a broad range of network constructs: -### Finalizers +- **Physical layer:** `Device`, `Interface`, `VLAN` +- **Routing:** `BGP`, `BGPPeer`, `OSPF`, `ISIS`, `PIM`, `VRF`, `RoutingPolicy`, `PrefixSet` +- **Overlay:** `EVPNInstance`, `NetworkVirtualizationEdge`, `DHCPRelay` +- **Management & security:** `NTP`, `DNS`, `Syslog`, `SNMP`, `Banner`, `User`, `Certificate`, `AccessControlList`, `ManagementAccess` +- **Platform features:** `LLDP`, `VPCDomain`, `BorderGateway`, `System` -Finalizers ensure that when you delete a CRD resource, the corresponding configuration is removed from the device before Kubernetes removes the object. The controller adds a finalizer to the resource when it first reconciles it. On deletion: +Each of these types exposes fields that are meaningful across vendors. For example, `BGPSpec` defines `asNumber`, `routerId`, and `addressFamilies` — concepts that exist on every BGP implementation. -1. Kubernetes marks the object for deletion but does not remove it (the finalizer blocks removal). -2. The controller detects the deletion timestamp, pushes the removal configuration to the device, then removes the finalizer. -3. Kubernetes garbage-collects the object. +Every core config CRD has a `providerConfigRef` field (of type `*TypedLocalObjectReference`) that optionally links to a platform-specific configuration object. If omitted, the provider applies the platform's default settings. -This prevents configuration drift where a Kubernetes object is deleted but the device retains stale configuration. +### Platform-Specific CRDs -## Multi-Device and Multi-Vendor Support +Platform CRDs, defined under `api/cisco/nx/v1alpha1` (and similar paths for other vendors), carry vendor-specific knobs that have no generic equivalent. Examples include: -### Multiple Devices +- **`InterfaceConfig`** — NX-OS-specific interface settings such as spanning-tree port type, BPDU guard, buffer boost, and LACP vPC convergence options. 
+- **`LLDPConfig`** — NX-OS LLDP `initDelay` and `holdTime` values. +- **`BGPConfig`** — NX-OS-specific BGP settings such as PIP advertisement for EVPN and gateway IP export for symmetric IRB. +- **`NetworkVirtualizationEdgeConfig`** — NX-OS NVE settings including virtual MAC advertisement and infra-VLAN list. +- **`ManagementAccessConfig`** — NX-OS console timeout and SSH VTY ACL settings. +- **`VPCDomain`** — Cisco vPC domain configuration including peer-link, keepalive, auto-recovery, and role priority. -Each configuration CRD is explicitly bound to one device via `deviceRef`. To configure the same feature across multiple devices, you create one resource per device: +The relationship is: the core CRD references the platform CRD via `providerConfigRef`. This keeps the core manifest portable while allowing per-platform customisation where needed. ```yaml -# leaf-01 VLAN -apiVersion: core/v1alpha1 -kind: VLAN -metadata: - name: vlan100-leaf01 +# Core CRD — platform-agnostic +apiVersion: network.example.io/v1alpha1 +kind: LLDP spec: deviceRef: name: leaf-01 - id: 100 + adminState: Up + providerConfigRef: + apiVersion: network.example.io/v1alpha1 + kind: LLDPConfig + name: leaf-01-lldp-config -# leaf-02 VLAN -apiVersion: core/v1alpha1 -kind: VLAN -metadata: - name: vlan100-leaf02 +--- +# Platform CRD — NX-OS specific knobs +apiVersion: network.cisco.nx/v1alpha1 +kind: LLDPConfig spec: - deviceRef: - name: leaf-02 - id: 100 + initDelay: 5 + holdTime: 120 ``` -The `deviceRef` is immutable after creation. To move a configuration to a different device, you must delete the resource and create a new one targeting the new device. +--- -Child resources are logically owned by their parent `Device`. When a `Device` is deleted, its finalizer ensures cleanup of all device configuration before the object is removed. +## Device Registration and Credentials -### Multiple Vendors +Before any configuration CRD can be reconciled, a `Device` object must exist in the same namespace. 
-Vendor differences are isolated to the provider layer and the platform-specific CRDs. The core CRD schema remains the same regardless of which vendor's device you are targeting. The controller for each core CRD implements the translation to the appropriate vendor API (NX-API for NX-OS, gNMI planned for other platforms). When you need vendor-specific settings, you attach a platform config via `providerConfigRef`. If no `providerConfigRef` is set, the controller applies the target platform's defaults. +### Device Spec -## EVPN/VXLAN Fabric Provisioning: A Concrete Example +`DeviceSpec` contains two key sections: -EVPN/VXLAN fabric provisioning shows how multiple CRDs compose into a coherent feature. Consider bringing up a new leaf switch as a VXLAN VTEP in a BGP EVPN fabric. The following resources are involved, in dependency order. +**`endpoint`** (required) — specifies how to reach the device: +- `address`: management address in `IP:Port` format. +- `secretRef`: references a Kubernetes `Secret` of type `kubernetes.io/basic-auth`. The secret must contain `username` and `password` keys. +- `tls`: optional TLS configuration. The `ca` field selects a secret key for the CA certificate. The `certificate` field enables mTLS by referencing a `kubernetes.io/tls` secret containing `tls.crt` and `tls.key`. -### 1. Register the Device +**`provisioning`** (optional) — used for zero-touch provisioning. It carries an `image` reference (URL, checksum, checksum type) and a `bootScript` that can be sourced inline, from a `Secret`, or from a `ConfigMap`. ```yaml -apiVersion: core/v1alpha1 +apiVersion: network.example.io/v1alpha1 kind: Device metadata: name: leaf-01 spec: endpoint: - address: "10.0.0.1:443" + address: "192.0.2.10:443" secretRef: - name: leaf-01-creds + name: leaf-01-credentials + paused: false ``` -### 2. 
Configure Underlay Interfaces and Loopbacks +The `Device` resource is also where the operator writes back hardware inventory: `DeviceStatus` exposes `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, `lastRebootTime`, and a `ports` list detailing each physical port and any associated `Interface` resource. -Create `Interface` resources for the physical uplinks (routed, with IPv4 addresses) and the loopback used as the NVE source. The loopback will carry the VTEP IP. +All configuration CRDs reference their device by name: ```yaml -apiVersion: core/v1alpha1 -kind: Interface -metadata: - name: lo0-leaf01 spec: deviceRef: name: leaf-01 - name: loopback0 - type: Loopback - adminState: Up - ipv4: - addresses: - - 10.0.255.1/32 ``` -### 3. Configure the VRF (L3VNI) +The `deviceRef` field is immutable — moving a configuration object to a different device requires deleting and recreating it. -```yaml -apiVersion: core/v1alpha1 -kind: VRF -metadata: - name: tenant-a-leaf01 -spec: - deviceRef: - name: leaf-01 - name: TenantA - vni: 50000 - routeDistinguisher: "65000:50000" - routeTargets: - - value: "65000:50000" - action: Both - addressFamilies: [L2vpnEvpn] -``` +### Pausing -### 4. Configure the VLAN (L2VNI) +`DeviceSpec` includes a `paused` boolean. When set to `true`, the device controller and all controllers managing objects that reference that device halt reconciliation. This is useful during maintenance windows or when you need to apply configuration changes manually without interference. -```yaml -apiVersion: core/v1alpha1 -kind: VLAN -metadata: - name: vlan100-leaf01 -spec: - deviceRef: - name: leaf-01 - id: 100 - name: TenantA-Web - adminState: Active -``` +--- -The controller will set `vlan100-leaf01.status.bridgedBy` once an `EVPNInstance` references this VLAN. +## Status Conditions and Finalizers -### 5. Configure the NVE (VTEP) +### Status Conditions -`NetworkVirtualizationEdge` is the VTEP endpoint. 
It references the loopback for the source IP, sets EVPN-based host reachability, and optionally configures an anycast gateway MAC for distributed routing: +Every CRD exposes a `status.conditions` field, a list of `metav1.Condition` objects. The operator uses three standard condition types: -```yaml -apiVersion: core/v1alpha1 -kind: NetworkVirtualizationEdge -metadata: - name: nve1-leaf01 -spec: - deviceRef: - name: leaf-01 - adminState: Up - sourceInterfaceRef: - name: lo0-leaf01 - hostReachability: EVPN - suppressARP: true - anycastGateway: - virtualMAC: "00:00:5E:00:01:01" -``` +| Type | Meaning | +|---|---| +| `Available` | The resource is fully functional and the configuration is applied on the device | +| `Progressing` | The resource is being created or updated | +| `Degraded` | The resource failed to reach or maintain its desired state | -For NX-OS-specific options like `advertiseVirtualMAC` and `infraVLANs`, attach a `NetworkVirtualizationEdgeConfig` via `providerConfigRef`. +Each condition has a `status` of `True`, `False`, or `Unknown`, along with a `reason` and `message` that give actionable detail. -### 6. Create the EVPN Instance (L2VNI) +Some resources expose richer status fields beyond conditions. For example: -```yaml -apiVersion: core/v1alpha1 -kind: EVPNInstance -metadata: - name: evi-100-leaf01 -spec: - deviceRef: - name: leaf-01 - vni: 10100 - type: Bridged - vlanRef: - name: vlan100-leaf01 - routeDistinguisher: "65000:10100" - routeTargets: - - value: "65000:10100" - action: Both -``` +- `OSPFStatus` provides `neighbors` (a list of `OSPFNeighbor` with adjacency states) and an `adjacencySummary` string. +- `BGPPeerStatus` provides `sessionState`, `lastEstablishedTime`, and per-address-family `advertisedPrefixes` and `acceptedPrefixes` counts. +- `VPCDomainStatus` reports `role`, `keepaliveStatus`, `peerStatus`, and `peerLinkIfOperStatus`. +- `DeviceStatus` provides a `phase` and full hardware inventory. 
+ +These fields let you build monitoring and alerting on top of standard Kubernetes tooling (e.g., Prometheus with `kube-state-metrics`, or `kubectl get` for quick operational checks). + +### Finalizers + +All configuration CRDs use finalizers to ensure clean removal from the device when you delete the Kubernetes object. The sequence is: + +1. You run `kubectl delete`. +2. Kubernetes sets the `deletionTimestamp` but does not remove the object because a finalizer is present. +3. The controller detects the deletion, removes the corresponding configuration from the device, then removes the finalizer. +4. Kubernetes completes the deletion. + +This prevents orphaned configuration on devices when Kubernetes objects are removed. + +### Ownership + +Child resources are owned by their parent `Device`. This means cascading behaviour works as expected: if a `Device` is removed, owned resources are garbage-collected according to Kubernetes owner reference semantics. + +--- + +## Multi-Device and Multi-Vendor Support + +### Multi-Device -When this resource is reconciled, the controller sets `vlan100-leaf01.status.bridgedBy` to reference `evi-100-leaf01`, establishing the cross-resource link visible in status. +The operator supports arbitrarily many devices in a single namespace. Each `Device` object represents one physical or virtual network device. Configuration CRDs are scoped to individual devices via `deviceRef` — there is no implicit sharing of configuration across devices. -### 7. 
Configure BGP with EVPN Address Family +To apply the same logical configuration to multiple devices (for example, identical BGP settings on a spine tier), you create one CRD instance per device: ```yaml -apiVersion: core/v1alpha1 +# Spine 1 +apiVersion: network.example.io/v1alpha1 kind: BGP metadata: - name: bgp-leaf01 + name: spine-01-bgp spec: deviceRef: - name: leaf-01 + name: spine-01 asNumber: 65000 - routerId: "10.0.255.1" - addressFamilies: - l2vpnEvpn: - enabled: true - routeTargetPolicy: - retainAll: true -``` - -Then create `BGPPeer` resources for each spine, with the `l2vpnEvpn` address family enabled and `routeReflectorClient: false` on leaf peers. - -### 8. Configure the SVI for Routing (RoutedVLAN Interface) + routerId: "10.0.0.1" -For symmetric IRB, create a `RoutedVLAN` `Interface` that references VLAN 100 and lives in `TenantA` VRF. Enabling `ipv4.anycastGateway: true` on this interface causes the controller to use the virtual MAC defined in the NVE resource: - -```yaml -apiVersion: core/v1alpha1 -kind: Interface +--- +# Spine 2 +apiVersion: network.example.io/v1alpha1 +kind: BGP metadata: - name: svi100-leaf01 + name: spine-02-bgp spec: deviceRef: - name: leaf-01 - name: Vlan100 - type: RoutedVLAN - adminState: Up - vlanRef: - name: vlan100-leaf01 - vrfRef: - name: tenant-a-leaf01 - ipv4: - addresses: - - 10.100.0.1/24 - anycastGateway: true + name: spine-02 + asNumber: 65000 + routerId: "10.0.0.2" ``` -The controller sets `vlan100-leaf01.status.routedBy` to reference `svi100-leaf01`. +This design keeps each object's lifecycle independent. You can pause, delete, or update configuration on one device without affecting others. + +### Multi-Vendor + +Multi-vendor support is structured through the provider layer: -### What Happens End-to-End +- The **core CRDs** define the intent in vendor-neutral terms. Controllers translate these specs into vendor-specific API calls. 
+- The **platform CRD layer** (e.g., `api/cisco/nx/v1alpha1`) carries vendor-specific extensions, referenced optionally via `providerConfigRef`. +- The **provider layer** implements device communication. Currently, NX-API is used for Cisco NX-OS; gNMI support is planned for additional platforms. -After all of these resources are applied, each controller independently reconciles its piece: +When a controller reconciles a core CRD, it determines the target platform from the `Device` object (the operator discovers the device platform during initial connection). It then selects the appropriate provider and, if a `providerConfigRef` is present on the spec, merges the platform-specific configuration into the payload before pushing it to the device. -- The `Interface` controller pushes loopback and physical interface configs via NX-API. -- The `VRF` controller creates the VRF with its L3VNI and route targets. -- The `VLAN` controller creates VLAN 100. -- The `NetworkVirtualizationEdge` controller creates the NVE interface. -- The `EVPNInstance` controller creates the MAC-VRF (L2VNI) under the NVE and links it to VLAN 100. -- The `BGP` and `BGPPeer` controllers configure BGP with the EVPN address family and peer sessions. -- The `Interface` controller for the SVI creates the routed VLAN interface with the anycast gateway MAC +This architecture means you can manage heterogeneous fabrics from a single operator instance. Devices running different operating systems co-exist in the same namespace; each controller simply routes to the correct provider implementation based on the resolved `Device`. 
diff --git a/docs/generated/guides/getting-started.md b/docs/generated/guides/getting-started.md index 19f3c06f5..3b30b6a5d 100644 --- a/docs/generated/guides/getting-started.md +++ b/docs/generated/guides/getting-started.md @@ -1,126 +1,161 @@ --- title: Getting Started description: Deploy Network Operator and configure your first network device -gnosis_hash: 4357f246 -body_hash: a7ac0394 +gnosis_hash: aa516507 +body_hash: ff44ad1d --- # Getting Started -This guide walks a data center operator through installing network-operator, registering a Cisco NX-OS switch, pushing an initial interface configuration, and verifying the result. Subsequent sections point toward more advanced topics. - ---- +This guide walks you through installing network-operator on a Kubernetes cluster and provisioning your first Cisco NX-OS switch using declarative CRD-based configuration. By the end, you will have a managed device registered, an interface configured, and the configuration verified as applied. ## Prerequisites -Before you begin, ensure the following are available: +Before you begin, ensure the following tools and resources are available. -- **Kubernetes cluster** (v1.26 or later recommended) with sufficient RBAC permissions to install CRDs and create namespaces. -- **kubectl** configured to talk to that cluster (`kubectl version` should succeed against the target API server). -- **Helm 3** installed locally (`helm version`). -- **Network reachability** from the cluster nodes (or from a dedicated egress point) to the management address of each NX-OS switch. network-operator connects to devices over gRPC/gNMI; ensure TCP port 50051 (or your device's configured gRPC port) is open between the cluster and the management network. -- **Device credentials** — a username and password that have the `network-admin` role on NX-OS, and optionally a CA certificate if the device uses TLS on its gRPC server. 
+### Kubernetes Cluster ---- +You need a running Kubernetes cluster (version 1.24 or later is recommended). The cluster must have network reachability to the out-of-band management interfaces of the switches you intend to manage. + +Verify your cluster is accessible: + +```bash +kubectl cluster-info +``` + +### kubectl + +Install `kubectl` matching your cluster version. Confirm it is working: + +```bash +kubectl version --client +``` + +### Helm + +Install Helm v3.10 or later. Confirm the installation: + +```bash +helm version +``` ## Installing network-operator via Helm -The Helm chart lives under `deploy/helm/network-operator` in the repository. +The network-operator is distributed as a Helm chart located at `charts/network-operator` in the project repository. -### 1. Add or clone the chart +### Add the Helm repository + +If the chart is published to a Helm repository, add it first: ```bash -# Clone the repository and reference the chart locally -git clone https://github.com/your-org/network-operator.git -cd network-operator +helm repo add network-operator https://charts.example.com/network-operator +helm repo update ``` -### 2. Create a dedicated namespace +If you are working from a local checkout of the repository, you can reference the chart path directly in the steps below. + +### Create a namespace + +It is recommended to install network-operator into a dedicated namespace: ```bash kubectl create namespace network-operator ``` -### 3. 
Install the chart +### Install the chart + +Install the Helm chart with the release name `network-operator`: ```bash -helm install network-operator deploy/helm/network-operator \ +helm install network-operator network-operator/network-operator \ --namespace network-operator \ - --set controller.replicaCount=2 + --wait ``` -Verify the controller pod reaches `Running`: +To install from a local chart directory: + +```bash +helm install network-operator ./charts/network-operator \ + --namespace network-operator \ + --wait +``` + +### Verify the installation + +Confirm that the controller pods are running: ```bash kubectl get pods -n network-operator ``` -The controller manages a reconciliation loop for every CRD type. Once running, it watches for `Device`, `Interface`, `BGP`, `VLAN`, and all other resources in any namespace and pushes the desired state to the corresponding device. +You should see output similar to: ---- +``` +NAME READY STATUS RESTARTS AGE +network-operator-controller-7d9f85b-xkp2n 1/1 Running 0 60s +``` + +The controller manages reconciliation loops for each CRD type and begins watching for resources as soon as it is running. ## Registering a Network Device -Every managed resource references a `Device` object. The `Device` CRD (API group `core/v1alpha1`) holds the management address and authentication credentials. +A `Device` resource represents a managed network switch. It contains the management address and the credentials needed to connect to the device. All other CRDs reference a `Device` by name through the `deviceRef` field. -### 1. Store device credentials in a Secret +### Create a credentials secret -The `DeviceSpec.endpoint.secretRef` field requires a secret of type `kubernetes.io/basic-auth` containing `username` and `password` keys. +The controller authenticates to the device using a Kubernetes secret of type `kubernetes.io/basic-auth`. 
Create one for your NX-OS switch: ```bash -kubectl create secret generic leaf01-credentials \ -   --namespace network-operator \ -   --type kubernetes.io/basic-auth \ +kubectl create secret generic nxos-leaf01-creds \ +   --type=kubernetes.io/basic-auth \    --from-literal=username=admin \ -   --from-literal=password= +   --from-literal=password=<password> \ +   --namespace network-operator ``` -### 2. Create the Device resource +### Apply the Device resource + +Create a file named `device-leaf01.yaml` with the following content. The `endpoint.address` field must be in `IP:Port` format, and `endpoint.secretRef.name` must reference the secret created above. ```yaml -# leaf01-device.yaml -apiVersion: core.network-operator.io/v1alpha1 +apiVersion: core.network-operator.example.com/v1alpha1 kind: Device metadata:   name: leaf01   namespace: network-operator spec:   endpoint: -    address: "192.0.2.10:50051"  # NX-OS management IP and gRPC port +    address: "10.0.0.101:57400" secretRef: -      name: leaf01-credentials -      namespace: network-operator +      name: nxos-leaf01-creds ``` Apply it: ```bash -kubectl apply -f leaf01-device.yaml +kubectl apply -f device-leaf01.yaml ``` -After a few seconds the controller discovers the device and populates `status`: +### Verify the device is connected + +The controller will attempt to connect to the device and populate the `status` fields. Check the device status: ```bash kubectl get device leaf01 -n network-operator -o yaml ``` -Look for `status.phase` moving to `Ready`, and check `status.manufacturer`, `status.model`, and `status.firmwareVersion` to confirm the operator has successfully connected. - -> **TLS:** If your NX-OS switch is configured with a self-signed or enterprise CA certificate on the gRPC server, add a `spec.endpoint.tls` block pointing to a Secret that contains the CA certificate under `spec.endpoint.tls.ca`. - ---- +Look for the `status.phase` field and the `status.conditions` list. 
A healthy device will show a phase of `Ready` and a condition with `type: Available` and `status: "True"`. The controller also populates informational fields such as `status.manufacturer`, `status.model`, `status.firmwareVersion`, and `status.serialNumber` when the device is reachable. ## Applying a Basic Interface Configuration -The `Interface` CRD models ethernet, loopback, port-channel, routed-VLAN, and subinterfaces. Every `Interface` resource must reference the owning `Device` via `spec.deviceRef`. +With the device registered, you can now configure its interfaces using the `Interface` CRD. Every `Interface` resource must reference the owning device through `spec.deviceRef.name`. -### Example: configure a routed Layer 3 interface +### Configure a routed Layer 3 interface -The following manifest configures `Ethernet1/1` on `leaf01` as a routed interface with a /30 address, an MTU of 9216 bytes, and a description. +The following example configures a physical Ethernet interface on `leaf01` with an IPv4 address. The `spec.type` field identifies the interface type, `spec.name` must match the interface name on the device, and `spec.adminState` controls whether the interface is brought up. ```yaml -# leaf01-eth1-1.yaml -apiVersion: core.network-operator.io/v1alpha1 +apiVersion: core.network-operator.example.com/v1alpha1 kind: Interface metadata: name: leaf01-eth1-1 @@ -128,75 +163,69 @@ metadata: spec: deviceRef: name: leaf01 - name: Ethernet1/1 - type: Ethernet + name: "Ethernet1/1" + type: Physical adminState: Up description: "Uplink to spine01" mtu: 9216 ipv4: addresses: - - 10.0.0.1/30 + - "192.168.100.1/31" ``` -Apply it: +Apply the resource: ```bash -kubectl apply -f leaf01-eth1-1.yaml +kubectl apply -f interface-eth1-1.yaml ``` -### Example: configure a loopback interface +### Configure a loopback interface -Loopback interfaces are commonly used as BGP router-IDs and NVE source interfaces. 
+Loopback interfaces are commonly used as BGP router IDs and NVE source interfaces in data center fabrics: ```yaml -# leaf01-lo0.yaml -apiVersion: core.network-operator.io/v1alpha1 +apiVersion: core.network-operator.example.com/v1alpha1 kind: Interface metadata: - name: leaf01-lo0 + name: leaf01-loopback0 namespace: network-operator spec: deviceRef: name: leaf01 - name: loopback0 + name: "Loopback0" type: Loopback adminState: Up - description: "Router-ID loopback" + description: "Router ID loopback" ipv4: addresses: - - 10.255.0.1/32 + - "10.0.255.1/32" ``` +Apply it: + ```bash -kubectl apply -f leaf01-lo0.yaml +kubectl apply -f interface-loopback0.yaml ``` -### Key `InterfaceSpec` fields reference - -| Field | Purpose | -|---|---| -| `deviceRef.name` | Name of the `Device` object in the same namespace | -| `name` | Interface name exactly as it appears on the device (e.g., `Ethernet1/1`, `loopback0`) | -| `type` | `Ethernet`, `Loopback`, `Aggregate`, `RoutedVLAN`, `Subinterface` | -| `adminState` | `Up` or `Down` | -| `mtu` | Packet MTU in bytes | -| `ipv4.addresses` | List of IPv4 CIDR prefixes; first entry is the primary address | -| `switchport` | Layer 2 switchport configuration (access or trunk mode) | -| `vrfRef` | Assigns the interface to a non-default VRF | - ---- - ## Verifying the Configuration Was Pushed to the Device -The operator follows a reconcile-then-report model: after applying a manifest it attempts to push the desired state to the device and then reflects the outcome in the resource's `status.conditions`. +The network-operator controller reconciles each resource against the actual device state and reports the result through the `status` field of the CRD. -### Check conditions on the Interface resource +### Check the Interface status ```bash kubectl get interface leaf01-eth1-1 -n network-operator -o yaml ``` -A successful push produces a condition similar to: +Examine the `status.conditions` list in the output. 
The controller uses standard condition types: + +| Condition type | Meaning | +|---|---| +| `Available` | The configuration has been successfully applied and is active on the device. | +| `Progressing` | The controller is currently applying the configuration. | +| `Degraded` | The controller encountered an error pushing the configuration. | + +A successfully applied interface will show a condition similar to: ```yaml status: @@ -204,73 +233,57 @@ status: - type: Available status: "True" reason: ConfigurationApplied - message: "Interface configuration successfully applied to device" - lastTransitionTime: "2024-11-01T10:15:00Z" + lastTransitionTime: "2024-01-15T10:30:00Z" ``` -If the push fails the `status` field is `"False"` and `message` contains the error returned by the device. - -### Useful one-liners +### Use kubectl to list all interface statuses ```bash -# Watch all Interface resources in the namespace -kubectl get interfaces -n network-operator -w - -# Check conditions across all managed resources for leaf01 -kubectl get interfaces,bgp,vlan,vrf -n network-operator \ - -l core.network-operator.io/device=leaf01 - -# Describe a specific resource for full event and condition history -kubectl describe interface leaf01-eth1-1 -n network-operator +kubectl get interfaces -n network-operator ``` -### Confirm on the device directly +### Confirm directly on the device -SSH to `leaf01` and verify the configuration was applied: +You can also verify the configuration directly on the NX-OS switch using the NX-OS CLI: ``` +leaf01# show interface Ethernet1/1 leaf01# show running-config interface Ethernet1/1 -interface Ethernet1/1 - description Uplink to spine01 - mtu 9216 - ip address 10.0.0.1/30 - no shutdown ``` ---- +The IP address, MTU, description, and administrative state should match what was declared in the `Interface` resource. 
-## Next Steps +### Pause reconciliation for troubleshooting -With a device registered and a first interface configured, you are ready to build out the rest of the fabric. The sections below highlight the most commonly used CRDs for a NX-OS data center deployment. +If you need to temporarily stop the controller from reconciling a device (for example, during a maintenance window), set `spec.paused: true` on the `Device` resource: -### BGP and BGP Peers +```bash +kubectl patch device leaf01 -n network-operator \ + --type=merge -p '{"spec":{"paused":true}}' +``` -Use the `BGP` CRD to configure an eBGP or iBGP instance on `leaf01`, setting `spec.asNumber` and `spec.routerId`. Add neighbors with individual `BGPPeer` resources, each referencing the parent BGP instance via `spec.bgpRef`. For address-family configuration specific to NX-OS (such as EVPN PIP advertisement), attach a `BGPConfig` resource using `spec.providerConfigRef`. +Remember to set `paused: false` to resume normal reconciliation. -### VLANs +## Next Steps -Create `VLAN` resources with `spec.id` (1–4094), `spec.name`, and `spec.deviceRef`. For layer 3 routing over a VLAN, add a `RoutedVLAN` type `Interface` that references the VLAN via `spec.vlanRef`. +With a device registered and a basic interface configured, you are ready to build out the rest of your data center network using network-operator's declarative CRDs. -### VRFs +### BGP -Use the `VRF` CRD to define tenant VRFs (`spec.name`, `spec.routeDistinguisher`, `spec.routeTargets`). Assign interfaces to VRFs via `spec.vrfRef` on the `Interface` resource. +Configure BGP routing using the `BGP` and `BGPPeer` CRDs. The `BGP` resource sets the router-level parameters such as `spec.asNumber` and `spec.routerId`, while `BGPPeer` resources define individual neighbor sessions. Each `BGPPeer` references the parent `BGP` instance through `spec.bgpRef.name`. 
For Cisco NX-OS-specific address-family tuning such as advertising the primary IP (`advertisePIP`) or gateway IP export, use the `BGPConfig` CRD from the `api/cisco/nx/v1alpha1` package. -### Routing Policies and Prefix Sets +### VLANs -Define match criteria with `PrefixSet` resources, then reference them from `RoutingPolicy` statements (`spec.statements[].conditions.matchPrefixSet`). Attach policies to BGP peers via `spec.addressFamilies.ipv4Unicast.inboundRoutingPolicyRef` or `outboundRoutingPolicyRef` on `BGPPeer`. +Define VLANs with the `VLAN` CRD using `spec.id` and `spec.name`. Layer 3 switching for a VLAN is enabled by creating an `Interface` of type `RoutedVLAN` that references the VLAN via `spec.vlanRef.name`. -### VXLAN / EVPN Overlay +### Routing Policies -Create a `NetworkVirtualizationEdge` (NVE) resource with `spec.sourceInterfaceRef` pointing to a loopback, configure `EVPNInstance` resources for each L2VNI or L3VNI, and enable the L2VPN EVPN address family in BGP. For NX-OS-specific NVE settings (hold-down time, infra VLANs), attach a `NetworkVirtualizationEdgeConfig` via `spec.providerConfigRef`. +Control route advertisement and filtering using `PrefixSet` and `RoutingPolicy` resources. A `PrefixSet` defines named prefix lists referenced by `RoutingPolicy` statements. Policy statements contain `conditions` (prefix matching) and `actions` (route disposition and BGP attribute manipulation such as community tagging or AS-path prepending). Routing policies are attached to BGP peers via the `spec.addressFamilies.ipv4Unicast.inboundRoutingPolicyRef` and `outboundRoutingPolicyRef` fields on `BGPPeer`. 
-### Device Services +### VXLAN and EVPN -| CRD | Purpose | -|---|---| -| `NTP` | NTP server configuration and source interface | -| `DNS` | DNS servers, default domain, and source interface | -| `Syslog` | Remote syslog servers and facility configuration | -| `SNMP` | SNMP communities, trap destinations, and notification types | -| `LLDP` | System-wide and per-interface LLDP control | +For VXLAN overlay fabrics, configure `NetworkVirtualizationEdge` (NVE) resources to define the VTEP, `EVPNInstance` resources to define VXLAN Network Identifiers (VNIs), and VRFs with route targets for L3VNI routing. Enable the `l2vpnEvpn` address family on your `BGP` resource to exchange EVPN routes between VTEPs. + +### Additional Device Services -Each service CRD follows the same pattern: set `spec.deviceRef.name` to the target device, fill in the relevant fields, and the controller reconciles the change onto the device. +network-operator also manages operational services on NX-OS devices including `NTP`, `DNS`, `Syslog`, `SNMP`, `LLDP`, `User`, and `Banner` resources. Each follows the same pattern: create a resource in the same namespace as the target `Device` and reference it via `spec.deviceRef.name`. 
From 799db90af8cf6dc7233bfd3c2fdec0401a8438f3 Mon Sep 17 00:00:00 2001 From: d062260 Date: Tue, 5 May 2026 10:11:54 +0200 Subject: [PATCH 3/3] refine gnosis prompts --- docs/generated/guides/architecture.md | 246 ++++++++++------------- docs/generated/guides/getting-started.md | 246 +++++++++-------------- 2 files changed, 201 insertions(+), 291 deletions(-) diff --git a/docs/generated/guides/architecture.md b/docs/generated/guides/architecture.md index f8fdfd3e9..ea7511ce5 100644 --- a/docs/generated/guides/architecture.md +++ b/docs/generated/guides/architecture.md @@ -1,237 +1,203 @@ --- title: Architecture description: How Network Operator reconciles declarative CRDs into device configurations -gnosis_hash: 0f39c23c -body_hash: e3d86a7b +gnosis_hash: 605d5949 +body_hash: dcea5613 --- # Architecture -## Overview - -Network Operator is a set of Kubernetes controllers that reconcile CRD specifications into live network device configurations. The core idea is simple: you describe the desired state of a network device in a YAML manifest, apply it to Kubernetes, and the operator pushes the corresponding configuration to the device. No scripting, no manual CLI sessions — the operator handles translation and delivery. - -The system follows standard controller-runtime patterns: watch CRDs for changes, compare desired state against actual device state, compute a diff, and push updates. This makes it composable with standard Kubernetes tooling — GitOps pipelines, admission webhooks, RBAC, and status monitoring all work as expected. - ---- +Network Operator is a set of Kubernetes controllers that continuously reconcile CRD manifests into running configuration on network devices. If you are familiar with how cert-manager or external-dns work, the model is the same: you describe desired state in YAML, Kubernetes stores it, and a controller loop makes the device match that description. 
## The Reconciliation Model -When you apply a manifest to Kubernetes, the following sequence takes place: - -1. **You apply a CRD manifest.** For example, an `Interface` spec describing a routed interface with an IPv4 address, or a `BGP` spec describing a BGP router instance. - -2. **The controller detects the change.** controller-runtime watches the relevant CRD type and enqueues a reconciliation request whenever the object is created, updated, or deleted. +Every configuration resource in network-operator follows the same lifecycle: -3. **The controller resolves the target Device.** Every configuration CRD carries a `deviceRef` field (of type `LocalObjectReference`) that names the `Device` object in the same namespace. The controller fetches that `Device` to obtain connection details. +1. **You apply a manifest.** For example, an `Interface` or a `BGP` object with the settings you want. +2. **The controller detects the change.** Controllers are built on `controller-runtime` and watch their respective CRD kinds. Any create, update, or delete event triggers a reconcile. +3. **The controller resolves the target device.** Every configuration CRD carries a `deviceRef` field (a `LocalObjectReference`) that names the `Device` object in the same namespace. The controller looks up that `Device` to retrieve the management endpoint and credentials. +4. **The controller builds a platform-native payload.** It translates the abstract spec fields into the API format the device understands — for NX-OS, this is NX-API JSON. Other transports (e.g. gNMI) are planned. +5. **The controller pushes the configuration and updates status.** After the device acknowledges the change, the controller writes the result back to the resource's `.status.conditions`. -4. **The controller builds the platform-native payload.** Using the spec fields, the controller constructs the vendor-specific API call — for example, an NX-API JSON payload for Cisco NX-OS. - -5. 
**The controller pushes the config to the device and updates status.** After a successful push, the controller writes status conditions back to the object. On failure, it sets a `Degraded` condition and requeues for retry. - -6. **Finalizers ensure cleanup on deletion.** When you delete a CRD object, the finalizer prevents immediate removal until the controller has removed the corresponding configuration from the device. - -A concrete example: applying a `VRF` manifest with `deviceRef.name: leaf-01` causes the VRF controller to look up the `Device` named `leaf-01`, connect to it, and configure the VRF with the specified name, VNI, route distinguisher, and route targets. - ---- +The loop is level-triggered, not edge-triggered. If a push fails, the controller re-queues and retries. If someone manually changes the device outside of Kubernetes, the next reconcile cycle detects the drift and corrects it. ## API Layers -The API is structured in four conceptual layers, from physical to intent: +The API is structured in four layers, each building on the one below: -| Layer | Description | -|---|---| -| **Physical** | Devices, interfaces, links — the raw hardware representation | -| **Bricks** | Vendor-abstract configuration; one brick maps to one device with status | -| **Transit** | Translates network demands into brick configurations | -| **Intent** | High-level constructs: networks, external connections, routing domains | +| Layer | Purpose | Examples | +|---|---|---| +| **Physical** | Physical inventory — devices, interfaces, links | `Device`, `Interface` | +| **Bricks** | Vendor-abstract configuration, one brick per device | `BGP`, `OSPF`, `VRF`, `VLAN` | +| **Transit** | Translates network demands into brick configs | Routing policies, prefix sets | +| **Intent** | High-level intent — networks, external connections, routing domains | `EVPNInstance`, `NetworkVirtualizationEdge` | -Most operators interact with the Physical and Bricks layers directly. 
The higher layers compose those primitives into fabric-wide constructs. - ---- +Most day-to-day operator work happens at the Bricks and Intent layers. The Physical layer resources (`Device`) are typically created once during initial setup. ## Core CRDs and Platform-Specific CRDs -### Core CRDs +Network-operator separates **what you want** from **how a specific platform implements it**. -Core CRDs, defined under `api/core/v1alpha1`, express platform-agnostic intent. They cover a broad range of network constructs: - -- **Physical layer:** `Device`, `Interface`, `VLAN` -- **Routing:** `BGP`, `BGPPeer`, `OSPF`, `ISIS`, `PIM`, `VRF`, `RoutingPolicy`, `PrefixSet` -- **Overlay:** `EVPNInstance`, `NetworkVirtualizationEdge`, `DHCPRelay` -- **Management & security:** `NTP`, `DNS`, `Syslog`, `SNMP`, `Banner`, `User`, `Certificate`, `AccessControlList`, `ManagementAccess` -- **Platform features:** `LLDP`, `VPCDomain`, `BorderGateway`, `System` +### Core CRDs -Each of these types exposes fields that are meaningful across vendors. For example, `BGPSpec` defines `asNumber`, `routerId`, and `addressFamilies` — concepts that exist on every BGP implementation. +Core CRDs live in the `api/core/v1alpha1` package. They express vendor-neutral intent using fields that map to standard networking concepts. Examples include: -Every core config CRD has a `providerConfigRef` field (of type `*TypedLocalObjectReference`) that optionally links to a platform-specific configuration object. If omitted, the provider applies the platform's default settings. +- `Interface` — describes an interface with `name`, `type`, `adminState`, `ipv4`, `mtu`, `switchport`, and references to `vlanRef`, `vrfRef`, and `parentInterfaceRef`. +- `BGP` — describes a BGP router with `asNumber`, `routerId`, `addressFamilies`, and an optional `vrfRef`. +- `VRF` — describes a VRF with `name`, `routeDistinguisher`, and `routeTargets`. 
+- `VLAN`, `OSPF`, `ISIS`, `PIM`, `EVPNInstance`, `NetworkVirtualizationEdge`, and many others. ### Platform-Specific CRDs -Platform CRDs, defined under `api/cisco/nx/v1alpha1` (and similar paths for other vendors), carry vendor-specific knobs that have no generic equivalent. Examples include: +Platform CRDs live in vendor-specific packages (e.g. `api/cisco/nx/v1alpha1`). They provide vendor knobs that have no generic equivalent. Examples: -- **`InterfaceConfig`** — NX-OS-specific interface settings such as spanning-tree port type, BPDU guard, buffer boost, and LACP vPC convergence options. -- **`LLDPConfig`** — NX-OS LLDP `initDelay` and `holdTime` values. -- **`BGPConfig`** — NX-OS-specific BGP settings such as PIP advertisement for EVPN and gateway IP export for symmetric IRB. -- **`NetworkVirtualizationEdgeConfig`** — NX-OS NVE settings including virtual MAC advertisement and infra-VLAN list. -- **`ManagementAccessConfig`** — NX-OS console timeout and SSH VTY ACL settings. -- **`VPCDomain`** — Cisco vPC domain configuration including peer-link, keepalive, auto-recovery, and role priority. +- `InterfaceConfig` — adds NX-OS-specific settings like `SpanningTree` port type, `BufferBoost`, and LACP `vpcConvergence` options. +- `BGPConfig` — adds NX-OS-specific BGP address family settings such as `advertisePIP` for EVPN and `exportGatewayIP` for symmetric IRB. +- `LLDPConfig` — adds NX-OS-specific `initDelay` and `holdTime` timers. +- `NetworkVirtualizationEdgeConfig` — adds NX-OS NVE options like `advertiseVirtualMAC`, `holdDownTime`, and `infraVLANs`. +- `ManagementAccessConfig` — adds NX-OS console timeout and SSH ACL settings. +- `System` — adds NX-OS system-level settings: `jumboMtu`, `reservedVlan`, `vlanLongName`. +- `VPCDomain`, `BorderGateway` — NX-OS-specific constructs for vPC and EVPN multisite. -The relationship is: the core CRD references the platform CRD via `providerConfigRef`. 
This keeps the core manifest portable while allowing per-platform customisation where needed. +### Linking Core to Platform CRDs + +Core CRDs carry an optional `providerConfigRef` field of type `TypedLocalObjectReference`. When set, this field points to the corresponding platform-specific resource: ```yaml -# Core CRD — platform-agnostic -apiVersion: network.example.io/v1alpha1 -kind: LLDP +# Core CRD +apiVersion: core/v1alpha1 +kind: Interface +metadata: + name: eth1-1 spec: deviceRef: - name: leaf-01 - adminState: Up + name: leaf01 providerConfigRef: - apiVersion: network.example.io/v1alpha1 - kind: LLDPConfig - name: leaf-01-lldp-config - ---- -# Platform CRD — NX-OS specific knobs -apiVersion: network.cisco.nx/v1alpha1 -kind: LLDPConfig -spec: - initDelay: 5 - holdTime: 120 + apiVersion: cisco.nx/v1alpha1 + kind: InterfaceConfig + name: eth1-1-nxos + name: Ethernet1/1 + type: Physical + adminState: Up ``` ---- +This decoupling lets you keep environment-independent intent in core resources and vendor-specific tuning in platform resources, rather than embedding NX-OS CLI details directly into the core spec. ## Device Registration and Credentials -Before any configuration CRD can be reconciled, a `Device` object must exist in the same namespace. +Before any configuration resource can be reconciled, a `Device` object must exist in the same namespace. -### Device Spec +### The Device Object -`DeviceSpec` contains two key sections: +`DeviceSpec` holds two mandatory pieces of information: -**`endpoint`** (required) — specifies how to reach the device: -- `address`: management address in `IP:Port` format. -- `secretRef`: references a Kubernetes `Secret` of type `kubernetes.io/basic-auth`. The secret must contain `username` and `password` keys. -- `tls`: optional TLS configuration. The `ca` field selects a secret key for the CA certificate. The `certificate` field enables mTLS by referencing a `kubernetes.io/tls` secret containing `tls.crt` and `tls.key`. 
+- **`endpoint.address`** — the management address of the device in `IP:Port` format. +- **`endpoint.secretRef`** — a reference to a Kubernetes Secret of type `kubernetes.io/basic-auth` containing `username` and `password` keys. -**`provisioning`** (optional) — used for zero-touch provisioning. It carries an `image` reference (URL, checksum, checksum type) and a `bootScript` that can be sourced inline, from a `Secret`, or from a `ConfigMap`. +Optionally, TLS can be configured via `endpoint.tls`, which accepts a CA certificate (`tls.ca`) and optionally a client certificate and key (`tls.certificate`) for mutual TLS. ```yaml -apiVersion: network.example.io/v1alpha1 +apiVersion: core/v1alpha1 kind: Device metadata: - name: leaf-01 + name: leaf01 spec: endpoint: address: "192.0.2.10:443" secretRef: - name: leaf-01-credentials - paused: false + name: leaf01-credentials + namespace: network ``` -The `Device` resource is also where the operator writes back hardware inventory: `DeviceStatus` exposes `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, `lastRebootTime`, and a `ports` list detailing each physical port and any associated `Interface` resource. - -All configuration CRDs reference their device by name: - -```yaml -spec: - deviceRef: - name: leaf-01 -``` +The `Device` also supports a `provisioning` field for bootstrap workflows (boot scripts, images), and a `paused` flag to halt all reconciliation activity on the device and all its child resources. -The `deviceRef` field is immutable — moving a configuration object to a different device requires deleting and recreating it. +### DeviceRef in Configuration Resources -### Pausing +Every configuration CRD's spec includes a required `deviceRef` field. This is always a `LocalObjectReference` — it names a `Device` in the same namespace. The field is immutable after creation, meaning a configuration resource is permanently bound to one device. 
To move a config to a different device, you delete and recreate the resource. -`DeviceSpec` includes a `paused` boolean. When set to `true`, the device controller and all controllers managing objects that reference that device halt reconciliation. This is useful during maintenance windows or when you need to apply configuration changes manually without interference. +### What the Device Status Reports ---- +After connecting to a device, the controller populates `DeviceStatus` with discovered information: `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, `lastRebootTime`, a list of physical `ports`, and a human-readable `portSummary`. The `phase` field reflects the device's current lifecycle state. ## Status Conditions and Finalizers ### Status Conditions -Every CRD exposes a `status.conditions` field, a list of `metav1.Condition` objects. The operator uses three standard condition types: +Every CRD — `Device`, `Interface`, `BGP`, `VRF`, etc. — has a `.status.conditions` field that contains a list of `metav1.Condition` objects. Conditions follow the standard Kubernetes convention with `type`, `status` (`True`/`False`/`Unknown`), `reason`, and `message`. -| Type | Meaning | -|---|---| -| `Available` | The resource is fully functional and the configuration is applied on the device | -| `Progressing` | The resource is being created or updated | -| `Degraded` | The resource failed to reach or maintain its desired state | +Standard condition types used across resources: -Each condition has a `status` of `True`, `False`, or `Unknown`, along with a `reason` and `message` that give actionable detail. +- **`Available`** — the resource is fully functional and the configuration is active on the device. +- **`Progressing`** — the controller is currently applying the configuration. +- **`Degraded`** — the configuration could not be applied or the device is not in the desired state. -Some resources expose richer status fields beyond conditions. 
For example: +Some resources expose richer status beyond conditions. For example: -- `OSPFStatus` provides `neighbors` (a list of `OSPFNeighbor` with adjacency states) and an `adjacencySummary` string. -- `BGPPeerStatus` provides `sessionState`, `lastEstablishedTime`, and per-address-family `advertisedPrefixes` and `acceptedPrefixes` counts. -- `VPCDomainStatus` reports `role`, `keepaliveStatus`, `peerStatus`, and `peerLinkIfOperStatus`. -- `DeviceStatus` provides a `phase` and full hardware inventory. +- `BGPPeer` status includes `sessionState`, `lastEstablishedTime`, and per-address-family prefix counts (`acceptedPrefixes`, `advertisedPrefixes`). +- `OSPF` status includes a `neighbors` list with adjacency states and an `adjacencySummary`. +- `VPCDomain` status includes `role`, `keepaliveStatus`, `peerStatus`, and `peerLinkIfOperStatus`. +- `VLAN` status tracks which interface is providing Layer 3 routing (`routedBy`) and which EVPN instance provides the L2VNI (`bridgedBy`). -These fields let you build monitoring and alerting on top of standard Kubernetes tooling (e.g., Prometheus with `kube-state-metrics`, or `kubectl get` for quick operational checks). +To check whether a resource has been successfully applied, inspect the conditions: -### Finalizers - -All configuration CRDs use finalizers to ensure clean removal from the device when you delete the Kubernetes object. The sequence is: +```bash +kubectl get interface eth1-1 -o jsonpath='{.status.conditions}' +``` -1. You run `kubectl delete`. -2. Kubernetes sets the `deletionTimestamp` but does not remove the object because a finalizer is present. -3. The controller detects the deletion, removes the corresponding configuration from the device, then removes the finalizer. -4. Kubernetes completes the deletion. +### Finalizers -This prevents orphaned configuration on devices when Kubernetes objects are removed. 
+Finalizers ensure that when you delete a CRD resource, the controller first removes the corresponding configuration from the device before Kubernetes removes the object. Without finalizers, deleting a Kubernetes object would leave orphaned configuration on the device. -### Ownership +The finalizer is added to a resource when the controller first reconciles it. On deletion, Kubernetes sets a deletion timestamp but does not remove the object. The controller sees the deletion timestamp, pushes a removal operation to the device, then removes the finalizer to let Kubernetes complete the deletion. -Child resources are owned by their parent `Device`. This means cascading behaviour works as expected: if a `Device` is removed, owned resources are garbage-collected according to Kubernetes owner reference semantics. +### Pausing Reconciliation ---- +The `Device` spec includes a `paused` field. Setting it to `true` halts reconciliation for the device and all configuration resources that reference it. This is useful when performing manual maintenance or investigating issues without triggering automated changes. ## Multi-Device and Multi-Vendor Support ### Multi-Device -The operator supports arbitrarily many devices in a single namespace. Each `Device` object represents one physical or virtual network device. Configuration CRDs are scoped to individual devices via `deviceRef` — there is no implicit sharing of configuration across devices. - -To apply the same logical configuration to multiple devices (for example, identical BGP settings on a spine tier), you create one CRD instance per device: +Each configuration resource is scoped to exactly one device through its immutable `deviceRef`. 
To configure the same feature on multiple devices, you create one resource per device: ```yaml -# Spine 1 -apiVersion: network.example.io/v1alpha1 +# BGP on leaf01 +apiVersion: core/v1alpha1 kind: BGP metadata: - name: spine-01-bgp + name: leaf01-bgp spec: deviceRef: - name: spine-01 - asNumber: 65000 - routerId: "10.0.0.1" + name: leaf01 + asNumber: 65001 + routerId: 10.0.0.1 --- -# Spine 2 -apiVersion: network.example.io/v1alpha1 +# BGP on leaf02 +apiVersion: core/v1alpha1 kind: BGP metadata: - name: spine-02-bgp + name: leaf02-bgp spec: deviceRef: - name: spine-02 - asNumber: 65000 - routerId: "10.0.0.2" + name: leaf02 + asNumber: 65002 + routerId: 10.0.0.2 ``` -This design keeps each object's lifecycle independent. You can pause, delete, or update configuration on one device without affecting others. +Controllers reconcile all resources concurrently. There is no ordering dependency between resources on different devices unless you express it through cross-references (for example, a `BGPPeer` referencing a `BGP` instance via `bgpRef`). + +### Ownership + +Child resources are owned by their parent `Device`. This means that when a `Device` is deleted, all configuration resources referencing it are also subject to cleanup through the finalizer mechanism. ### Multi-Vendor -Multi-vendor support is structured through the provider layer: +Vendor support is structured through the provider layer. Each vendor implements a provider that understands how to translate core CRD specs into device-native API calls: -- The **core CRDs** define the intent in vendor-neutral terms. Controllers translate these specs into vendor-specific API calls. -- The **platform CRD layer** (e.g., `api/cisco/nx/v1alpha1`) carries vendor-specific extensions, referenced optionally via `providerConfigRef`. -- The **provider layer** implements device communication. Currently, NX-API is used for Cisco NX-OS; gNMI support is planned for additional platforms. +- **NX-OS** uses NX-API (HTTP/JSON). 
This provider is currently implemented. +- **gNMI** is planned as an additional transport. -When a controller reconciles a core CRD, it determines the target platform from the `Device` object (the operator discovers the device platform during initial connection). It then selects the appropriate provider and, if a `providerConfigRef` is present on the spec, merges the platform-specific configuration into the payload before pushing it to the device. +A different vendor would implement a new provider that consumes the same core CRDs and translates them into its own wire format. Platform-specific CRDs (like `InterfaceConfig` for NX-OS) are vendor-namespaced and linked to core resources via `providerConfigRef`, so adding a new vendor does not require changes to core CRD definitions. -This architecture means you can manage heterogeneous fabrics from a single operator instance. Devices running different operating systems co-exist in the same namespace; each controller simply routes to the correct provider implementation based on the resolved `Device`. +From an operator's perspective, the YAML you write for core resources (`Interface`, `BGP`, `VRF`, etc.) is identical regardless of vendor. Vendor-specific tuning is expressed separately in platform CRDs and linked in by reference. diff --git a/docs/generated/guides/getting-started.md b/docs/generated/guides/getting-started.md index 3b30b6a5d..fa4801b88 100644 --- a/docs/generated/guides/getting-started.md +++ b/docs/generated/guides/getting-started.md @@ -1,289 +1,233 @@ --- title: Getting Started description: Deploy Network Operator and configure your first network device -gnosis_hash: aa516507 -body_hash: ff44ad1d +gnosis_hash: 4124bd7e +body_hash: e6312e7f --- # Getting Started -This guide walks you through installing network-operator on a Kubernetes cluster and provisioning your first Cisco NX-OS switch using declarative CRD-based configuration. 
By the end, you will have a managed device registered, an interface configured, and the configuration verified as applied. +This guide walks you through installing network-operator, registering your first Cisco NX-OS switch, pushing an interface configuration, and verifying that the configuration was applied. By the end, you will have a working foundation to build out BGP, VLANs, and routing policies. ## Prerequisites -Before you begin, ensure the following tools and resources are available. +Before you begin, ensure the following are available in your environment: -### Kubernetes Cluster - -You need a running Kubernetes cluster (version 1.24 or later is recommended). The cluster must have network reachability to the out-of-band management interfaces of the switches you intend to manage. - -Verify your cluster is accessible: +- **Kubernetes cluster** (v1.26 or later recommended) with sufficient permissions to install CRDs and deploy controllers. A single namespace is sufficient for getting started. +- **kubectl** configured to reach the cluster (`kubectl cluster-info` should succeed). +- **Helm 3.10+** for chart installation (`helm version` should succeed). +- **Network reachability** from the Kubernetes worker nodes to the management address of your NX-OS switches. The controller connects to each device over the address specified in the `Device` resource. +- **Device credentials** stored in a Kubernetes Secret of type `kubernetes.io/basic-auth` containing `username` and `password` keys. Create one now: ```bash -kubectl cluster-info -``` - -### kubectl - -Install `kubectl` matching your cluster version. Confirm it is working: - -```bash -kubectl version --client +kubectl create namespace network-operator +kubectl -n network-operator create secret generic spine-01-creds \ + --type=kubernetes.io/basic-auth \ + --from-literal=username=admin \ + --from-literal=password= ``` -### Helm - -Install Helm v3.10 or later. 
Confirm the installation: - -```bash -helm version -``` +--- ## Installing network-operator via Helm -The network-operator is distributed as a Helm chart located at `charts/network-operator` in the project repository. - -### Add the Helm repository - -If the chart is published to a Helm repository, add it first: +Add the chart repository and install the operator into the `network-operator` namespace: ```bash helm repo add network-operator https://charts.example.com/network-operator helm repo update -``` -If you are working from a local checkout of the repository, you can reference the chart path directly in the steps below. - -### Create a namespace - -It is recommended to install network-operator into a dedicated namespace: - -```bash -kubectl create namespace network-operator -``` - -### Install the chart - -Install the Helm chart with the release name `network-operator`: - -```bash helm install network-operator network-operator/network-operator \ --namespace network-operator \ + --create-namespace \ --wait ``` -To install from a local chart directory: - -```bash -helm install network-operator ./charts/network-operator \ - --namespace network-operator \ - --wait -``` - -### Verify the installation - -Confirm that the controller pods are running: +Verify that the controller pod is running: ```bash -kubectl get pods -n network-operator +kubectl -n network-operator get pods ``` You should see output similar to: ``` -NAME READY STATUS RESTARTS AGE -network-operator-controller-7d9f85b-xkp2n 1/1 Running 0 60s +NAME READY STATUS RESTARTS AGE +network-operator-controller- 1/1 Running 0 30s ``` -The controller manages reconciliation loops for each CRD type and begins watching for resources as soon as it is running. - -## Registering a Network Device - -A `Device` resource represents a managed network switch. It contains the management address and the credentials needed to connect to the device. All other CRDs reference a `Device` by name through the `deviceRef` field. 
- -### Create a credentials secret - -The controller authenticates to the device using a Kubernetes secret of type `kubernetes.io/basic-auth`. Create one for your NX-OS switch: +The Helm chart installs all CRDs automatically. Confirm they are registered: ```bash -kubectl create secret generic nxos-leaf01-creds \ - --type=kubernetes.io/basic-auth \ - --from-literal=username=admin \ - --from-literal=password= \ - --namespace network-operator +kubectl get crds | grep network-operator ``` -### Apply the Device resource +--- + +## Registering a Network Device -Create a file named `device-leaf01.yaml` with the following content. The `endpoint.address` field must be in `IP:Port` format, and `endpoint.secretRef.name` must reference the secret created above. +Create a `Device` resource to register your NX-OS switch with the operator. The `spec.endpoint.address` field must be the management IP and port of the device. The `spec.endpoint.secretRef` field points to the `kubernetes.io/basic-auth` Secret you created above. ```yaml +# spine-01.yaml apiVersion: core.network-operator.example.com/v1alpha1 kind: Device metadata: - name: leaf01 + name: spine-01 namespace: network-operator spec: endpoint: - address: "10.0.0.101:57400" + address: "10.0.0.1:22" secretRef: - name: nxos-leaf01-creds + name: spine-01-creds + namespace: network-operator ``` Apply it: ```bash -kubectl apply -f device-leaf01.yaml +kubectl apply -f spine-01.yaml ``` -### Verify the device is connected - -The controller will attempt to connect to the device and populate the `status` fields. Check the device status: +The controller will connect to the device and populate `status` fields including `manufacturer`, `model`, `serialNumber`, `firmwareVersion`, and the list of physical ports. Check the device status: ```bash -kubectl get device leaf01 -n network-operator -o yaml +kubectl -n network-operator get device spine-01 -o yaml ``` -Look for the `status.phase` field and the `status.conditions` list. 
A healthy device will show a phase of `Ready` and a condition with `type: Available` and `status: "True"`. The controller also populates informational fields such as `status.manufacturer`, `status.model`, `status.firmwareVersion`, and `status.serialNumber` when the device is reachable. +Look for `status.phase` to become `Ready` and inspect the discovered `status.ports`. A healthy device will also have a `status.conditions` entry of type `Ready` with `status: "True"`. -## Applying a Basic Interface Configuration +> **Note:** If `status.phase` does not become `Ready` within a few minutes, verify network reachability and check the controller logs: +> ```bash +> kubectl -n network-operator logs -l app=network-operator-controller +> ``` -With the device registered, you can now configure its interfaces using the `Interface` CRD. Every `Interface` resource must reference the owning device through `spec.deviceRef.name`. +### Pausing reconciliation -### Configure a routed Layer 3 interface +If you need to temporarily prevent the operator from pushing changes to a device (for example, during a maintenance window), set `spec.paused: true` on the `Device` resource. The controller will stop reconciling all objects associated with that device until `paused` is removed or set back to `false`. -The following example configures a physical Ethernet interface on `leaf01` with an IPv4 address. The `spec.type` field identifies the interface type, `spec.name` must match the interface name on the device, and `spec.adminState` controls whether the interface is brought up. +--- + +## Applying a Basic Interface Configuration + +Once the device is registered, create an `Interface` resource to configure a routed Layer 3 interface. Every interface resource must reference its owning device via the `spec.deviceRef.name` field. 
+ +The following example configures `Ethernet1/1` on `spine-01` as a routed Layer 3 interface with an IPv4 address and an MTU of 9216 bytes, which is typical for a data center fabric spine uplink. ```yaml +# spine-01-eth1-1.yaml apiVersion: core.network-operator.example.com/v1alpha1 kind: Interface metadata: - name: leaf01-eth1-1 + name: spine-01-eth1-1 namespace: network-operator spec: deviceRef: - name: leaf01 - name: "Ethernet1/1" - type: Physical + name: spine-01 + name: Ethernet1/1 + type: Ethernet adminState: Up - description: "Uplink to spine01" + description: "Uplink to leaf-01 Ethernet1/49" mtu: 9216 ipv4: addresses: - - "192.168.100.1/31" + - "192.168.100.0/31" ``` -Apply the resource: +Apply it: ```bash -kubectl apply -f interface-eth1-1.yaml +kubectl apply -f spine-01-eth1-1.yaml ``` -### Configure a loopback interface +### Configuring a loopback interface -Loopback interfaces are commonly used as BGP router IDs and NVE source interfaces in data center fabrics: +Loopback interfaces are commonly used as BGP router IDs and NVE source interfaces. Configure one alongside your Ethernet interface: ```yaml +# spine-01-lo0.yaml apiVersion: core.network-operator.example.com/v1alpha1 kind: Interface metadata: - name: leaf01-loopback0 + name: spine-01-lo0 namespace: network-operator spec: deviceRef: - name: leaf01 - name: "Loopback0" + name: spine-01 + name: loopback0 type: Loopback adminState: Up - description: "Router ID loopback" + description: "Router ID / BGP source" ipv4: addresses: - - "10.0.255.1/32" + - "10.255.0.1/32" ``` -Apply it: - ```bash -kubectl apply -f interface-loopback0.yaml +kubectl apply -f spine-01-lo0.yaml ``` -## Verifying the Configuration Was Pushed to the Device +--- -The network-operator controller reconciles each resource against the actual device state and reports the result through the `status` field of the CRD. 
+## Verifying the Configuration Was Pushed to the Device -### Check the Interface status +After applying an `Interface` resource, the controller reconciles the desired state against the device. Check the resource status: ```bash -kubectl get interface leaf01-eth1-1 -n network-operator -o yaml +kubectl -n network-operator get interface spine-01-eth1-1 -o yaml ``` -Examine the `status.conditions` list in the output. The controller uses standard condition types: - -| Condition type | Meaning | -|---|---| -| `Available` | The configuration has been successfully applied and is active on the device. | -| `Progressing` | The controller is currently applying the configuration. | -| `Degraded` | The controller encountered an error pushing the configuration. | - -A successfully applied interface will show a condition similar to: +In the output, inspect the `status.conditions` list. A successfully applied configuration will contain a condition of type `Ready` with `status: "True"` and a `reason` indicating the configuration was pushed. For example: ```yaml status: conditions: - - type: Available + - type: Ready status: "True" reason: ConfigurationApplied - lastTransitionTime: "2024-01-15T10:30:00Z" + lastTransitionTime: "2024-06-01T12:00:00Z" ``` -### Use kubectl to list all interface statuses +If the condition shows `status: "False"`, the `message` field will describe the error. Common issues include connectivity failures, authentication errors, or unsupported configuration values for the target platform. 
-```bash -kubectl get interfaces -n network-operator -``` +You can also list all interface resources and their readiness at a glance: -### Confirm directly on the device - -You can also verify the configuration directly on the NX-OS switch using the NX-OS CLI: - -``` -leaf01# show interface Ethernet1/1 -leaf01# show running-config interface Ethernet1/1 +```bash +kubectl -n network-operator get interfaces ``` -The IP address, MTU, description, and administrative state should match what was declared in the `Interface` resource. - -### Pause reconciliation for troubleshooting - -If you need to temporarily stop the controller from reconciling a device (for example, during a maintenance window), set `spec.paused: true` on the `Device` resource: +To confirm the configuration on the device itself, SSH to the switch and verify: ```bash -kubectl patch device leaf01 -n network-operator \ - --type=merge -p '{"spec":{"paused":true}}' +# On the NX-OS device: +show interface Ethernet1/1 +show ip interface Ethernet1/1 ``` -Remember to set `paused: false` to resume normal reconciliation. +The interface should show the configured IP address, MTU, and admin state. + +--- ## Next Steps -With a device registered and a basic interface configured, you are ready to build out the rest of your data center network using network-operator's declarative CRDs. +With a device registered and a basic interface configured, you are ready to build out the full data center network fabric. The following CRDs are the logical next steps: -### BGP +### BGP routing -Configure BGP routing using the `BGP` and `BGPPeer` CRDs. The `BGP` resource sets the router-level parameters such as `spec.asNumber` and `spec.routerId`, while `BGPPeer` resources define individual neighbor sessions. Each `BGPPeer` references the parent `BGP` instance through `spec.bgpRef.name`. 
For Cisco NX-OS-specific address-family tuning such as advertising the primary IP (`advertisePIP`) or gateway IP export, use the `BGPConfig` CRD from the `api/cisco/nx/v1alpha1` package. +Use the `BGP` CRD to configure a BGP router instance on the device, specifying `spec.asNumber` and `spec.routerId`. Create `BGPPeer` resources to define eBGP or iBGP neighbors, referencing the BGP instance via `spec.bgpRef.name` and the local source interface via `spec.localAddress.interfaceRef`. For NX-OS-specific address-family settings such as EVPN PIP advertisement, use the `BGPConfig` CRD from the `cisco/nx` API group. ### VLANs -Define VLANs with the `VLAN` CRD using `spec.id` and `spec.name`. Layer 3 switching for a VLAN is enabled by creating an `Interface` of type `RoutedVLAN` that references the VLAN via `spec.vlanRef.name`. +Define VLANs on a device using the `VLAN` CRD, setting `spec.id` (1–4094) and an optional `spec.name`. Trunk or access switchport behavior on physical interfaces is controlled through `spec.switchport` on the `Interface` resource, where `spec.switchport.mode`, `spec.switchport.allowedVlans`, and `spec.switchport.accessVlan` give you full Layer 2 control. -### Routing Policies +### Routing policies and prefix filtering -Control route advertisement and filtering using `PrefixSet` and `RoutingPolicy` resources. A `PrefixSet` defines named prefix lists referenced by `RoutingPolicy` statements. Policy statements contain `conditions` (prefix matching) and `actions` (route disposition and BGP attribute manipulation such as community tagging or AS-path prepending). Routing policies are attached to BGP peers via the `spec.addressFamilies.ipv4Unicast.inboundRoutingPolicyRef` and `outboundRoutingPolicyRef` fields on `BGPPeer`. +Create `PrefixSet` resources to define named lists of IP prefixes with optional mask length ranges. Reference these in `RoutingPolicy` resources using `spec.statements[].conditions.matchPrefixSet.prefixSetRef`. 
Policies can accept or reject routes and apply BGP community tagging via `spec.statements[].actions.bgpActions`. Attach policies to BGP peers using `spec.addressFamilies.ipv4Unicast.inboundRoutingPolicyRef` and `spec.addressFamilies.ipv4Unicast.outboundRoutingPolicyRef` on the `BGPPeer` resource. -### VXLAN and EVPN +### VXLAN / EVPN overlay -For VXLAN overlay fabrics, configure `NetworkVirtualizationEdge` (NVE) resources to define the VTEP, `EVPNInstance` resources to define VXLAN Network Identifiers (VNIs), and VRFs with route targets for L3VNI routing. Enable the `l2vpnEvpn` address family on your `BGP` resource to exchange EVPN routes between VTEPs. +For a VXLAN BGP EVPN fabric, configure a `NetworkVirtualizationEdge` (NVE) resource pointing its `spec.sourceInterfaceRef` to a loopback interface. Then create `EVPNInstance` resources with the appropriate `spec.vni` and `spec.type` (Bridged for L2VNI) to bind VLANs to the overlay. VRF resources with `spec.vni` set enable L3VNI support for inter-tenant routing. -### Additional Device Services +### Device services -network-operator also manages operational services on NX-OS devices including `NTP`, `DNS`, `Syslog`, `SNMP`, `LLDP`, `User`, and `Banner` resources. Each follows the same pattern: create a resource in the same namespace as the target `Device` and reference it via `spec.deviceRef.name`. +Once the core forwarding plane is established, configure operational services such as `NTP`, `DNS`, `Syslog`, and `SNMP` by creating the respective CRDs and referencing the device via `spec.deviceRef.name` in each resource — the same pattern used by all configuration objects in network-operator.