diff --git a/core/helm-charts/genai-gateway/templates/deployment.yaml b/core/helm-charts/genai-gateway/templates/deployment.yaml
index e45baf87..38cefe63 100644
--- a/core/helm-charts/genai-gateway/templates/deployment.yaml
+++ b/core/helm-charts/genai-gateway/templates/deployment.yaml
@@ -18,7 +18,7 @@ spec:
     spec:
       initContainers:
         - name: wait-for-postgres-redis
-          image: busybox:1.36
+          image: docker.io/library/busybox:1.28
           command:
             - /bin/sh
            - -c
diff --git a/core/helm-charts/genai-gateway/values.yaml b/core/helm-charts/genai-gateway/values.yaml
index c03b4b0e..be362366 100644
--- a/core/helm-charts/genai-gateway/values.yaml
+++ b/core/helm-charts/genai-gateway/values.yaml
@@ -4,7 +4,7 @@ replicaCount: 1
 image:
   repository: ghcr.io/berriai/litellm-non_root
   tag: main-v1.75.8-stable
-  pullPolicy: Always
+  pullPolicy: IfNotPresent
 imagePullSecrets: []
 service:
   type: LoadBalancer
diff --git a/core/helm-charts/vllm/xeon-values.yaml b/core/helm-charts/vllm/xeon-values.yaml
index 77d92bf1..e695c610 100644
--- a/core/helm-charts/vllm/xeon-values.yaml
+++ b/core/helm-charts/vllm/xeon-values.yaml
@@ -11,6 +11,11 @@ accelDevice: ""
 # CPU Balloon configuration for NRI resource policy
 cpu_balloon_annotation: ""
 
+# Override tensor parallelism to 1 for Xeon — NRI balloon CPU allocation creates
+# asymmetric NUMA splits (85 vs 84 physical cores) when TP=2, causing PyTorch
+# shm assertion failure: ptr->thread_num == thread_num
+tensor_parallel_size: "1"
+
 resources:
   requests:
diff --git a/core/inventory/metadata/offline.yml b/core/inventory/metadata/offline.yml
new file mode 100644
index 00000000..46bdb0ae
--- /dev/null
+++ b/core/inventory/metadata/offline.yml
@@ -0,0 +1,69 @@
+files_repo: "http://JFROG_HOST:8082/artifactory/ei-generic-binaries"
+kube_version: v1.30.4
+crictl_version: "v1.30.1"
+etcd_version: "v3.5.16"
+runc_version: "v1.2.3"
+containerd_version: "1.7.24"
+kubeadm_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubeadm"
+kubectl_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubectl"
+kubelet_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubelet"
+cni_download_url: "{{ files_repo }}/github.com/containernetworking/plugins/releases/download/{{ cni_version }}/cni-plugins-linux-{{ image_arch }}-{{ cni_version }}.tgz"
+crictl_download_url: "{{ files_repo }}/github.com/kubernetes-sigs/cri-tools/releases/download/{{ crictl_version }}/crictl-{{ crictl_version }}-{{ ansible_system | lower }}-{{ image_arch }}.tar.gz"
+etcd_download_url: "{{ files_repo }}/github.com/etcd-io/etcd/releases/download/{{ etcd_version }}/etcd-{{ etcd_version }}-linux-{{ image_arch }}.tar.gz"
+calicoctl_download_url: "{{ files_repo }}/github.com/projectcalico/calico/releases/download/{{ calico_ctl_version }}/calicoctl-linux-{{ image_arch }}"
+calico_crds_download_url: "{{ files_repo }}/github.com/projectcalico/calico/archive/{{ calico_version }}.tar.gz"
+helm_download_url: "{{ files_repo }}/get.helm.sh/helm-{{ helm_version }}-linux-{{ image_arch }}.tar.gz"
+containerd_download_url: "{{ files_repo }}/github.com/containerd/containerd/releases/download/v{{ containerd_version }}/containerd-{{ containerd_version }}-linux-{{ image_arch }}.tar.gz"
+runc_download_url: "{{ files_repo }}/github.com/opencontainers/runc/releases/download/{{ runc_version }}/runc.{{ image_arch }}"
+nerdctl_download_url: "{{ files_repo }}/github.com/containerd/nerdctl/releases/download/v{{ nerdctl_version }}/nerdctl-{{ nerdctl_version }}-linux-{{ image_arch }}.tar.gz"
+
+## Pin Calico to version validated in JFrog airgap cache
+calico_version: v3.28.1
+
+## Pin CoreDNS to version validated in JFrog airgap cache
+coredns_version: v1.11.1
+
+# JFrog registry mirrors — Kubespray writes these into /etc/containerd/certs.d
+# on every cluster node during cluster.yml. JFROG_HOST is substituted with the
+# actual JFrog IP by setup-env.sh before Kubespray runs.
+containerd_registries_mirrors:
+  - registry: "docker.io"
+    prefix: "docker.io"
+    mirrors:
+      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
+        capabilities:
+          - pull
+          - resolve
+        override_path: true
+  - registry: "ghcr.io"
+    prefix: "ghcr.io"
+    mirrors:
+      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
+        capabilities:
+          - pull
+          - resolve
+        override_path: true
+  - registry: "registry.k8s.io"
+    prefix: "registry.k8s.io"
+    mirrors:
+      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
+        capabilities:
+          - pull
+          - resolve
+        override_path: true
+  - registry: "quay.io"
+    prefix: "quay.io"
+    mirrors:
+      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
+        capabilities:
+          - pull
+          - resolve
+        override_path: true
+  - registry: "public.ecr.aws"
+    prefix: "public.ecr.aws"
+    mirrors:
+      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
+        capabilities:
+          - pull
+          - resolve
+        override_path: true
diff --git a/core/inventory/metadata/vars/inference_common.yml b/core/inventory/metadata/vars/inference_common.yml
index 99f97770..9b1c77e2 100644
--- a/core/inventory/metadata/vars/inference_common.yml
+++ b/core/inventory/metadata/vars/inference_common.yml
@@ -4,4 +4,17 @@
 helm_charts_base: "{{ lookup('env', 'PWD') }}/helm-charts"
 remote_home_dir: "{{ lookup('env', 'PWD') }}/scripts"
 remote_helm_charts_base: "/tmp/helm-charts"
 ansible_python_interpreter: "{{ lookup('env', 'ANSIBLE_PYTHON_INTERPRETER') or '/usr/bin/python3' }}"
-remote_home_scripts_dir: "{{ lookup('env', 'PWD') }}/scripts"
\ No newline at end of file
+remote_home_scripts_dir: "{{ lookup('env', 'PWD') }}/scripts"
+
+# ---------------------------------------------------------------------------
+# Airgap — Helm repository URLs
+# airgap_enabled, jfrog_url, jfrog_username, jfrog_password are sourced from
+# inference-config.cfg and passed in via --extra-vars by the shell layer.
+# When airgap_enabled=true → routes to JFrog ei-helm-virtual on VM1.
+# When airgap_enabled=false → uses original upstream URLs (internet).
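+# Example (illustrative): with jfrog_url=http://10.20.30.40:8082/artifactory and
+# airgap_enabled=true, helm_repo_apisix resolves to
+# http://10.20.30.40:8082/artifactory/ei-helm-virtual, and helm_oci_jfrog_host
+# strips down to 10.20.30.40:8082.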
+# ---------------------------------------------------------------------------
+helm_repo_ingress_nginx: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://kubernetes.github.io/ingress-nginx' }}"
+helm_repo_langfuse: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://langfuse.github.io/langfuse-k8s' }}"
+helm_repo_apisix: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://charts.apiseven.com' }}"
+helm_repo_nri_plugins: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://containers.github.io/nri-plugins' }}"
+helm_oci_jfrog_host: "{{ jfrog_url | regex_replace('^https?://', '') | regex_replace('/.*$', '') }}"
\ No newline at end of file
diff --git a/core/lib/cluster/config/cluster-config-init.sh b/core/lib/cluster/config/cluster-config-init.sh
index 2e5564e2..ef3a4696 100644
--- a/core/lib/cluster/config/cluster-config-init.sh
+++ b/core/lib/cluster/config/cluster-config-init.sh
@@ -8,5 +8,5 @@ deploy_cluster_config_playbook() {
         tags=""
     fi
 
-    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-cluster-config.yml --become --become-user=root --extra-vars "brownfield_deployment=${brownfield_deployment} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file}" --tags "$tags"
+    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-cluster-config.yml --become --become-user=root --extra-vars "brownfield_deployment=${brownfield_deployment} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" --tags "$tags"
 }
\ No newline at end of file
diff --git a/core/lib/cluster/config/label-nodes.sh b/core/lib/cluster/config/label-nodes.sh
index cf2269b6..7c1302b8 100644
--- a/core/lib/cluster/config/label-nodes.sh
+++ b/core/lib/cluster/config/label-nodes.sh
@@ -3,5 +3,5 @@ run_label_nodes_playbook() {
 
     echo "Running the label-nodes.yml playbook to label Kubernetes nodes..."
 
-    ansible-playbook -i "${INVENTORY_PATH}" playbooks/label-nodes.yml
+    ansible-playbook -i "${INVENTORY_PATH}" playbooks/label-nodes.yml --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
 }
diff --git a/core/lib/cluster/deployment/cluster-purge.sh b/core/lib/cluster/deployment/cluster-purge.sh
index 95a3f8c6..ed8d354c 100644
--- a/core/lib/cluster/deployment/cluster-purge.sh
+++ b/core/lib/cluster/deployment/cluster-purge.sh
@@ -10,7 +10,7 @@ run_reset_playbook() {
         uninstall_ceph_cluster
     fi
 
-    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml --extra-vars "delete_pv_on_purge=${delete_pv_on_purge}"
+    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml --extra-vars "delete_pv_on_purge=${delete_pv_on_purge} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
     ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root reset.yml -e "confirm_reset=yes reset_nodes=false"
     # Check the exit status of the Ansible playbook command
     if [ $? -eq 0 ]; then
diff --git a/core/lib/cluster/deployment/fresh-install.sh b/core/lib/cluster/deployment/fresh-install.sh
index 1ec01aae..2b7254a4 100644
--- a/core/lib/cluster/deployment/fresh-install.sh
+++ b/core/lib/cluster/deployment/fresh-install.sh
@@ -46,7 +46,40 @@ fresh_installation() {
     if [[ "$deploy_kubernetes_fresh" == "yes" ]]; then
         echo "Starting fresh installation of Intel AI for Enterprise Inference Cluster..."
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            echo "Airgap mode: fixing containerd mirrors and purging any stale image blobs before Kubernetes install..."
+            local _b64 _jfrog_host
+            _jfrog_host=$(echo "$jfrog_url" | sed 's|https\?://||' | sed 's|/.*||')
+            _b64=$(echo -n "${jfrog_username}:${jfrog_password}" | base64 -w 0)
+            for _reg in docker.io ghcr.io registry.k8s.io quay.io public.ecr.aws; do
+                sudo mkdir -p /etc/containerd/certs.d/$_reg
+                sudo tee /etc/containerd/certs.d/$_reg/hosts.toml > /dev/null <<EOF
+server = "http://${_jfrog_host}/v2/ei-docker-virtual"
+
+[host."http://${_jfrog_host}/v2/ei-docker-virtual"]
+  capabilities = ["pull", "resolve"]
+  override_path = true
+  [host."http://${_jfrog_host}/v2/ei-docker-virtual".header]
+    Authorization = ["Basic ${_b64}"]
+EOF
+            done
+            for _img in $(sudo ctr -n k8s.io images ls -q 2>/dev/null); do
+                sudo crictl rmi "$_img" 2>/dev/null; true
+                sudo ctr -n k8s.io images rm "$_img" 2>/dev/null; true
+            done
+            sudo find /var/lib/containerd/io.containerd.content.v1.content/blobs/sha256 \
+                -size +100k -newer /etc/containerd/config.toml \
+                -exec sh -c 'file "$1" | grep -q "HTML" && sudo rm -f "$1"' _ {} \; 2>/dev/null; true
+            sudo systemctl restart containerd
+            echo "Containerd mirrors configured and restarted."
+        fi
         install_kubernetes "$@"
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            echo "Patching local-path-config to use busybox:1.28 (airgap mode)..."
+            kubectl patch configmap local-path-config -n local-path-storage --type merge -p \
+                '{"data":{"helperPod.yaml":"apiVersion: v1\nkind: Pod\nmetadata:\n  name: helper-pod\nspec:\n  containers:\n  - name: helper-pod\n    image: \"docker.io/library/busybox:1.28\"\n    imagePullPolicy: IfNotPresent"}}' \
+                2>/dev/null || true
+        fi
     else
         echo "Skipping Kubernetes installation..."
     fi
@@ -137,7 +170,11 @@ fresh_installation() {
         --extra-vars "cluster_url=${cluster_url} \
             cert_file=${cert_file} \
             key_file=${key_file} \
-            kubernetes_platform=${kubernetes_platform}" \
+            kubernetes_platform=${kubernetes_platform} \
+            airgap_enabled=${airgap_enabled} \
+            jfrog_url=${jfrog_url} \
+            jfrog_username=${jfrog_username} \
+            jfrog_password=${jfrog_password}" \
         --vault-password-file "$vault_pass_file"
     if [ $? -eq 0 ]; then
         echo "Agentic AI Plugin deployed successfully."
@@ -230,5 +267,9 @@ fresh_installation() {
 
 run_fresh_install_playbook() {
     echo "Running the cluster.yml playbook to set up the Kubernetes cluster..."
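+    # The optional airgap arguments are collected in an array below; array expansion
+    # preserves quoting even if jfrog_password contains spaces or shell metacharacters.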
- ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root cluster.yml + local _airgap_extra_vars="" + if [[ "$airgap_enabled" == "yes" ]]; then + _airgap_extra_vars="--extra-vars \"airgap_enabled=true jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}\"" + fi + eval ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root cluster.yml ${_airgap_extra_vars} } \ No newline at end of file diff --git a/core/lib/cluster/drv-fw-update.sh b/core/lib/cluster/drv-fw-update.sh index 44367b9c..d0d546e2 100644 --- a/core/lib/cluster/drv-fw-update.sh +++ b/core/lib/cluster/drv-fw-update.sh @@ -45,7 +45,7 @@ update_drivers() { invoke_prereq_workflows echo "${YELLOW}Updating drivers...${NC}" ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-gaudi-firmware-driver.yml \ - --extra-vars "update_type=drivers" + --extra-vars "update_type=drivers airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" echo "${GREEN}Drivers updated successfully!${NC}" } @@ -54,7 +54,7 @@ update_firmware() { invoke_prereq_workflows echo "${YELLOW}Updating firmware...${NC}" ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-gaudi-firmware-driver.yml \ - --extra-vars "update_type=firmware" + --extra-vars "update_type=firmware airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" echo "${GREEN}Firmware updated successfully!${NC}" } diff --git a/core/lib/cluster/nodes/add-node.sh b/core/lib/cluster/nodes/add-node.sh index 6e976731..8fc09659 100644 --- a/core/lib/cluster/nodes/add-node.sh +++ b/core/lib/cluster/nodes/add-node.sh @@ -16,7 +16,8 @@ add_inference_nodes_playbook() { invoke_prereq_workflows "$@" - ansible-playbook -i "${INVENTORY_PATH}" playbooks/cluster.yml --become --become-user=root + ansible-playbook -i "${INVENTORY_PATH}" playbooks/cluster.yml --become --become-user=root \ + --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" } diff --git a/core/lib/components/genai-gateway-controller.sh b/core/lib/components/genai-gateway-controller.sh index f0ef44d5..7a602a7a 100644 --- a/core/lib/components/genai-gateway-controller.sh +++ b/core/lib/components/genai-gateway-controller.sh @@ -4,5 +4,5 @@ run_genai_gateway_playbook() { echo "Deploying GenAI Gateway Service..." 
echo "************************************" - ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-genai-gateway.yml --extra-vars "secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_genai_gateway=${deploy_genai_gateway} model_name_list='${model_name_list//\ /,}' genai_gateway_trace_chart_version=${genai_gateway_trace_chart_version} kubernetes_platform=${kubernetes_platform}" --vault-password-file "$vault_pass_file" + ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-genai-gateway.yml --extra-vars "secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_genai_gateway=${deploy_genai_gateway} model_name_list='${model_name_list//\ /,}' genai_gateway_trace_chart_version=${genai_gateway_trace_chart_version} kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" --vault-password-file "$vault_pass_file" } diff --git a/core/lib/components/ingress-controller.sh b/core/lib/components/ingress-controller.sh index fc15d7a4..714f6e8c 100644 --- a/core/lib/components/ingress-controller.sh +++ b/core/lib/components/ingress-controller.sh @@ -3,5 +3,5 @@ run_ingress_nginx_playbook() { echo "Deploying the Ingress NGINX Controller..." - ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-ingress-controller.yml --extra-vars "secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} ingress_controller=${ingress_controller}" + ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-ingress-controller.yml --extra-vars "secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} ingress_controller=${ingress_controller} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" } \ No newline at end of file diff --git a/core/lib/components/intel-base-operator.sh b/core/lib/components/intel-base-operator.sh index e7bcbbda..34295894 100644 --- a/core/lib/components/intel-base-operator.sh +++ b/core/lib/components/intel-base-operator.sh @@ -3,7 +3,9 @@ run_deploy_habana_ai_operator_playbook() { echo "Running the deploy-habana-ai-operator.yml playbook to deploy the habana-ai-operator..." - ansible-galaxy collection install community.kubernetes + if [[ "$airgap_enabled" != "yes" ]]; then + ansible-galaxy collection install kubernetes.core + fi if [[ "$gaudi_platform" == "gaudi2" ]]; then gaudi_operator="$gaudi2_operator" elif [[ "$gaudi_platform" == "gaudi3" ]]; then @@ -11,7 +13,7 @@ run_deploy_habana_ai_operator_playbook() { else gaudi_operator="" fi - ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root playbooks/deploy-habana-ai-operator.yml --extra-vars "gaudi_operator=${gaudi_operator}" + ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root playbooks/deploy-habana-ai-operator.yml --extra-vars "gaudi_operator=${gaudi_operator} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" if [ $? -eq 0 ]; then echo "The deploy-habana-ai-operator.yml playbook ran successfully." else diff --git a/core/lib/components/keycloak-controller.sh b/core/lib/components/keycloak-controller.sh index fc3e786c..f1412109 100644 --- a/core/lib/components/keycloak-controller.sh +++ b/core/lib/components/keycloak-controller.sh @@ -3,8 +3,8 @@ run_keycloak_playbook() { echo "Deploying Keycloak using Ansible playbook..." 
-    install_ansible_collection
-    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml
+    install_ansible_collection
+    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
 }
 
 create_keycloak_tls_secret_playbook() {
@@ -12,7 +12,7 @@ create_keycloak_tls_secret_playbook() {
     echo "************************************"
 
     ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-tls-cert.yml \
-        --extra-vars "kubernetes_platform=${kubernetes_platform} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} keycloak_admin_user=${keycloak_admin_user} keycloak_admin_password=${keycloak_admin_password} keycloak_client_id=${keycloak_client_id} hugging_face_token=${hugging_face_token} model_name_list='${model_name_list//\ /,}' deploy_keycloak=${deploy_keycloak} deploy_apisix=${deploy_apisix} keycloak_chart_version=${keycloak_chart_version}"
+        --extra-vars "kubernetes_platform=${kubernetes_platform} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} keycloak_admin_user=${keycloak_admin_user} keycloak_admin_password=${keycloak_admin_password} keycloak_client_id=${keycloak_client_id} hugging_face_token=${hugging_face_token} model_name_list='${model_name_list//\ /,}' deploy_keycloak=${deploy_keycloak} deploy_apisix=${deploy_apisix} keycloak_chart_version=${keycloak_chart_version} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
 }
 
diff --git a/core/lib/components/observability-controller.sh b/core/lib/components/observability-controller.sh
index bf37126c..895733cf 100644
--- a/core/lib/components/observability-controller.sh
+++ b/core/lib/components/observability-controller.sh
@@ -16,7 +16,7 @@ deploy_observability_playbook() {
         playbook_path="playbooks/deploy-observability-openshift.yml"
     fi
 
-    local extra_vars="secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_observability=${deploy_observability} deploy_logging=${deploy_logging} observability_stack_chart_version=${observability_stack_chart_version} kubernetes_platform=${kubernetes_platform}"
+    local extra_vars="secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_observability=${deploy_observability} deploy_logging=${deploy_logging} observability_stack_chart_version=${observability_stack_chart_version} kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
 
     ansible-playbook -i "${INVENTORY_PATH}" "$playbook_path" --become --become-user=root --extra-vars "$extra_vars" --tags "$tags" --vault-password-file "$vault_pass_file"
 }
\ No newline at end of file
diff --git a/core/lib/components/service-mesh/install-istio.sh b/core/lib/components/service-mesh/install-istio.sh
index e0694651..4656e865 100644
--- a/core/lib/components/service-mesh/install-istio.sh
+++ b/core/lib/components/service-mesh/install-istio.sh
@@ -11,9 +11,9 @@ deploy_istio_playbook() {
     # Expect kubernetes_platform to be set globally (brownfield or fresh install path)
     if [ "$(echo "${kubernetes_platform:-vanilla}" | tr '[:upper:]' '[:lower:]')" = "openshift" ]; then
         echo "Detected OpenShift platform. Using OpenShift Service Mesh playbook."
- ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio-openshift.yml --extra-vars "kubernetes_platform=${kubernetes_platform}" || return 1 + ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio-openshift.yml --extra-vars "kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" || return 1 else echo "Using vanilla/helm-based Istio playbook for platform: ${kubernetes_platform}" - ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio.yml --extra-vars "kubernetes_platform=${kubernetes_platform}" || return 1 + ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio.yml --extra-vars "kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" || return 1 fi } \ No newline at end of file diff --git a/core/lib/components/storage/install-ceph-cluster.sh b/core/lib/components/storage/install-ceph-cluster.sh index a8f3c9ca..f4c090a4 100644 --- a/core/lib/components/storage/install-ceph-cluster.sh +++ b/core/lib/components/storage/install-ceph-cluster.sh @@ -13,7 +13,7 @@ deploy_ceph_cluster() { fi echo "Deploying Ceph storage cluster..." - if ! ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-ceph-storage.yml; then + if ! ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-ceph-storage.yml --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"; then echo -e "${RED} Ceph Cluster deployment FAILED!${NC}" echo "" echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}" diff --git a/core/lib/models/install-model.sh b/core/lib/models/install-model.sh index 40321f8d..d5fef59f 100644 --- a/core/lib/models/install-model.sh +++ b/core/lib/models/install-model.sh @@ -10,7 +10,9 @@ deploy_inference_llm_models_playbook() { cpu_playbook="true" gpu_playbook="false" gaudi_deployment="false" - enable_cpu_balloons="true" # Enable NRI balloons for CPU deployments + if [ "$deploy_nri_balloon_policy" == "yes" ]; then + enable_cpu_balloons="true" # Enable NRI balloons only when explicitly requested + fi huggingface_model_deployment_name="${huggingface_model_deployment_name}-cpu" if [ "$balloon_policy_cpu" == "enabled" ]; then echo "${GREEN}CPU deployment detected - using generic NRI balloon policy${NC}" @@ -77,7 +79,7 @@ deploy_inference_llm_models_playbook() { fi ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-inference-models.yml \ - --extra-vars "kubernetes_platform=${kubernetes_platform} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} keycloak_admin_user=${keycloak_admin_user} keycloak_admin_password=${keycloak_admin_password} keycloak_client_id=${keycloak_client_id} hugging_face_token=${hugging_face_token} install_true=${install_true} model_name_list='${model_name_list//\ /,}' cpu_playbook=${cpu_playbook} gpu_playbook=${gpu_playbook} hugging_face_token_falcon3=${hugging_face_token_falcon3} deploy_keycloak=${deploy_keycloak} apisix_enabled=${apisix_enabled} ingress_enabled=${ingress_enabled} gaudi_deployment=${gaudi_deployment} huggingface_model_id=${huggingface_model_id} hugging_face_model_deployment=${hugging_face_model_deployment} huggingface_model_deployment_name=${huggingface_model_deployment_name} deploy_inference_llm_models_playbook=${deploy_inference_llm_models_playbook} 
huggingface_tensor_parellel_size=${huggingface_tensor_parellel_size} deploy_genai_gateway=${deploy_genai_gateway} vllm_metrics_enabled=${vllm_metrics_enabled} gaudi_values_file=${gaudi_values_file} xeon_values_file=${xeon_values_file_path} deploy_ceph=${deploy_ceph} enable_cpu_balloons=${enable_cpu_balloons} balloon_policy_cpu=${balloon_policy_cpu} aws_certificate_arn=${aws_certificate_arn}" --tags "$tags" --vault-password-file "$vault_pass_file" + --extra-vars "kubernetes_platform=${kubernetes_platform} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} keycloak_admin_user=${keycloak_admin_user} keycloak_admin_password=${keycloak_admin_password} keycloak_client_id=${keycloak_client_id} hugging_face_token=${hugging_face_token} install_true=${install_true} model_name_list='${model_name_list//\ /,}' cpu_playbook=${cpu_playbook} gpu_playbook=${gpu_playbook} hugging_face_token_falcon3=${hugging_face_token_falcon3} deploy_keycloak=${deploy_keycloak} apisix_enabled=${apisix_enabled} ingress_enabled=${ingress_enabled} gaudi_deployment=${gaudi_deployment} huggingface_model_id=${huggingface_model_id} hugging_face_model_deployment=${hugging_face_model_deployment} huggingface_model_deployment_name=${huggingface_model_deployment_name} deploy_inference_llm_models_playbook=${deploy_inference_llm_models_playbook} huggingface_tensor_parellel_size=${huggingface_tensor_parellel_size} deploy_genai_gateway=${deploy_genai_gateway} vllm_metrics_enabled=${vllm_metrics_enabled} gaudi_values_file=${gaudi_values_file} xeon_values_file=${xeon_values_file_path} deploy_ceph=${deploy_ceph} enable_cpu_balloons=${enable_cpu_balloons} balloon_policy_cpu=${balloon_policy_cpu} aws_certificate_arn=${aws_certificate_arn} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" --tags "$tags" --vault-password-file "$vault_pass_file" } diff --git a/core/lib/system/precheck/prereq-check.sh b/core/lib/system/precheck/prereq-check.sh index 31122211..25e230af 100644 --- a/core/lib/system/precheck/prereq-check.sh +++ b/core/lib/system/precheck/prereq-check.sh @@ -72,21 +72,39 @@ run_system_prerequisites_check() { echo -e "${GREEN}✓ curl found${NC}" fi - # Check internet connectivity (essential for Docker images, packages, repositories) - echo "Checking internet connectivity..." + # Check internet or JFrog connectivity + echo "Checking connectivity..." 
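+    # airgap_enabled=yes: the JFrog ping must succeed AND the internet must be unreachable.
+    # airgap_enabled=no:  at least one public endpoint (google/github/docker) must respond.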
     if command -v curl &> /dev/null; then
-        # Test multiple reliable endpoints to ensure connectivity
-        if curl -s --connect-timeout 10 --max-time 15 https://google.com > /dev/null 2>&1 || \
-           curl -s --connect-timeout 10 --max-time 15 https://github.com > /dev/null 2>&1 || \
-           curl -s --connect-timeout 10 --max-time 15 https://registry-1.docker.io > /dev/null 2>&1; then
-            echo -e "${GREEN}✓ Internet connectivity confirmed${NC}"
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            # In airgap mode, verify JFrog Artifactory is reachable instead of the internet
+            if curl -s --connect-timeout 10 --max-time 15 \
+                -u "${jfrog_username}:${jfrog_password}" \
+                "${jfrog_url}/api/system/ping" > /dev/null 2>&1; then
+                echo -e "${GREEN}✓ JFrog Artifactory connectivity confirmed (airgap mode)${NC}"
+                if curl -s --connect-timeout 5 --max-time 10 https://google.com > /dev/null 2>&1 || \
+                   curl -s --connect-timeout 5 --max-time 10 https://github.com > /dev/null 2>&1; then
+                    echo -e "${RED}✗ airgap_enabled is set to yes but this machine has internet connectivity.${NC}"
+                    echo -e "${RED}  Disable internet access before proceeding with airgap deployment.${NC}"
+                    exit 1
+                fi
+            else
+                echo -e "${RED}✗ Cannot reach JFrog Artifactory at ${jfrog_url}${NC}"
+                missing_deps+=("internet-connectivity")
+            fi
         else
-            echo -e "${RED}✗ No internet connectivity detected${NC}"
-            missing_deps+=("internet-connectivity")
+            # Test multiple reliable endpoints to ensure internet connectivity
+            if curl -s --connect-timeout 10 --max-time 15 https://google.com > /dev/null 2>&1 || \
+               curl -s --connect-timeout 10 --max-time 15 https://github.com > /dev/null 2>&1 || \
+               curl -s --connect-timeout 10 --max-time 15 https://registry-1.docker.io > /dev/null 2>&1; then
+                echo -e "${GREEN}✓ Internet connectivity confirmed${NC}"
+            else
+                echo -e "${RED}✗ No internet connectivity detected${NC}"
+                missing_deps+=("internet-connectivity")
+            fi
         fi
     else
         # If curl is not available, we'll check this later after curl is installed
-        warnings+=("Internet connectivity check skipped - curl not available")
+        warnings+=("Connectivity check skipped - curl not available")
     fi
 
     # Check if pip is available for the configured Python interpreter
@@ -117,25 +135,29 @@ run_system_prerequisites_check() {
     fi
 
-    echo "Updating system package lists..."
-    if command -v apt &> /dev/null; then
-        echo "Updating package lists using apt Ubuntu..."
-        if sudo apt update; then
-            echo -e "${GREEN}Package lists updated successfully${NC}"
-        else
-            echo -e "${YELLOW}Package list update failed, continuing anyway${NC}"
-        fi
-    elif command -v dnf &> /dev/null; then
-        echo "Updating package lists using dnf (RHEL/CentOS)..."
-        if sudo dnf check-update || [ $? -eq 100 ]; then
-            echo -e "${GREEN} Package lists updated successfully${NC}"
+    if [[ "$airgap_enabled" != "yes" ]]; then
+        echo "Updating system package lists..."
+        if command -v apt &> /dev/null; then
+            echo "Updating package lists using apt Ubuntu..."
+            if sudo apt update; then
+                echo -e "${GREEN}Package lists updated successfully${NC}"
+            else
+                echo -e "${YELLOW}Package list update failed, continuing anyway${NC}"
+            fi
+        elif command -v dnf &> /dev/null; then
+            echo "Updating package lists using dnf (RHEL/CentOS)..."
+            if sudo dnf check-update || [ $? -eq 100 ]; then
+                echo -e "${GREEN} Package lists updated successfully${NC}"
+            else
+                echo -e "${YELLOW} Package list update failed, continuing anyway${NC}"
+            fi
         else
-            echo -e "${YELLOW} Package list update failed, continuing anyway${NC}"
+            echo -e "${YELLOW}Unknown package manager, skipping package list update${NC}"
         fi
+        echo ""
     else
-        echo -e "${YELLOW}Unknown package manager, skipping package list update${NC}"
+        echo -e "${YELLOW}Skipping package list update in airgap mode (no package mirror configured)${NC}"
     fi
-    echo ""
 
     # Check if any critical dependencies are missing and handle appropriately
     if [ ${#missing_deps[@]} -gt 0 ]; then
@@ -160,20 +182,31 @@ run_system_prerequisites_check() {
         fi
     done
 
-    # Handle internet connectivity issues first - EXIT IMMEDIATELY (cannot be auto-fixed)
+    # Handle internet/JFrog connectivity issues first - EXIT IMMEDIATELY (cannot be auto-fixed)
     if [ ${#connectivity_issues[@]} -gt 0 ]; then
-        echo -e "${RED}Critical connectivity requirements not met:${NC}"
-        echo -e "${RED}  - Internet connectivity is required for:${NC}"
-        echo -e "${RED}    * Pulling Docker images${NC}"
-        echo -e "${RED}    * Downloading packages and dependencies${NC}"
-        echo -e "${RED}    * Accessing container registries${NC}"
-        echo -e "${RED}    * Cloning Git repositories${NC}"
-        echo ""
-        echo -e "${YELLOW}Please ensure internet connectivity and try again.${NC}"
-        echo -e "${YELLOW}Common solutions:${NC}"
-        echo -e "${YELLOW}  - Check network configuration${NC}"
-        echo -e "${YELLOW}  - Verify firewall/proxy settings${NC}"
-        echo -e "${YELLOW}  - Test: curl -I https://google.com${NC}"
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            echo -e "${RED}Critical connectivity requirements not met:${NC}"
+            echo -e "${RED}  - JFrog Artifactory is unreachable at ${jfrog_url}${NC}"
+            echo ""
+            echo -e "${YELLOW}In airgap mode all packages and images are served by JFrog.${NC}"
+            echo -e "${YELLOW}Common solutions:${NC}"
+            echo -e "${YELLOW}  - Verify JFrog is running on VM1${NC}"
+            echo -e "${YELLOW}  - Check jfrog_url, jfrog_username, jfrog_password in inference-config.cfg${NC}"
+            echo -e "${YELLOW}  - Test: curl -u \${jfrog_username}:\${jfrog_password} ${jfrog_url}/api/system/ping${NC}"
+        else
+            echo -e "${RED}Critical connectivity requirements not met:${NC}"
+            echo -e "${RED}  - Internet connectivity is required for:${NC}"
+            echo -e "${RED}    * Pulling Docker images${NC}"
+            echo -e "${RED}    * Downloading packages and dependencies${NC}"
+            echo -e "${RED}    * Accessing container registries${NC}"
+            echo -e "${RED}    * Cloning Git repositories${NC}"
+            echo ""
+            echo -e "${YELLOW}Please ensure internet connectivity and try again.${NC}"
+            echo -e "${YELLOW}Common solutions:${NC}"
+            echo -e "${YELLOW}  - Check network configuration${NC}"
+            echo -e "${YELLOW}  - Verify firewall/proxy settings${NC}"
+            echo -e "${YELLOW}  - Test: curl -I https://google.com${NC}"
+        fi
         exit 1
     fi
 
@@ -232,37 +265,82 @@ run_system_prerequisites_check() {
 
     # Install pip using system package manager if needed
     if [ "$pip_needed" = true ]; then
-        echo "Installing pip using system package manager..."
-        if command -v dnf &> /dev/null; then
-            python_version=$($python3_interpreter -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-            if [[ "$python_version" == "3.11" ]]; then
-                echo "Installing python3.11-pip using dnf (RHEL 9)..."
-                if ! sudo dnf install -y python3.11-pip; then
-                    echo -e "${RED}Failed to install python3.11-pip using dnf${NC}"
-                    exit 1
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            echo "Installing pip in airgap mode using pip wheel from JFrog..."
+            local pip_whl_url="${jfrog_url}/ei-generic-binaries/pip.whl"
+            local tmp_pip_whl="/tmp/pip.whl"
+            if curl -f -s -u "${jfrog_username}:${jfrog_password}" \
+                -o "$tmp_pip_whl" "$pip_whl_url" 2>/dev/null; then
+                # Wheel filenames must follow {name}-{version}-{pytag}-{abitag}-{platform}.whl
+                # The file is stored in JFrog as "pip.whl" (generic name) — rename it
+                # using the version and tag from the WHEEL metadata inside the zip.
+                proper_name=$($python3_interpreter -c "
+import zipfile, sys
+try:
+    z = zipfile.ZipFile('$tmp_pip_whl')
+    wf = next(x for x in z.namelist() if x.endswith('.dist-info/WHEEL'))
+    base = wf.split('/')[0].replace('.dist-info', '')
+    meta = {}
+    for line in z.read(wf).decode().splitlines():
+        if ': ' in line:
+            k, v = line.split(': ', 1)
+            meta[k] = v
+    tag = meta.get('Tag', 'py3-none-any')
+    print(f'{base}-{tag}.whl')
+except Exception:
+    sys.exit(1)
+" 2>/dev/null)
+                if [ -n "$proper_name" ]; then
+                    mv "$tmp_pip_whl" "/tmp/$proper_name"
+                    tmp_pip_whl="/tmp/$proper_name"
                 fi
-            elif [[ "$python_version" == "3.12" ]]; then
-                echo "Installing python3.12-pip using dnf (RHEL 9)..."
-                if ! sudo dnf install -y python3.12-pip; then
-                    echo -e "${RED}Failed to install python3.12-pip using dnf${NC}"
+                if sudo PYTHONPATH="$tmp_pip_whl" $python3_interpreter -m pip install \
+                    --no-index "$tmp_pip_whl"; then
+                    echo -e "${GREEN}pip installed from JFrog${NC}"
+                else
+                    echo -e "${RED}Failed to install pip from wheel${NC}"
                     exit 1
                 fi
             else
-                echo "Installing python3-pip using dnf (RHEL 9)..."
-                if ! sudo dnf install -y python3-pip; then
-                    echo -e "${RED}Failed to install python3-pip using dnf${NC}"
+                echo -e "${RED}Failed to download pip wheel from JFrog at ${pip_whl_url}${NC}"
+                echo -e "${YELLOW}Please upload pip wheel to JFrog ei-generic-binaries:${NC}"
+                echo -e "${YELLOW}  pip download pip --no-deps -d /tmp/pip-dl/${NC}"
+                echo -e "${YELLOW}  curl -u admin:password -T /tmp/pip-dl/pip-*.whl ${jfrog_url}/ei-generic-binaries/pip.whl${NC}"
+                exit 1
+            fi
+        else
+            echo "Installing pip using system package manager..."
+            if command -v dnf &> /dev/null; then
+                python_version=$($python3_interpreter -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+                if [[ "$python_version" == "3.11" ]]; then
+                    echo "Installing python3.11-pip using dnf (RHEL 9)..."
+                    if ! sudo dnf install -y python3.11-pip; then
+                        echo -e "${RED}Failed to install python3.11-pip using dnf${NC}"
+                        exit 1
+                    fi
+                elif [[ "$python_version" == "3.12" ]]; then
+                    echo "Installing python3.12-pip using dnf (RHEL 9)..."
+                    if ! sudo dnf install -y python3.12-pip; then
+                        echo -e "${RED}Failed to install python3.12-pip using dnf${NC}"
+                        exit 1
+                    fi
+                else
+                    echo "Installing python3-pip using dnf (RHEL 9)..."
+                    if ! sudo dnf install -y python3-pip; then
+                        echo -e "${RED}Failed to install python3-pip using dnf${NC}"
+                        exit 1
+                    fi
+                fi
+            elif command -v apt &> /dev/null; then
+                echo "Installing python3-pip using apt (Ubuntu 22/24)..."
+                if ! sudo apt install -y python3-pip; then
+                    echo -e "${RED}Failed to install python3-pip using apt${NC}"
                     exit 1
                 fi
-            fi
-        elif command -v apt &> /dev/null; then
-            echo "Installing python3-pip using apt (Ubuntu 22/24)..."
-            if ! sudo apt install -y python3-pip; then
-                echo -e "${RED}Failed to install python3-pip using apt${NC}"
+            else
+                echo -e "${RED}Unsupported system. This deployment only supports Ubuntu 22/24 and RHEL 9.4${NC}"
                 exit 1
             fi
-        else
-            echo -e "${RED}Unsupported system. This deployment only supports Ubuntu 22/24 and RHEL 9.4${NC}"
-            exit 1
         fi
     fi
 
diff --git a/core/lib/system/setup-env.sh b/core/lib/system/setup-env.sh
index 9df77aa6..7fa65cba 100644
--- a/core/lib/system/setup-env.sh
+++ b/core/lib/system/setup-env.sh
@@ -15,22 +15,61 @@ setup_initial_env() {
         echo "Skipping system prerequisites check due to --skip-check argument."
     fi
 
+    # In airgap mode, configure apt to use JFrog as the Debian/Ubuntu mirror
+    # so Kubespray's apt update and apt install steps do not reach the internet.
+    if [[ "$airgap_enabled" == "yes" ]] && command -v apt &> /dev/null; then
+        echo "Configuring apt to use JFrog Artifactory as Debian mirror (airgap mode)..."
+        # jfrog_url already contains the /artifactory path, so only the scheme is stripped
+        local jfrog_apt_base="http://${jfrog_username}:${jfrog_password}@$(echo "${jfrog_url}" | sed 's|^https\?://||')/ei-debian-virtual"
+        sudo tee /etc/apt/sources.list > /dev/null << EOF
+deb [trusted=yes] ${jfrog_apt_base} jammy main restricted universe multiverse
+deb [trusted=yes] ${jfrog_apt_base} jammy-updates main restricted universe multiverse
+deb [trusted=yes] ${jfrog_apt_base} jammy-security main restricted universe multiverse
+EOF
+        echo -e "${GREEN}apt configured to use JFrog at ${jfrog_url}/ei-debian-virtual${NC}"
+        echo "Refreshing apt package lists from JFrog..."
+        sudo apt-get update -qq
+        echo -e "${GREEN}apt package lists updated from JFrog${NC}"
+    fi
+
     if [[ -n "$https_proxy" ]]; then
         git config --global http.proxy "$https_proxy"
         git config --global https.proxy "$https_proxy"
     fi
 
     if [ ! -d "$KUBESPRAYDIR" ]; then
-        git clone https://github.com/kubernetes-sigs/kubespray.git $KUBESPRAYDIR
-        if [ $? -ne 0 ] || [ ! -d "$KUBESPRAYDIR/.git" ]; then
-            echo -e "${RED}----------------------------------------------------------------------------${NC}"
-            echo -e "${RED}| NOTICE: Failed to clone Kubespray Repository.                            |${NC}"
-            echo -e "${RED}| Unable to proceed with Inference Stack Deployment                        |${NC}"
-            echo -e "${RED}| due to missing dependency                                                |${NC}"
-            echo -e "${RED}----------------------------------------------------------------------------${NC}"
-            exit 1
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            echo "Downloading kubespray from JFrog Artifactory (airgap mode)..."
+            kubespray_tarball="/tmp/kubespray.tar.gz"
+            if curl -sf -u "${jfrog_username}:${jfrog_password}" \
+                -o "${kubespray_tarball}" \
+                "${jfrog_url}/ei-generic-binaries/kubespray.tar.gz"; then
+                tar -xzf "${kubespray_tarball}" -C "$(dirname "$KUBESPRAYDIR")"
+                if [ ! -d "$KUBESPRAYDIR" ]; then
+                    echo -e "${RED}Failed to extract kubespray tarball — expected directory: $KUBESPRAYDIR${NC}"
+                    exit 1
+                fi
+                cd $KUBESPRAYDIR
+            else
+                echo -e "${RED}----------------------------------------------------------------------------${NC}"
+                echo -e "${RED}| NOTICE: Failed to download Kubespray from JFrog.                         |${NC}"
+                echo -e "${RED}| Ensure kubespray.tar.gz is uploaded to ei-generic-binaries.             |${NC}"
+                echo -e "${RED}----------------------------------------------------------------------------${NC}"
+                exit 1
+            fi
+        else
+            git clone https://github.com/kubernetes-sigs/kubespray.git $KUBESPRAYDIR
+            if [ $? -ne 0 ] || [ ! -d "$KUBESPRAYDIR/.git" ]; then
+                echo -e "${RED}----------------------------------------------------------------------------${NC}"
+                echo -e "${RED}| NOTICE: Failed to clone Kubespray Repository.                            |${NC}"
+                echo -e "${RED}| Unable to proceed with Inference Stack Deployment                        |${NC}"
+                echo -e "${RED}| due to missing dependency                                                |${NC}"
+                echo -e "${RED}----------------------------------------------------------------------------${NC}"
+                exit 1
+            fi
+            cd $KUBESPRAYDIR
+            git checkout "$kubespray_version"
         fi
-        cd $KUBESPRAYDIR
-        git checkout "$kubespray_version"
     else
         echo "Kubespray directory already exists, skipping clone."
         cd $KUBESPRAYDIR
@@ -43,17 +82,62 @@ setup_initial_env() {
 
     VENVDIR="$KUBESPRAYDIR/venv"
     REMOTEDIR="/tmp/helm-charts"
     if [ ! -d "$VENVDIR" ]; then
-        echo "Installing python3-venv package..."
-        if command -v apt &> /dev/null; then
-            python_version=$($python3_interpreter -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-            sudo apt install -y python${python_version}-venv || sudo apt install -y python3-venv
+        if [[ "$airgap_enabled" != "yes" ]]; then
+            echo "Installing python3-venv package..."
+            if command -v apt &> /dev/null; then
+                python_version=$($python3_interpreter -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+                sudo apt install -y python${python_version}-venv || sudo apt install -y python3-venv
+            fi
         fi
-        if $python3_interpreter -m venv $VENVDIR; then
-            echo "Virtual environment created within Kubespray directory."
+        if [[ "$airgap_enabled" == "yes" ]]; then
+            # In airgap mode ensurepip is unavailable (python3-pip-whl not installed).
+            # Create the venv without pip, then bootstrap pip from the JFrog wheel.
+            if ! $python3_interpreter -m venv --without-pip $VENVDIR; then
+                echo -e "${RED}Failed to create virtual environment.${NC}"
+                exit 1
+            fi
+            echo "Virtual environment created (without-pip). Bootstrapping pip from JFrog..."
+            local pip_whl_url="${jfrog_url}/ei-generic-binaries/pip.whl"
+            local tmp_pip_whl="/tmp/pip-bootstrap.whl"
+            if ! curl -f -s -u "${jfrog_username}:${jfrog_password}" \
+                -o "$tmp_pip_whl" "$pip_whl_url" 2>/dev/null; then
+                echo -e "${RED}Failed to download pip wheel from JFrog at ${pip_whl_url}${NC}"
+                exit 1
+            fi
+            # Rename to proper wheel filename using metadata inside the zip
+            proper_name=$($python3_interpreter -c "
+import zipfile, sys
+try:
+    z = zipfile.ZipFile('$tmp_pip_whl')
+    wf = next(x for x in z.namelist() if x.endswith('.dist-info/WHEEL'))
+    base = wf.split('/')[0].replace('.dist-info', '')
+    meta = {}
+    for line in z.read(wf).decode().splitlines():
+        if ': ' in line:
+            k, v = line.split(': ', 1)
+            meta[k] = v
+    tag = meta.get('Tag', 'py3-none-any')
+    print(f'{base}-{tag}.whl')
+except Exception:
+    sys.exit(1)
+" 2>/dev/null)
+            if [ -n "$proper_name" ]; then
+                mv "$tmp_pip_whl" "/tmp/$proper_name"
+                tmp_pip_whl="/tmp/$proper_name"
+            fi
+            if ! PYTHONPATH="$tmp_pip_whl" $VENVDIR/bin/python3 -m pip install \
+                --no-index "$tmp_pip_whl"; then
+                echo -e "${RED}Failed to bootstrap pip inside virtual environment.${NC}"
+                exit 1
+            fi
+            echo "pip bootstrapped successfully inside virtual environment."
+        else
-            echo -e "${RED}Failed to create virtual environment.${NC}"
-            exit 1
+            if ! $python3_interpreter -m venv $VENVDIR; then
+                echo -e "${RED}Failed to create virtual environment.${NC}"
+                exit 1
+            fi
         fi
+        echo "Virtual environment created within Kubespray directory."
     else
         echo "Virtual environment already exists within Kubespray directory, skipping creation."
     fi
@@ -72,8 +156,16 @@ setup_initial_env() {
     fi
 
     export PIP_BREAK_SYSTEM_PACKAGES=1
-    $VENVDIR/bin/python3 -m pip install --upgrade pip
-    $VENVDIR/bin/python3 -m pip install -U -r requirements.txt
+    if [[ "$airgap_enabled" == "yes" ]]; then
+        jfrog_host="${jfrog_url#*://}"
+        jfrog_host="${jfrog_host%%/*}"
+        pip_extra_args="--index-url http://${jfrog_username}:${jfrog_password}@${jfrog_host}/artifactory/api/pypi/ei-pypi-virtual/simple --trusted-host ${jfrog_host}"
+        $VENVDIR/bin/python3 -m pip install --upgrade pip $pip_extra_args
+        $VENVDIR/bin/python3 -m pip install -U -r requirements.txt $pip_extra_args
+    else
+        $VENVDIR/bin/python3 -m pip install --upgrade pip
+        $VENVDIR/bin/python3 -m pip install -U -r requirements.txt
+    fi
 
     echo "Verifying Ansible Installation..."
     if $VENVDIR/bin/python3 -c "import ansible" &> /dev/null; then
@@ -98,7 +191,79 @@ setup_initial_env() {
     xeon_values_file_path="$REMOTEDIR/vllm/xeon-values.yaml"
     cp "$HOMEDIR"/inventory/metadata/addons.yml $KUBESPRAYDIR/inventory/mycluster/group_vars/k8s_cluster/addons.yml
     cp "$HOMEDIR"/inventory/metadata/all.yml $KUBESPRAYDIR/inventory/mycluster/group_vars/all/all.yml
-    cp -r "$HOMEDIR"/roles/* $KUBESPRAYDIR/roles/
+    if [[ "$airgap_enabled" == "yes" ]] && [ -f "$HOMEDIR/inventory/metadata/offline.yml" ]; then
+        cp "$HOMEDIR"/inventory/metadata/offline.yml $KUBESPRAYDIR/inventory/mycluster/group_vars/all/offline.yml
+        # Replace any hardcoded IP:8082 in the copied files with the actual JFrog
+        # host from jfrog_url, so the repo can be reused across environments without
+        # manual IP edits
+        local _jfrog_host
+        _jfrog_host=$(echo "$jfrog_url" | sed 's|https\?://||' | sed 's|/.*||')
+        # Replace placeholder (fresh copies) AND any stale real IP (reruns after JFrog IP change)
+        for _f in "$KUBESPRAYDIR/inventory/mycluster/group_vars/all/all.yml" \
+            "$KUBESPRAYDIR/inventory/mycluster/group_vars/all/offline.yml"; do
+            sed -i "s|JFROG_HOST:8082|$_jfrog_host|g" "$_f"
+            sed -i -E "s|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:8082|$_jfrog_host|g" "$_f"
+        done
+        # Inject JFrog credentials into files_repo so Kubespray can authenticate
+        # when downloading binaries (anonymous access is not enabled for generic repos)
+        sed -i "s|files_repo: \"http://|files_repo: \"http://${jfrog_username}:${jfrog_password}@|g" \
+            "$KUBESPRAYDIR/inventory/mycluster/group_vars/all/offline.yml"
+        # If credentials were already injected on a prior run the pattern above won't match.
+        # Normalise by replacing the credentialled URL to ensure the current password is used.
+        sed -i "s|files_repo: \"http://[^@]*@[0-9.]*:[0-9]*/artifactory|files_repo: \"http://${jfrog_username}:${jfrog_password}@${_jfrog_host}/artifactory|g" \
+            "$KUBESPRAYDIR/inventory/mycluster/group_vars/all/offline.yml"
+    fi
+    # In airgap mode, force kube_version in k8s_cluster group_vars to match the version
+    # cached in JFrog. group_vars/k8s_cluster/ has higher Ansible precedence than
+    # group_vars/all/ so offline.yml's kube_version pin is silently ignored without this.
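+    # Example (illustrative): offline.yml pins kube_version: v1.30.4; a newer default
+    # in k8s-cluster.yml would otherwise take precedence and request binaries that
+    # are not cached in JFrog.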
+ if [[ "$airgap_enabled" == "yes" ]]; then + local _k8s_cluster_yml="$KUBESPRAYDIR/inventory/mycluster/group_vars/k8s_cluster/k8s-cluster.yml" + local _kube_ver + _kube_ver=$(grep '^kube_version:' "$HOMEDIR/inventory/metadata/offline.yml" 2>/dev/null | awk '{print $2}') + if [ -n "$_kube_ver" ] && [ -f "$_k8s_cluster_yml" ]; then + if grep -q "^kube_version:" "$_k8s_cluster_yml"; then + sed -i "s|^kube_version:.*|kube_version: ${_kube_ver}|" "$_k8s_cluster_yml" + else + echo "kube_version: ${_kube_ver}" >> "$_k8s_cluster_yml" + fi + echo "Pinned kube_version: ${_kube_ver} in k8s_cluster group_vars (airgap mode)" + fi + fi + + # In airgap mode, patch containerd hosts.toml.j2 so every mirror host includes + # Basic auth credentials. containerd's anonymous Bearer token flow fails when + # JFrog anonymous access is restricted — injecting credentials directly bypasses it. + if [[ "$airgap_enabled" == "yes" ]]; then + local _tmpl="$KUBESPRAYDIR/roles/container-engine/containerd/templates/hosts.toml.j2" + if [ -f "$_tmpl" ] && ! grep -q "jfrog_username" "$_tmpl"; then + echo "Patching containerd hosts.toml.j2 with JFrog auth header (airgap mode)..." + cat > /tmp/patch_hosts_toml.py << 'PYEOF' +import sys +path = sys.argv[1] +content = open(path).read() +auth_block = ( + "{%- if airgap_enabled | default(false) | bool and jfrog_username is defined and jfrog_username != '' %}\n" + " [host.\"{{ mirror.host }}\".header]\n" + " Authorization = [\"Basic {{ (jfrog_username + ':' + jfrog_password) | b64encode }}\"]\n" + "{%- endif %}\n" +) +for marker in ('{%- endfor %}', '{% endfor %}'): + idx = content.rfind(marker) + if idx != -1: + content = content[:idx] + auth_block + content[idx:] + open(path, 'w').write(content) + print(f"Patched {path} with JFrog auth header") + break +else: + print(f"WARNING: endfor not found in {path} — skipping patch") +PYEOF + python3 /tmp/patch_hosts_toml.py "$_tmpl" + else + [ ! -f "$_tmpl" ] && echo -e "${YELLOW}hosts.toml.j2 not found at $_tmpl — skipping auth patch${NC}" + fi + fi + + cp -r "$HOMEDIR"/roles/* $KUBESPRAYDIR/roles/ mkdir -p "$KUBESPRAYDIR/config" chmod +x $HOMEDIR/scripts/generate-vault-secrets.sh @@ -158,7 +323,23 @@ setup_initial_env() { echo "Infrastructure readiness check completed successfully." gaudi2_values_file_path="$REMOTEDIR/vllm/gaudi-values.yaml" gaudi3_values_file_path="$REMOTEDIR/vllm/gaudi3-values.yaml" - ansible-galaxy collection install community.kubernetes + if [[ "$airgap_enabled" == "yes" ]]; then + echo "Installing Ansible collections from JFrog Artifactory (airgap mode)..." + for coll_entry in "kubernetes-core:kubernetes.core" "ansible-posix:ansible.posix" "community-kubernetes:community.kubernetes" "community-general:community.general"; do + coll_file="${coll_entry%%:*}" + coll_name="${coll_entry##*:}" + tarball_url="${jfrog_url}/ei-generic-binaries/ansible-collections/${coll_file}-latest.tar.gz" + tmp_file="/tmp/${coll_file}.tar.gz" + echo "Installing ${coll_name} from JFrog..." 
+            if curl -sf -u "${jfrog_username}:${jfrog_password}" -o "${tmp_file}" "${tarball_url}"; then
+                ansible-galaxy collection install "${tmp_file}" --force
+            else
+                echo -e "${YELLOW}Warning: ${coll_name} not found in JFrog at ${tarball_url} — skipping${NC}"
+            fi
+        done
+    else
+        ansible-galaxy collection install kubernetes.core community.general ansible.posix
+    fi
 }
diff --git a/core/lib/xeon/ballon-policy.sh b/core/lib/xeon/ballon-policy.sh
index 69069fa6..9fc6bf51 100644
--- a/core/lib/xeon/ballon-policy.sh
+++ b/core/lib/xeon/ballon-policy.sh
@@ -12,11 +12,12 @@ deploy_nri_balloons_playbook() {
         exit 1
     fi
 
-    if [ "$deploy_nri_balloon_policy" == "yes" ] || [ "$cpu_or_gpu" == "c" ]; then
+    if [ "$deploy_nri_balloon_policy" == "yes" ]; then
        echo "${GREEN}Deploying CPU optimization with topology detection and NRI balloon policy${NC}"
         ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-cpu-optimization.yml \
             --extra-vars "cpu_playbook=true" \
-            --extra-vars "kubernetes_platform=${kubernetes_platform}"
+            --extra-vars "kubernetes_platform=${kubernetes_platform}" \
+            --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
         if [ $? -eq 0 ]; then
             echo "${GREEN}CPU optimization deployed successfully${NC}"
         else
diff --git a/core/playbooks/deploy-cluster-config.yml b/core/playbooks/deploy-cluster-config.yml
index bcd66d3f..d46ebe1f 100644
--- a/core/playbooks/deploy-cluster-config.yml
+++ b/core/playbooks/deploy-cluster-config.yml
@@ -15,7 +15,7 @@
     - role: inference-tools
   tasks:
     - name: Create TLS cert for Admin dashboard
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: v1
@@ -30,7 +30,7 @@
       register: kubectl_output
       tags: deploy_cluster_dashboard
     - name: Create Admin dashboard ingress
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: networking.k8s.io/v1
@@ -61,7 +61,7 @@
       when: brownfield_deployment != "yes"
       tags: deploy_cluster_dashboard
     - name: Create ServiceAccount for Admin Dashboard
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: v1
@@ -71,7 +71,7 @@
           namespace: kube-system
       tags: deploy_cluster_dashboard
    - name: Create ClusterRole for Kubernetes Dashboard
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: rbac.authorization.k8s.io/v1
@@ -97,7 +97,7 @@
             verbs: ["get", "create", "update"]
       tags: deploy_cluster_dashboard
     - name: Create ClusterRoleBinding for Admin Dashboard
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: rbac.authorization.k8s.io/v1
diff --git a/core/playbooks/deploy-cpu-optimization.yml b/core/playbooks/deploy-cpu-optimization.yml
index 078140f3..56edd97f 100644
--- a/core/playbooks/deploy-cpu-optimization.yml
+++ b/core/playbooks/deploy-cpu-optimization.yml
@@ -7,6 +7,7 @@
   gather_facts: true
   run_once: true
   vars_files:
+    - "{{ lookup('env', 'PWD') }}/config/vars/inference_common.yml"
     - "{{ lookup('env', 'PWD') }}/config/vars/inference_llm_models.yml"
     - "{{ lookup('env', 'PWD') }}/config/inference_env.yml"
   vars:
diff --git a/core/playbooks/deploy-genai-gateway.yml b/core/playbooks/deploy-genai-gateway.yml
index d4394f6e..81d469f2 100644
--- a/core/playbooks/deploy-genai-gateway.yml
+++ b/core/playbooks/deploy-genai-gateway.yml
@@ -29,7 +29,7 @@
         name: genai-gateway
       run_once: true
     - name: Create TLS secret for GenAI Gateway
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: v1
@@ -109,11 +109,61 @@
         helm dependency update {{ remote_helm_charts_base }}/genai-gateway
       register: genai_gateway_deps
       run_once: true
+      when: not airgap_enabled | bool
+
+    - name: Ensure genai-gateway charts directory exists (airgap)
+      ansible.builtin.file:
+        path: "{{ remote_helm_charts_base }}/genai-gateway/charts"
+        state: directory
+        mode: "0755"
+      run_once: true
+      when: airgap_enabled | bool
+
+    - name: Add JFrog Helm repo for genai-gateway dependencies (airgap)
+      ansible.builtin.command: >
+        helm repo add ei-helm-deps {{ jfrog_url }}/ei-helm-virtual
+        --username {{ jfrog_username }} --password {{ jfrog_password }}
+        --force-update
+      run_once: true
+      when: airgap_enabled | bool
+      changed_when: false
+
+    - name: Patch genai-gateway Chart.yaml - replace OCI repo with JFrog HTTP (airgap)
+      ansible.builtin.replace:
+        path: "{{ remote_helm_charts_base }}/genai-gateway/Chart.yaml"
+        regexp: 'repository: oci://registry-1\.docker\.io/bitnamicharts'
+        replace: "repository: {{ jfrog_url }}/ei-helm-virtual"
+      run_once: true
+      when: airgap_enabled | bool
+
+    - name: Patch genai-gateway Chart.yaml - pin postgresql version (airgap)
+      ansible.builtin.replace:
+        path: "{{ remote_helm_charts_base }}/genai-gateway/Chart.yaml"
+        regexp: 'version: ">=13\.3\.0"'
+        replace: 'version: "16.7.4"'
+      run_once: true
+      when: airgap_enabled | bool
+
+    - name: Patch genai-gateway Chart.yaml - pin redis version (airgap)
+      ansible.builtin.replace:
+        path: "{{ remote_helm_charts_base }}/genai-gateway/Chart.yaml"
+        regexp: 'version: ">=18\.0\.0"'
+        replace: 'version: "21.1.3"'
+      run_once: true
+      when: airgap_enabled | bool
+
+    - name: Update genai-gateway helm dependencies (airgap via JFrog)
+      ansible.builtin.command: >
+        helm dependency update {{ remote_helm_charts_base }}/genai-gateway
+      register: genai_gateway_deps
+      run_once: true
+      when: airgap_enabled | bool
 
     - name: Install GenAI Gateway System
       command: >
         helm upgrade --install genai-gateway {{ remote_helm_charts_base }}/genai-gateway
         --namespace genai-gateway --create-namespace
+        --set global.security.allowInsecureImages=true
         --set env.LITELLM_MASTER_KEY={{ litellm_master_key }}
         --set env.LITELLM_SALT_KEY={{ litellm_salt_key }}
         --set env.LANGFUSE_SECRET_KEY={{ langfuse_secret_key }}
@@ -158,17 +208,27 @@
       failed_when: pod_status.rc != 0 and pod_status.stdout != "0"
       run_once: true
 
-    - name: Add GenAI Gateway Trace Repository
+    - name: Add GenAI Gateway Trace Repository (internet)
       command: >
         helm repo add langfuse https://langfuse.github.io/langfuse-k8s
       run_once: true
+      when: not airgap_enabled | bool
+
+    - name: Add GenAI Gateway Trace Repository (airgap via JFrog)
+      command: >
+        helm repo add langfuse {{ helm_repo_langfuse }}
+        --username {{ jfrog_username }} --password {{ jfrog_password }}
+        --force-update
+      run_once: true
+      when: airgap_enabled | bool
+      changed_when: false
 
     - name: Update GenAI Gateway Trace Repositories
       command: >
         helm repo update
       run_once: true
     - name: Create TLS secret for GenAI Gateway
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         definition:
           apiVersion: v1
diff --git a/core/playbooks/deploy-habana-ai-operator.yml b/core/playbooks/deploy-habana-ai-operator.yml
index 8dc26935..8c759304 100644
--- a/core/playbooks/deploy-habana-ai-operator.yml
+++ b/core/playbooks/deploy-habana-ai-operator.yml
@@ -13,12 +13,12 @@
     - role: inference-tools
   tasks:
     - name: Create Kubernetes Namespace for Habana AI Operator
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         kind: Namespace
         name: habana-ai-operator
     - name: Label namespace for pod Security Enforcement
-      community.kubernetes.k8s:
+      kubernetes.core.k8s:
         state: present
         kind: Namespace
         name: habana-ai-operator
@@ -29,7 +29,7 @@
             pod-security.kubernetes.io/audit: privileged
            pod-security.kubernetes.io/warn: privileged
     - name: Add Gaudi Repository
-      community.kubernetes.helm_repository:
+      kubernetes.core.helm_repository:
         name: gaudi-helm
         repo_url: https://vault.habana.ai/artifactory/api/helm/gaudi-helm
         state: present
@@ -40,7 +40,7 @@
       run_once: true
 
     - name: Deploy Habana AI Operator
-      community.kubernetes.helm:
+      kubernetes.core.helm:
         name: habana-ai-operator
         chart_ref: gaudi-helm/habana-ai-operator
         release_namespace: habana-ai-operator
diff --git a/core/playbooks/deploy-inference-models.yml b/core/playbooks/deploy-inference-models.yml
index ee370268..385e34ac 100644
--- a/core/playbooks/deploy-inference-models.yml
+++ b/core/playbooks/deploy-inference-models.yml
@@ -745,12 +745,14 @@
           {% endif %}
           {% if cpu_playbook == 'true' %}
           --values {{ remote_helm_charts_base }}/vllm/xeon-values.yaml
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
           --set memory="{{ optimal_memory_gb | default(8) }}Gi"
           --set tensor_parallel_size={{ optimal_balloon_config.tensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ optimal_balloon_config.pipeline_parallel_size | default(1) }}
+          {% endif %}
           {% elif gaudi_deployment|lower == "true" %}
           --set tensor_parallel_size={{ huggingface_tensor_parellel_size }}
           --values {{ gaudi_values_file }}
@@ -1998,6 +2000,7 @@
           --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
           --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
           {% if cpu_playbook == 'true' %}
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
@@ -2005,6 +2008,7 @@
           --set tensor_parallel_size={{ tensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
           {% endif %}
+          {% endif %}
           {% if apisix_enabled %}
           --set apisix.enabled={{ apisix_enabled }}
           --set platform={{ kubernetes_platform }}
@@ -2028,6 +2032,10 @@
           --set oidc.client_id={{ keycloak_client_id | default('') }}
           --set oidc.client_secret={{ client_secret | default('') }}
           {% endif %}
+          {% if airgap_enabled | default(false) | bool %}
+          --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1"
+          --set defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1"
+          {% endif %}
           {{ helm_proxy_args | default('') }}
           --force
       register: helm_upgrade_install_model_deployment_cpu_llama8b
@@ -2101,6 +2109,7 @@
           --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
           --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
           {% if cpu_playbook == 'true' %}
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
@@ -2108,6 +2117,7 @@
           --set tensor_parallel_size={{ model_paralletensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
           {% endif %}
+          {% endif %}
           {% if apisix_enabled %}
           --set apisix.enabled={{ apisix_enabled }}
           --set platform={{ kubernetes_platform }}
@@ -2131,6 +2141,10 @@
           --set oidc.client_id={{ keycloak_client_id | default('') }}
           --set oidc.client_secret={{ client_secret | default('') }}
           {% endif %}
+          {% if airgap_enabled | default(false) | bool %}
+          --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1"
+          --set defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1"
+          {% endif %}
           {{ helm_proxy_args | default('') }}
           --force
       register: helm_upgrade_install_model_deployment_cpu_deepseek_qwen32b
@@ -2203,6 +2217,7 @@
           --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
           --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
           {% if cpu_playbook == 'true' %}
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
@@ -2210,6 +2225,7 @@
           --set tensor_parallel_size={{ tensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
           {% endif %}
+          {% endif %}
           {% if apisix_enabled %}
           --set apisix.enabled={{ apisix_enabled }}
           --set platform={{ kubernetes_platform }}
@@ -2233,6 +2249,10 @@
           --set oidc.client_id={{ keycloak_client_id | default('') }}
           --set oidc.client_secret={{ client_secret | default('') }}
           {% endif %}
+          {% if airgap_enabled | default(false) | bool %}
+          --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1"
+          --set defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1"
+          {% endif %}
           {{ helm_proxy_args | default('') }}
           --force
      register: helm_upgrade_install_model_deployment_cpu_deepseek_llama_8b
@@ -2308,6 +2328,7 @@
           --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
           --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
           {% if cpu_playbook == 'true' %}
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
@@ -2315,6 +2336,7 @@
           --set tensor_parallel_size={{ tensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
           {% endif %}
+          {% endif %}
           {% if apisix_enabled %}
           --set apisix.enabled={{ apisix_enabled }}
           --set platform={{ kubernetes_platform }}
@@ -2338,6 +2360,10 @@
           --set oidc.client_id={{ keycloak_client_id | default('') }}
           --set oidc.client_secret={{ client_secret | default('') }}
           {% endif %}
+          {% if airgap_enabled | default(false) | bool %}
+          --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1"
+          --set defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1"
+          {% endif %}
           {{ helm_proxy_args | default('') }}
           --force
       register: helm_upgrade_install_model_deployment_cpu_llama3_2_3b_cpu
@@ -2418,6 +2444,7 @@
           --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
           --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
           {% if cpu_playbook == 'true' %}
+          {% if enable_cpu_balloons | default(false) | bool %}
           --set cpu_balloon_annotation="vllm-balloon"
           --set podLabels.name="vllm"
           --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
@@ -2425,6 +2452,7 @@
           --set tensor_parallel_size={{ tensor_parallel_size | default(1) }}
           --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
           {% endif %}
+          {% endif %}
           {% if apisix_enabled %}
           --set apisix.enabled={{ apisix_enabled }}
           --set platform={{ kubernetes_platform }}
@@ -2448,6 +2476,10 @@
           --set oidc.client_id={{ keycloak_client_id | default('') }}
           --set oidc.client_secret={{ client_secret | default('') }}
           {% endif %}
+          {% if airgap_enabled | default(false) | bool %}
+          --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1"
+          --set
defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1" + {% endif %} {{ helm_proxy_args | default('') }} --force register: helm_upgrade_install_model_deployment_cpu_llama3_2_3b_cpu @@ -2523,6 +2555,7 @@ --set svcmonitor.enabled="{{ vllm_metrics_enabled }}" --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }} {% if cpu_playbook == 'true' %} + {% if enable_cpu_balloons | default(false) | bool %} --set cpu_balloon_annotation="vllm-balloon" --set podLabels.name="vllm" --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}" @@ -2530,6 +2563,7 @@ --set tensor_parallel_size={{ tensor_parallel_size | default(1) }} --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }} {% endif %} + {% endif %} {% if apisix_enabled %} --set apisix.enabled={{ apisix_enabled }} --set platform={{ kubernetes_platform }} @@ -2553,6 +2587,10 @@ --set oidc.client_id={{ keycloak_client_id | default('') }} --set oidc.client_secret={{ client_secret | default('') }} {% endif %} + {% if airgap_enabled | default(false) | bool %} + --set defaultModelConfigs.configMapValues.HF_HUB_OFFLINE="1" + --set defaultModelConfigs.configMapValues.TRANSFORMERS_OFFLINE="1" + {% endif %} {{ helm_proxy_args | default('') }} --force register: helm_upgrade_install_model_deployment_cpu_llama3_2_3b_cpu @@ -2697,6 +2735,7 @@ ansible.builtin.shell: cmd: "helm list --short | grep 'vllm-'" register: inference_models + failed_when: inference_models.rc not in [0, 1] when: list_model_true == 'true' - name: Print Installed Models in Comma Separated Format ansible.builtin.debug: diff --git a/core/playbooks/deploy-ingress-controller.yml b/core/playbooks/deploy-ingress-controller.yml index 5e0fd648..a68e12f4 100644 --- a/core/playbooks/deploy-ingress-controller.yml +++ b/core/playbooks/deploy-ingress-controller.yml @@ -11,11 +11,20 @@ roles: - role: inference-tools tasks: - - name: Add the Ingress-NGINX Helm repository - community.kubernetes.helm_repository: + - name: Add the Ingress-NGINX Helm repository (internet) + kubernetes.core.helm_repository: name: ingress-nginx repo_url: https://kubernetes.github.io/ingress-nginx state: present + when: not airgap_enabled | bool + + - name: Add the Ingress-NGINX Helm repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add ingress-nginx {{ helm_repo_ingress_nginx }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: airgap_enabled | bool + changed_when: false - name: Validate if the Helm repositories are configured correctly ansible.builtin.command: helm repo list register: helm_repo_list @@ -51,18 +60,21 @@ run_once: true - name: Deploy Ingress Nginx Controller - community.kubernetes.helm: + kubernetes.core.helm: name: ingress-nginx chart_ref: ingress-nginx/ingress-nginx release_namespace: ingress-nginx create_namespace: true chart_version: "{{ ingress_controller | default('4.12.2') }}" state: present + force: true values: controller: progressDeadlineSeconds: 300 minReadySeconds: 0 replicaCount: "{{ inference_infra_replica_count | int }}" + image: + digest: "{{ '' if airgap_enabled | default(false) | bool else omit }}" hostPort: enabled: true ports: @@ -94,6 +106,10 @@ matchLabels: app: ingress-nginx topologyKey: "kubernetes.io/hostname" + admissionWebhooks: + patch: + image: + digest: "{{ '' if airgap_enabled | default(false) | bool else omit }}" run_once: true - name: Pause to Allow Controller to Initialize pause: @@ -102,7 +118,8 @@ shell: | kubectl get pods -n ingress-nginx -o json | jq -r ' .items[] | - 
select(.status.phase != "Running" or (.status.containerStatuses[] | select(.ready != true))) | + select(.status.phase != "Running" and .status.phase != "Succeeded") | + select(.status.containerStatuses == null or (.status.containerStatuses[] | select(.ready != true))) | .metadata.name' | wc -l register: pod_status until: pod_status.stdout == "0" and pod_status.rc == 0 diff --git a/core/playbooks/deploy-keycloak-controller.yml b/core/playbooks/deploy-keycloak-controller.yml index de0220c6..b6db9358 100644 --- a/core/playbooks/deploy-keycloak-controller.yml +++ b/core/playbooks/deploy-keycloak-controller.yml @@ -12,12 +12,20 @@ roles: - role: inference-tools tasks: - - name: Add Ingress-Nginx repository - community.kubernetes.helm_repository: + - name: Add Ingress-Nginx repository (internet) + kubernetes.core.helm_repository: name: ingress-nginx repo_url: https://kubernetes.github.io/ingress-nginx state: present - when: delete_pv_on_purge == "no" + when: delete_pv_on_purge == "no" and not airgap_enabled | bool + + - name: Add Ingress-Nginx repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add ingress-nginx {{ helm_repo_ingress_nginx }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: delete_pv_on_purge == "no" and airgap_enabled | bool + changed_when: false - name: Verify repository availability ansible.builtin.command: helm repo list register: helm_repo_list diff --git a/core/playbooks/deploy-keycloak-service.yml b/core/playbooks/deploy-keycloak-service.yml index 30219569..f5385cb6 100644 --- a/core/playbooks/deploy-keycloak-service.yml +++ b/core/playbooks/deploy-keycloak-service.yml @@ -11,11 +11,20 @@ roles: - role: inference-tools tasks: - - name: Add ingress-nginx repository using Helm module - community.kubernetes.helm_repository: + - name: Add ingress-nginx repository using Helm module (internet) + kubernetes.core.helm_repository: name: ingress-nginx repo_url: https://kubernetes.github.io/ingress-nginx state: present + when: not airgap_enabled | bool + + - name: Add ingress-nginx repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add ingress-nginx {{ helm_repo_ingress_nginx }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: airgap_enabled | bool + changed_when: false - name: Add Ingress-Nginx repository ansible.builtin.command: helm repo list register: helm_repo_list diff --git a/core/playbooks/deploy-keycloak-tls-cert.yml b/core/playbooks/deploy-keycloak-tls-cert.yml index 65836378..073fe3c5 100644 --- a/core/playbooks/deploy-keycloak-tls-cert.yml +++ b/core/playbooks/deploy-keycloak-tls-cert.yml @@ -24,7 +24,7 @@ debug: var: cert_file, key_file, secret_name - name: Create TLS secret for Keycloak - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -38,7 +38,7 @@ tls.key: "{{ lookup('file', key_file) | b64encode }}" register: kubectl_output - name: Create TLS secret for Keycloak in APISIX namespace - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -75,13 +75,26 @@ namespace: genai-gateway ignore_errors: true + - name: Add ei-helm repository for Keycloak (airgap via JFrog) + ansible.builtin.command: > + helm repo add ei-helm {{ jfrog_url }}/ei-helm-virtual + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: airgap_enabled | bool and deploy_keycloak == "yes" + changed_when: false + + - name: Update Helm repositories for Keycloak 
(airgap) + ansible.builtin.command: helm repo update + when: airgap_enabled | bool and deploy_keycloak == "yes" + changed_when: false + - name: Deploy Keycloak System run_once: true register: helm_output when: deploy_keycloak == "yes" - community.kubernetes.helm: + kubernetes.core.helm: name: keycloak - chart_ref: oci://registry-1.docker.io/bitnamicharts/keycloak + chart_ref: "{{ 'ei-helm/keycloak' if airgap_enabled | bool else 'oci://registry-1.docker.io/bitnamicharts/keycloak' }}" release_namespace: default # Set the namespace where Keycloak will be installed create_namespace: true chart_version: "{{ keycloak_chart_version|default('22.1.0') }}" @@ -202,7 +215,7 @@ app: keycloak topologyKey: "kubernetes.io/hostname" - name: Verify Keycloak StatefulSet readiness - community.kubernetes.k8s_info: + kubernetes.core.k8s_info: kind: StatefulSet namespace: default name: keycloak @@ -368,13 +381,68 @@ path: "{{ remote_helm_charts_base }}/keycloak/templates/ingress_eks.yaml" state: absent when: kubernetes_platform != "eks" - - name: Synchronize dependencies for APISIX + - name: Add APISIX Helm repository (internet) + ansible.builtin.command: > + helm repo add apisix https://charts.apiseven.com --force-update + when: deploy_apisix == "yes" and not airgap_enabled | bool + changed_when: false + + - name: Add APISIX Helm repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add apisix {{ helm_repo_apisix }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: deploy_apisix == "yes" and airgap_enabled | bool + changed_when: false + + - name: Update Helm repositories for APISIX + ansible.builtin.command: helm repo update + when: deploy_apisix == "yes" + changed_when: false + + - name: Synchronize dependencies for APISIX (internet) ansible.builtin.command: helm dependency update "{{ remote_helm_charts_base }}/apisix-helm/" register: helm_dependency_update failed_when: helm_dependency_update.rc != 0 - when: deploy_apisix == "yes" + when: + - deploy_apisix == "yes" + - not airgap_enabled | bool + + - name: Create charts directory for APISIX (airgap) + ansible.builtin.file: + path: "{{ remote_helm_charts_base }}/apisix-helm/charts" + state: directory + mode: '0755' + when: + - deploy_apisix == "yes" + - airgap_enabled | bool + + - name: Pull APISIX subchart from JFrog (airgap) + ansible.builtin.command: > + helm pull apisix/apisix --version 2.8.1 + --destination "{{ remote_helm_charts_base }}/apisix-helm/charts" + when: + - deploy_apisix == "yes" + - airgap_enabled | bool + changed_when: true + + - name: Patch APISIX Chart.yaml to use JFrog repository (airgap) + ansible.builtin.replace: + path: "{{ remote_helm_charts_base }}/apisix-helm/Chart.yaml" + regexp: 'repository: https://charts\.apiseven\.com' + replace: "repository: {{ helm_repo_apisix }}" + when: + - deploy_apisix == "yes" + - airgap_enabled | bool + + - name: Build APISIX dependencies from local charts (airgap) + ansible.builtin.command: helm dependency build "{{ remote_helm_charts_base }}/apisix-helm/" + when: + - deploy_apisix == "yes" + - airgap_enabled | bool + changed_when: true - name: Synchronize dependencies for APISIX - community.kubernetes.helm: + kubernetes.core.helm: name: auth-apisix chart_ref: "{{ remote_helm_charts_base }}/apisix-helm/" update_repo_cache: yes diff --git a/core/playbooks/deploy-observability-openshift.yml b/core/playbooks/deploy-observability-openshift.yml index a559a1e0..b0b438c5 100644 --- a/core/playbooks/deploy-observability-openshift.yml +++ 
b/core/playbooks/deploy-observability-openshift.yml @@ -37,7 +37,7 @@ tags: always - name: Create observability namespace - community.kubernetes.k8s: + kubernetes.core.k8s: name: observability api_version: v1 kind: Namespace @@ -51,7 +51,7 @@ - name: Enable OpenShift User Workload Monitoring run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -146,7 +146,7 @@ # ========================================================================= - name: Add Bitnami Helm repository tags: deploy_logging - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: bitnami repo_url: https://charts.bitnami.com/bitnami when: deploy_logging == "yes" @@ -167,7 +167,7 @@ tags: deploy_logging - name: Install Elasticsearch System (logging) - community.kubernetes.helm: + kubernetes.core.helm: name: logging-elasticsearch chart_ref: bitnami/elasticsearch release_namespace: observability @@ -217,7 +217,7 @@ tags: deploy_logging - name: Create Fluent Bit ClusterRoleBinding - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -236,7 +236,7 @@ tags: deploy_logging - name: Install Fluent Bit - community.kubernetes.helm: + kubernetes.core.helm: name: logging-fluentbit chart_ref: oci://registry-1.docker.io/bitnamicharts/fluent-bit release_namespace: observability @@ -251,7 +251,7 @@ tags: deploy_logging block: - name: Update Fluent Bit ConfigMap - community.kubernetes.k8s: + kubernetes.core.k8s: state: present src: "{{ remote_helm_charts_base }}/fluentbit/fluentbit-config.yml" kind: ConfigMap @@ -267,14 +267,14 @@ - name: Add Grafana Helm repo (Loki) run_once: true tags: deploy_observability - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: grafana repo_url: https://grafana.github.io/helm-charts - name: Add OpenTelemetry Helm repo run_once: true tags: deploy_observability - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: open-telemetry repo_url: https://open-telemetry.github.io/opentelemetry-helm-charts @@ -300,7 +300,7 @@ - name: Create ClusterRole for OTEL Collector log access run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -318,7 +318,7 @@ - name: Bind OTEL Collector ClusterRole run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -412,7 +412,7 @@ - name: Create Grafana ServiceAccount run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -424,7 +424,7 @@ - name: Grant Grafana access to Prometheus run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -443,7 +443,7 @@ - name: Deploy Grafana run_once: true tags: deploy_observability - community.kubernetes.helm: + kubernetes.core.helm: name: grafana chart_ref: grafana/grafana release_namespace: observability @@ -524,7 +524,7 @@ - name: Create Grafana ServiceAccount token secret run_once: true tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -716,7 +716,7 @@ - name: Create Grafana Route run_once: true tags: deploy_observability - community.kubernetes.k8s: 
+ kubernetes.core.k8s: state: present definition: apiVersion: route.openshift.io/v1 @@ -882,7 +882,7 @@ tags: deploy_observability - name: Apply Habana Metrics Service Monitor - community.kubernetes.k8s: + kubernetes.core.k8s: state: present src: "{{ remote_helm_charts_base }}/habana-exporter/habana-metrics.yml" when: gaudi_available diff --git a/core/playbooks/deploy-observability.yml b/core/playbooks/deploy-observability.yml index c1e5863a..5c99a1f3 100644 --- a/core/playbooks/deploy-observability.yml +++ b/core/playbooks/deploy-observability.yml @@ -14,7 +14,7 @@ - role: inference-tools tasks: - name: Create kubernetes namespace for Observability - community.kubernetes.k8s: + kubernetes.core.k8s: name: observability api_version: v1 kind: Namespace @@ -22,7 +22,7 @@ run_once: true tags: always - name: Add Observability repository - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: prometheus-community repo_url: https://prometheus-community.github.io/helm-charts tags: deploy_observability @@ -83,7 +83,7 @@ - name: Install Observability Stack tags: deploy_observability - community.kubernetes.helm: + kubernetes.core.helm: name: observability chart_ref: prometheus-community/kube-prometheus-stack release_namespace: observability @@ -153,7 +153,7 @@ run_once: true - name: Create TLS cert for observability tags: deploy_observability - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: v1 @@ -170,7 +170,7 @@ - name: Create Grafana observability ingress with ALB (EKS) tags: deploy_observability when: kubernetes_platform == "eks" - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: networking.k8s.io/v1 @@ -209,7 +209,7 @@ - name: Create Grafana observability ingress with nginx (non-EKS) tags: deploy_observability when: kubernetes_platform is not defined or kubernetes_platform != "eks" - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: networking.k8s.io/v1 @@ -239,7 +239,7 @@ - name: Add Bitnami Helm repository tags: deploy_logging - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: bitnami repo_url: https://charts.bitnami.com/bitnami when: deploy_logging == "yes" @@ -260,7 +260,7 @@ tags: deploy_logging - name: Install Elasticsearch System - community.kubernetes.helm: + kubernetes.core.helm: name: logging-elasticsearch chart_ref: bitnami/elasticsearch release_namespace: observability @@ -311,7 +311,7 @@ tags: deploy_logging - name: Create Fluent Bit ClusterRoleBinding - community.kubernetes.k8s: + kubernetes.core.k8s: state: present definition: apiVersion: rbac.authorization.k8s.io/v1 @@ -330,7 +330,7 @@ tags: deploy_logging - name: Install Fluent Bit - community.kubernetes.helm: + kubernetes.core.helm: name: logging-fluentbit chart_ref: oci://registry-1.docker.io/bitnamicharts/fluent-bit release_namespace: observability @@ -341,7 +341,7 @@ tags: deploy_logging - name: Update Fluent Bit ConfigMap - community.kubernetes.k8s: + kubernetes.core.k8s: state: present src: "{{ remote_helm_charts_base }}/fluentbit/fluentbit-config.yml" kind: ConfigMap @@ -360,14 +360,14 @@ - name: Add Grafana Helm repo (Loki) run_once: true tags: deploy_observability - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: name: grafana repo_url: https://grafana.github.io/helm-charts - name: Add OpenTelemetry Helm repo run_once: true tags: deploy_observability - community.kubernetes.helm_repository: + kubernetes.core.helm_repository: 
name: open-telemetry repo_url: https://open-telemetry.github.io/opentelemetry-helm-charts @@ -451,7 +451,7 @@ tags: deploy_observability - name: Apply Habana Metrics Service Monitor - community.kubernetes.k8s: + kubernetes.core.k8s: state: present src: "{{ remote_helm_charts_base }}/habana-exporter/habana-metrics.yml" when: gaudi_available diff --git a/core/playbooks/register-model-genai-gateway.yml b/core/playbooks/register-model-genai-gateway.yml index 12f51e24..96a7b43a 100644 --- a/core/playbooks/register-model-genai-gateway.yml +++ b/core/playbooks/register-model-genai-gateway.yml @@ -20,7 +20,8 @@ restartPolicy: Never containers: - name: register-model - image: curlimages/curl:latest + image: docker.io/library/nginx:1.25.2-alpine + imagePullPolicy: IfNotPresent command: ["/bin/sh", "-c"] args: - | diff --git a/core/roles/container-engine/containerd/templates/hosts.toml.j2 b/core/roles/container-engine/containerd/templates/hosts.toml.j2 new file mode 100644 index 00000000..a46bb997 --- /dev/null +++ b/core/roles/container-engine/containerd/templates/hosts.toml.j2 @@ -0,0 +1,13 @@ +server = "{{ item.server | default("https://" + item.prefix) }}" +{% for mirror in item.mirrors %} +[host."{{ mirror.host }}"] + capabilities = ["{{ ([ mirror.capabilities ] | flatten ) | join('","') }}"] + {% if mirror.skip_verify | default(false) %} + skip_verify = true + {% endif %} + override_path = {{ mirror.override_path | default('false') | string | lower }} + {% if airgap_enabled | default(false) | bool and jfrog_username is defined and jfrog_password is defined %} + [host."{{ mirror.host }}".header] + Authorization = ["Basic {{ (jfrog_username + ':' + jfrog_password) | b64encode }}"] + {% endif %} +{% endfor %} diff --git a/core/roles/inference-tools/tasks/main.yml b/core/roles/inference-tools/tasks/main.yml index 12801161..28e75795 100644 --- a/core/roles/inference-tools/tasks/main.yml +++ b/core/roles/inference-tools/tasks/main.yml @@ -1,27 +1,69 @@ # Copyright (C) 2025-2026 Intel Corporation # SPDX-License-Identifier: Apache-2.0 --- -- name: Ensure Python pip module is installed +- name: Ensure Python pip module is installed (internet) ansible.builtin.package: name: python3-pip state: present become: true + when: not airgap_enabled | default(false) | bool tags: always -- name: Install Kubernetes Python SDK + +- name: Ensure Python pip module is installed (airgap) + ansible.builtin.shell: | + python3 -m pip --version &>/dev/null && exit 0 + apt-get install -y --fix-missing python3-pip 2>&1 || true + args: + executable: /bin/bash + become: true + when: airgap_enabled | default(false) | bool + tags: always +- name: Install Kubernetes Python SDK (internet) ansible.builtin.pip: name: kubernetes state: present become: true ignore_errors: true register: pip_install_result + when: not airgap_enabled | default(false) | bool tags: always -- name: Install Kubernetes Python SDK Fallback + +- name: Install Kubernetes Python SDK (airgap via JFrog) + ansible.builtin.pip: + name: kubernetes + state: present + extra_args: >- + --index-url http://{{ jfrog_username }}:{{ jfrog_password }}@{{ jfrog_url | regex_replace('^https?://', '') }}/api/pypi/ei-pypi-virtual/simple + --trusted-host {{ jfrog_url | regex_replace('^https?://', '') | regex_replace('/.*$', '') }} + become: true + ignore_errors: true + register: pip_install_result + when: airgap_enabled | default(false) | bool + tags: always + +- name: Install Kubernetes Python SDK Fallback (internet) ansible.builtin.pip: name: kubernetes state: present extra_args: 
"--break-system-packages" become: true - when: pip_install_result is failed + when: + - not airgap_enabled | default(false) | bool + - pip_install_result is failed + tags: always + +- name: Install Kubernetes Python SDK Fallback (airgap via JFrog) + ansible.builtin.pip: + name: kubernetes + state: present + extra_args: >- + --break-system-packages + --index-url http://{{ jfrog_username }}:{{ jfrog_password }}@{{ jfrog_url | regex_replace('^https?://', '') }}/api/pypi/ei-pypi-virtual/simple + --trusted-host {{ jfrog_url | regex_replace('^https?://', '') | regex_replace('/.*$', '') }} + become: true + when: + - airgap_enabled | default(false) | bool + - pip_install_result is failed tags: always - name: Deploy fix script for kubernetes SDK no_proxy bug ansible.builtin.copy: @@ -61,54 +103,57 @@ changed_when: _no_proxy_fix.rc == 2 failed_when: _no_proxy_fix.rc not in [0, 2] tags: always -- name: Deploy fix script for kubernetes SDK no_proxy bug - ansible.builtin.copy: - dest: /tmp/_fix_k8s_no_proxy.py - mode: '0644' - content: | - import re, sys, os - path = sys.argv[1] - with open(path, 'r') as f: - original = f.read() - # Remove the duplicate self.no_proxy = None that appears after the no_proxy - # env-loading block (bug introduced in kubernetes SDK >= 34.x by code generator). - # Pattern: the env-loading line is followed within 3 lines by a bare self.no_proxy = None - fixed = re.sub( - r'(if os\.getenv\("no_proxy"\)[^\n]+\n(?:.*\n){1,3}?)(\s+self\.no_proxy = None\n)', - r'\1', - original - ) - if fixed == original: - print("OK: no duplicate no_proxy line found, nothing to do") - sys.exit(0) - with open(path, 'w') as f: - f.write(fixed) - print("FIXED: removed duplicate self.no_proxy = None from " + path) - sys.exit(2) - become: true - tags: always -- name: Fix kubernetes SDK no_proxy bug (duplicate self.no_proxy = None after env-loading block) +- name: Install Deployment Client (internet) ansible.builtin.shell: | - set -e - KUBE_CFG=$(python3 -c "import kubernetes, os; print(os.path.join(os.path.dirname(kubernetes.__file__), 'client', 'configuration.py'))" 2>/dev/null) || exit 0 - python3 /tmp/_fix_k8s_no_proxy.py "$KUBE_CFG" + curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash args: executable: /bin/bash become: true - register: _no_proxy_fix - changed_when: _no_proxy_fix.rc == 2 - failed_when: _no_proxy_fix.rc not in [0, 2] + when: not airgap_enabled | default(false) | bool tags: always -- name: Install Deployment Client + +- name: Install Deployment Client (airgap via JFrog) ansible.builtin.shell: | - curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + if ! 
command -v helm &>/dev/null; then + curl -sfL -u {{ jfrog_username }}:{{ jfrog_password }} \ + {{ jfrog_url }}/ei-generic-binaries/get.helm.sh/helm-v3.15.4-linux-amd64.tar.gz \ + -o /tmp/helm.tar.gz && \ + tar xzf /tmp/helm.tar.gz -C /tmp && \ + mv /tmp/linux-amd64/helm /usr/local/bin/helm && \ + chmod +x /usr/local/bin/helm && \ + rm -f /tmp/helm.tar.gz + fi args: executable: /bin/bash become: true + when: airgap_enabled | default(false) | bool tags: always -- name: Ensure jq is installed +- name: Ensure jq is installed (internet) ansible.builtin.package: name: jq state: present become: true + when: not airgap_enabled | default(false) | bool + tags: always + +- name: Ensure jq is installed (airgap via JFrog) + ansible.builtin.shell: | + if command -v jq &>/dev/null; then exit 0; fi + cd /tmp + for pkg in libonig5 libjq1 jq; do + deb=$(curl -s -u {{ jfrog_username }}:{{ jfrog_password }} \ + "{{ jfrog_url }}/ei-generic-binaries/apt-debs/" | grep -o "${pkg}[^\"]*\.deb" | head -1) + if [ -z "$deb" ]; then + echo "ERROR: could not find $pkg deb in JFrog" + exit 1 + fi + curl -sfL -u {{ jfrog_username }}:{{ jfrog_password }} \ + -o "/tmp/${deb}" \ + "{{ jfrog_url }}/ei-generic-binaries/apt-debs/${deb}" + dpkg -i "/tmp/${deb}" || true + done + args: + executable: /bin/bash + become: true + when: airgap_enabled | default(false) | bool tags: always diff --git a/core/roles/nri_cpu_balloons/tasks/install_nri.yaml b/core/roles/nri_cpu_balloons/tasks/install_nri.yaml index b83248c8..8d3207a4 100644 --- a/core/roles/nri_cpu_balloons/tasks/install_nri.yaml +++ b/core/roles/nri_cpu_balloons/tasks/install_nri.yaml @@ -61,7 +61,45 @@ - "Installing NRI support with runtime-specific configuration..." - "Runtime patching required: {{ (containerd_needs_patch | default(false)) or (crio_needs_patch | default(false)) }}" -- name: Add NRI plugins Helm repository +- name: Check if NRI plugin section already exists in containerd config + ansible.builtin.shell: grep -c "io.containerd.nri.v1.nri" /etc/containerd/config.toml || true + become: true + register: nri_already_configured + changed_when: false + when: runtime_name == 'containerd' + +- name: Enable NRI in containerd config + ansible.builtin.blockinfile: + path: /etc/containerd/config.toml + marker: "# {mark} ANSIBLE MANAGED - NRI" + block: | + [plugins."io.containerd.nri.v1.nri"] + disable = false + become: true + register: containerd_nri_config + when: + - runtime_name == 'containerd' + - (nri_already_configured.stdout | default('0') | int) == 0 + +- name: Restart containerd to apply NRI config + ansible.builtin.systemd: + name: containerd + state: restarted + become: true + when: + - runtime_name == 'containerd' + - containerd_nri_config.changed + +- name: Wait for containerd to be ready after restart + ansible.builtin.wait_for: + path: /run/containerd/containerd.sock + state: present + timeout: 60 + when: + - runtime_name == 'containerd' + - containerd_nri_config.changed + +- name: Add NRI plugins Helm repository (internet) kubernetes.core.helm_repository: name: "{{ nri_cpu_balloons.helm.repo_name }}" repo_url: "{{ nri_cpu_balloons.helm.repo_url }}" @@ -70,6 +108,15 @@ HTTP_PROXY: "{{ http_proxy | default(ansible_env.HTTP_PROXY | default('')) }}" HTTPS_PROXY: "{{ https_proxy | default(ansible_env.HTTPS_PROXY | default('')) }}" NO_PROXY: "{{ no_proxy | default(ansible_env.NO_PROXY | default('')) }}" + when: not airgap_enabled | default(false) | bool + +- name: Add NRI plugins Helm repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add {{ 
nri_cpu_balloons.helm.repo_name }} {{ helm_repo_nri_plugins }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: airgap_enabled | default(false) | bool + changed_when: false - name: Update Helm repositories for NRI ansible.builtin.command: helm repo update diff --git a/core/roles/nri_cpu_balloons/tasks/install_nri_openshift.yaml b/core/roles/nri_cpu_balloons/tasks/install_nri_openshift.yaml index 5a12129d..e3653dfd 100644 --- a/core/roles/nri_cpu_balloons/tasks/install_nri_openshift.yaml +++ b/core/roles/nri_cpu_balloons/tasks/install_nri_openshift.yaml @@ -180,7 +180,7 @@ # ============================================================================ # STEP 3: Deploy NRI using Helm # ============================================================================ -- name: Add NRI plugins Helm repository +- name: Add NRI plugins Helm repository (internet) kubernetes.core.helm_repository: name: "{{ nri_cpu_balloons.helm.repo_name }}" repo_url: "{{ nri_cpu_balloons.helm.repo_url }}" @@ -194,6 +194,16 @@ register: helm_repo_result until: helm_repo_result is succeeded run_once: true + when: not airgap_enabled | default(false) | bool + +- name: Add NRI plugins Helm repository (airgap via JFrog) + ansible.builtin.command: > + helm repo add {{ nri_cpu_balloons.helm.repo_name }} {{ helm_repo_nri_plugins }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + when: airgap_enabled | default(false) | bool + changed_when: false + run_once: true - name: Update Helm repositories for NRI ansible.builtin.command: helm repo update diff --git a/core/roles/nri_cpu_balloons/tasks/main.yaml b/core/roles/nri_cpu_balloons/tasks/main.yaml index aae9f7b1..8ec2e285 100644 --- a/core/roles/nri_cpu_balloons/tasks/main.yaml +++ b/core/roles/nri_cpu_balloons/tasks/main.yaml @@ -281,7 +281,7 @@ tags: - install -- name: Ensure NRI plugins Helm repository is available +- name: Ensure NRI plugins Helm repository is available (internet) kubernetes.core.helm_repository: name: "{{ nri_cpu_balloons.helm.repo_name }}" repo_url: "{{ nri_cpu_balloons.helm.repo_url }}" @@ -296,7 +296,23 @@ until: helm_repo_result is succeeded delegate_to: "{{ groups['kube_control_plane'][0] }}" run_once: true - when: not vllm_balloon_exists or balloon_needs_update | default(false) + when: + - not airgap_enabled | default(false) | bool + - not vllm_balloon_exists or balloon_needs_update | default(false) + tags: + - install + +- name: Ensure NRI plugins Helm repository is available (airgap via JFrog) + ansible.builtin.command: > + helm repo add {{ nri_cpu_balloons.helm.repo_name }} {{ helm_repo_nri_plugins }} + --username {{ jfrog_username }} --password {{ jfrog_password }} + --force-update + delegate_to: "{{ groups['kube_control_plane'][0] }}" + run_once: true + changed_when: false + when: + - airgap_enabled | default(false) | bool + - not vllm_balloon_exists or balloon_needs_update | default(false) tags: - install diff --git a/core/scripts/habana/config.toml b/core/scripts/habana/config.toml index 282670f7..025682e4 100644 --- a/core/scripts/habana/config.toml +++ b/core/scripts/habana/config.toml @@ -48,7 +48,7 @@ oom_score = 0 [plugins."io.containerd.grpc.v1.cri".registry] [plugins."io.containerd.grpc.v1.cri".registry.mirrors] [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"] - endpoint = ["https://registry-1.docker.io"] + endpoint = ["http://100.67.152.212:8082/ei-docker-virtual"] [plugins."io.containerd.runtime.v1.linux"] runtime = "habana-container-runtime" 
diff --git a/docs/ingress-architecture.md b/docs/ingress-architecture.md new file mode 100644 index 00000000..e8087608 --- /dev/null +++ b/docs/ingress-architecture.md @@ -0,0 +1,220 @@ +# Ingress & Routing Architecture + +This document explains how traffic is routed to vLLM models in Enterprise Inference (EI), +covering both the automated EI deployment path and direct `helm install` deployment. + +--- + +## Table of Contents +- [Components Overview](#components-overview) +- [EI Deployment Architecture](#ei-deployment-architecture) +- [Direct Helm Deployment Architecture](#direct-helm-deployment-architecture) +- [Side-by-Side Comparison](#side-by-side-comparison) +- [The Dual Ingress Template Issue](#the-dual-ingress-template-issue) +- [Helm Resources Created Per Model](#helm-resources-created-per-model) + +--- + +## Components Overview + +| Component | Role | +|-----------|------| +| **nginx Ingress Controller** | Terminates TLS, routes external traffic by hostname/path | +| **APISIX Gateway** (`auth-apisix-gateway`) | API gateway — enforces OIDC auth, rewrites paths | +| **APISIX Ingress Controller** | Watches `ApisixRoute` objects and programs APISIX | +| **ApisixRoute** | Per-model routing rule: maps URL path → vLLM service | +| **K8s Ingress** (`ingress.yaml`) | Routes nginx → APISIX gateway for a specific model path | +| **vLLM Service** | ClusterIP service exposing the vLLM pod | + +--- + +## EI Deployment Architecture + +When deploying through EI (`inference_stack_deploy.sh`) with `deploy_keycloak_apisix=on`: + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ External Client │ +│ curl https://api.example.com/TinyLlama-1.1B-Chat-v1.0-vllmcpu/... │ +└───────────────────────┬─────────────────────────────────────────────┘ + │ HTTPS (port 443) + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ nginx Ingress Controller │ +│ Matches: host=api.example.com │ +│ path=/TinyLlama-1.1B-Chat-v1.0-vllmcpu/(.*) │ +│ Routes to: auth-apisix-gateway:80 │ +│ (Ingress in auth-apisix namespace, class: nginx) │ +└───────────────────────┬─────────────────────────────────────────────┘ + │ HTTP (port 80) + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ APISIX Gateway (auth-apisix-gateway) │ +│ - Validates Bearer token via Keycloak OIDC introspection │ +│ - Rewrites path: /TinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/... 
→ /v1/ │ +└───────────────────────┬─────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ ApisixRoute (per-model) │ +│ name: tinyllama-1-1b-cpu-vllm-apisixroute │ +│ namespace: default │ +│ Plugins: openid-connect, proxy-rewrite │ +└───────────────────────┬─────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ vLLM Service (ClusterIP) │ +│ tinyllama-1-1b-cpu-vllm-service:80 │ +└───────────────────────┬─────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ vLLM Pod │ +│ Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### What EI Does Before Running helm install + +EI's Ansible playbook (`deploy-inference-models.yml`) handles the dual ingress template +problem at the filesystem level **before** running `helm install`: + +**On vanilla Kubernetes (Xeon / non-EKS):** +``` +Ansible: "Remove EKS ingress template for VLLM when not on EKS" + → deletes core/helm-charts/vllm/templates/ingress_eks.yaml from remote machine + → helm install only renders ingress.yaml (class: nginx) ✓ +``` + +**On EKS:** +``` +Ansible: "Use EKS-specific ingress configuration for VLLM" + → copies ingress_eks.yaml → ingress.yaml (replaces it) + → deletes ingress_eks.yaml + → helm install only renders the ALB ingress ✓ +``` + +**On OpenShift:** +``` +Ansible: Removes both ingress.yaml and ingress_eks.yaml + → deploys an OpenShift Route instead (route.enabled=true) +``` + +### EI helm install (simplified, vanilla Kubernetes) + +```bash +helm install ./vllm \ + --values xeon-values.yaml \ + --set ingress.enabled=true \ + --set ingress.host=api.example.com \ + --set ingress.secretname=api.example.com \ + --set apisix.enabled=true \ + --set platform=vanilla \ + --set oidc.client_id= \ + --set oidc.client_secret= +``` + +> `ingress.enabled=true` is set only when `deploy_keycloak_apisix=on` in `inference-config.cfg`. + +--- + +## Direct Helm Deployment Architecture + +When running `helm install` directly **without** EI on vanilla Kubernetes: + +### The Problem — Both ingress templates render + +The vLLM chart has two ingress templates that both trigger when `ingress.enabled=true`: + +| Template | Condition | Class | +|----------|-----------|-------| +| `ingress.yaml` | `{{- if .Values.ingress.enabled }}` | `nginx` | +| `ingress_eks.yaml` | `{{- if .Values.ingress.enabled }}` | `alb` | + +Both create an Ingress with the **same name** in the **same namespace**. +`ingress_eks.yaml` renders last and overwrites `ingress.yaml` → ALB ingress wins. + +``` +helm install with ingress.enabled=true (no fix): + + ingress.yaml → tinyllama-1-1b-cpu-vllm-ingress (class: nginx) ← created first + ingress_eks.yaml → tinyllama-1-1b-cpu-vllm-ingress (class: alb) ← overwrites + ↑ + Result: ALB ingress — broken on vanilla k8s +``` + +### The Fix — Platform guard on ingress_eks.yaml + +Change the condition in `core/helm-charts/vllm/templates/ingress_eks.yaml` from: + +```yaml +{{- if .Values.ingress.enabled }} +``` + +to: + +```yaml +{{- if and .Values.ingress.enabled (eq .Values.platform "eks") }} +``` + +With `platform: vanilla` (default in `values.yaml`), `ingress_eks.yaml` never renders +on non-EKS clusters. Only `ingress.yaml` (nginx) renders → correct result. 
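+
+As a quick sanity check (the release name and flag values below are illustrative, not required by the chart), the chart can be rendered locally with `helm template` to confirm that only the nginx-class Ingress is produced once the guard is in place:
+
+```bash
+# Render the chart for a vanilla platform and look for the ingress class in the output;
+# with the guard applied, only the nginx class should appear (no "alb").
+helm template tinyllama-1-1b-cpu ./core/helm-charts/vllm \
+  --values ./core/helm-charts/vllm/xeon-values.yaml \
+  --set ingress.enabled=true \
+  --set ingress.host=api.example.com \
+  --set apisix.enabled=true \
+  --set platform=vanilla \
+  | grep -inE "class.*(nginx|alb)"
+```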
+ +``` +helm install with ingress.enabled=true (after fix): + + ingress.yaml → tinyllama-1-1b-cpu-vllm-ingress (class: nginx) ✓ + ingress_eks.yaml → skipped (platform=vanilla) ✓ +``` + +### Correct direct helm command + +```bash +helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \ + --values ./core/helm-charts/vllm/xeon-values.yaml \ + --set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0" \ + --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \ + --set ingress.enabled=true \ + --set ingress.host="${BASE_URL}" \ + --set ingress.secretname="${BASE_URL}" \ + --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \ + --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \ + --set apisix.enabled=true \ + --set tensor_parallel_size="1" \ + --set pipeline_parallel_size="1" +``` + +> **Note:** `ingress.host` must always be set even if `ingress.enabled=false`, because +> the `ApisixRoute` template uses it as the hostname: +> `{{ .Values.route.host | default .Values.ingress.host }}` + +--- + +## Side-by-Side Comparison + +| | EI Deployment | Direct Helm (after fix) | +|--|---------------|------------------------| +| **ingress.enabled** | `true` when `deploy_keycloak_apisix=on` | Set explicitly by user | +| **ingress_eks.yaml** | Deleted by Ansible before helm runs | Skipped by platform guard | +| **Ingress class** | `nginx` ✓ | `nginx` ✓ | +| **ApisixRoute created** | Yes (`apisix.enabled=true`) | Yes (`apisix.enabled=true`) | +| **Traffic path** | nginx → APISIX → vLLM | nginx → APISIX → vLLM | +| **OIDC auth** | Enforced by APISIX (openid-connect plugin) | Enforced by APISIX (openid-connect plugin) | +| **Why no ingress without fix** | Works (Ansible deletes EKS template) | ALB ingress created instead of nginx | + +--- + +## Helm Resources Created Per Model + +When `helm install` runs with `apisix.enabled=true` and `ingress.enabled=true`: + +| Resource | Kind | Namespace | Purpose | +|----------|------|-----------|---------| +| `-vllm-ingress` | Ingress (nginx) | `auth-apisix` | Routes nginx → APISIX gateway | +| `-vllm-apisixroute` | ApisixRoute | `default` | Routes APISIX → vLLM service | +| `-vllm-service` | Service (ClusterIP) | `default` | Exposes vLLM pod | +| `-vllm-secret` | Secret | `default` | OIDC client credentials for APISIX | +| `-vllm-configmap` | ConfigMap | `default` | vLLM environment variables | +| `-vllm` | Deployment | `default` | vLLM pod | +| `-vllm-pvc` | PersistentVolumeClaim | `default` | Model weights storage | diff --git a/third_party/Dell/air-gap/EI/single-node/air-gap-troubleshooting.md b/third_party/Dell/air-gap/EI/single-node/air-gap-troubleshooting.md new file mode 100644 index 00000000..31ac66ed --- /dev/null +++ b/third_party/Dell/air-gap/EI/single-node/air-gap-troubleshooting.md @@ -0,0 +1,559 @@ +# Airgapped Deployment - Troubleshooting Guide + +This document covers common failures encountered during airgapped deployment of Enterprise Inference, their root causes, and fixes. Issues are grouped by the stage at which they occur. + +--- + +## 1. Pre-flight / Prerequisites Stage + +### pip install fails - ensurepip not available + +**Symptom**: `python3 -m venv` fails or pip is not found after venv creation. + +**Root cause**: Ubuntu disables `ensurepip` by default. `python3-pip` cannot be installed via apt in airgap before the Debian mirror is configured. 
+ +**Fix**: Upload `pip.whl` to JFrog and bootstrap pip from it: +```bash +# On VM1 - download and upload pip wheel +pip download pip --no-deps -d /tmp/pip-dl/ +curl -u admin:password -T /tmp/pip-dl/pip-*.whl \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/pip.whl" +``` +The deployment script (`setup-env.sh`) handles the rest automatically - it downloads the wheel, reads the version from its WHEEL metadata, renames it to the proper format (e.g. `pip-26.0.1-py3-none-any.whl`), and installs it. + +--- + +### pip install fails - package not found in JFrog PyPI + +**Symptom**: `pip install` fails with `404 Not Found` or `No matching distribution found` even though the package appears in the JFrog simple index. + +**Root cause**: JFrog's PyPI simple index lists the package name but the `.whl` file was never uploaded - only the index entry exists. + +**Fix**: Upload the missing wheel file physically to `ei-pypi-local`: +```bash +pip download <package>==<version> --no-deps -d /tmp/wheels/ +curl -u admin:password -T /tmp/wheels/<package>.whl \ + "http://100.67.152.212:8082/artifactory/ei-pypi-local/<package>.whl" +``` + +--- + +### Ansible collection install fails - galaxy.ansible.com unreachable + +**Symptom**: `ansible-galaxy collection install` hangs or fails with a connection error. + +**Root cause**: `galaxy.ansible.com` is not reachable in airgap. Collections must come from JFrog. + +**Fix**: Upload collection tarballs to `ei-generic-binaries/ansible-collections/` with the `-latest` suffix. `setup-env.sh` downloads them automatically: +```bash +ansible-galaxy collection download kubernetes.core:6.3.0 -p /tmp/ +curl -u admin:password -T /tmp/kubernetes-core-6.3.0.tar.gz \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/ansible-collections/kubernetes-core-latest.tar.gz" +``` + +> **Warning**: Files must use the `-latest` suffix (e.g. `kubernetes-core-latest.tar.gz`). Versioned filenames (e.g. `kubernetes-core-6.3.0.tar.gz`) are silently skipped by `setup-env.sh`. + +--- + +### `community.kubernetes` module not found + +**Symptom**: Playbook fails with: +``` +couldn't resolve module/action 'community.kubernetes.k8s' +``` + +**Root cause**: `community.kubernetes` is deprecated and not installed in airgap. The modern equivalent is `kubernetes.core`, which is installed via JFrog tarball. + +**Fix**: All EI playbooks have been migrated to `kubernetes.core.*`. If you see this error in a custom playbook, replace all occurrences: +```bash +sed -i 's/community\.kubernetes\./kubernetes.core./g' <playbook>.yml +``` + +--- + +### apt-get update fails or hangs in airgap + +**Symptom**: `apt-get update` hangs for 10-18 minutes then fails with connection timeout. + +**Root cause**: No Debian mirror is configured in JFrog, or `/etc/apt/sources.list` still points to `archive.ubuntu.com`. + +**Fix**: `setup-env.sh` automatically rewrites `/etc/apt/sources.list` to point to JFrog when `airgap_enabled=yes`. 
If running manually: +```bash +sudo tee /etc/apt/sources.list > /dev/null << EOF +deb [trusted=yes] http://admin:password@100.67.152.212:8082/artifactory/ei-debian-virtual jammy main restricted universe multiverse +deb [trusted=yes] http://admin:password@100.67.152.212:8082/artifactory/ei-debian-virtual jammy-updates main restricted universe multiverse +deb [trusted=yes] http://admin:password@100.67.152.212:8082/artifactory/ei-debian-virtual jammy-security main restricted universe multiverse +EOF +sudo apt-get update +``` + +--- + +### apt-get install returns 404 for .deb files + +**Symptom**: `apt-get install jq` (or any package) fails with `404 Not Found` even after `apt-get update` succeeds. + +**Root cause**: JFrog's Debian remote correctly proxies the package index (`Packages.gz`, `Release`) but returns 404 for actual `.deb` pool file downloads. This is a JFrog Debian remote limitation. + +**Fix**: Upload the required `.deb` files directly to `ei-generic-binaries/apt-debs/` and install via `dpkg`. The `inference-tools` role handles `jq` automatically in airgap mode. For other packages: +```bash +# On VM1 - download debs +apt-get download <package> +# Upload to JFrog +curl -u admin:password -T <package>.deb \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/apt-debs/<package>.deb" + +# On VM2 - install +curl -sfL -u admin:password \ + -o /tmp/<package>.deb \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/apt-debs/<package>.deb" +sudo dpkg -i /tmp/<package>.deb +``` + +--- + +### Kubespray clone fails - github.com unreachable + +**Symptom**: `git clone https://github.com/kubernetes-sigs/kubespray.git` fails. + +**Root cause**: GitHub is not reachable in airgap. + +**Fix**: Upload a kubespray tarball to JFrog before deploying. `setup-env.sh` downloads it automatically when `airgap_enabled=yes`: +```bash +# On VM1 +git clone https://github.com/kubernetes-sigs/kubespray.git +cd kubespray && git checkout v2.27.0 && cd .. +tar -czf kubespray.tar.gz kubespray/ +curl -u admin:password -T kubespray.tar.gz \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/kubespray.tar.gz" +``` + +--- + +### Windows CRLF line endings break bash scripts + +**Symptom**: Scripts fail with errors like `function not found` or `bad interpreter: No such file or directory` when copied from a Windows machine. + +**Root cause**: Windows adds `\r` (carriage return) to line endings. Bash interprets function names as `functionname\r` which does not match the call site. + +**Fix**: After copying files to VM2, strip CRLF: +```bash +find ~/Enterprise-Inference -name "*.sh" -o -name "*.yml" -o -name "*.yaml" -o -name "*.cfg" | \ + xargs sed -i 's/\r//' +``` + +--- + +## 2. Kubespray / Kubernetes Bootstrap Stage + +### Binary download fails - 404 from JFrog + +**Symptom**: Kubespray `download` role fails with: +``` +trying next host - response was http.StatusNotFound" host="100.67.152.212:8082" +``` + +**Root cause**: Kubespray constructs binary download URLs from component versions. The binary was not uploaded to JFrog with the matching path structure. + +**Fix**: Upload the missing binary to JFrog preserving the exact URL path. Check which binary failed and upload it: +```bash +# Example: missing kubelet +curl -LO https://dl.k8s.io/release/v1.30.4/bin/linux/amd64/kubelet +curl -u admin:password -T kubelet \ + "http://100.67.152.212:8082/artifactory/ei-generic-binaries/dl.k8s.io/release/v1.30.4/bin/linux/amd64/kubelet" +``` +See `core/inventory/metadata/offline.yml` for the full list of expected paths. 
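+
+Before re-running the Kubespray download step, it can help to confirm that the artifact now resolves at the exact path the download role will request (the kubelet path from the example above; adjust for whichever binary failed, and substitute your JFrog credentials):
+
+```bash
+# Expect HTTP 200 once the binary is in place; a 404 means the JFrog path still does not match
+curl -s -o /dev/null -w "%{http_code}\n" -u admin:password \
+  "http://100.67.152.212:8082/artifactory/ei-generic-binaries/dl.k8s.io/release/v1.30.4/bin/linux/amd64/kubelet"
+```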
+ +--- + +### containerd mirror not working - image pulls go to internet + +**Symptom**: Images are pulled from the original registry instead of JFrog. Confirmed by: `sudo journalctl -u containerd | grep "bytes read=0"`. + +**Root cause**: `hosts.toml` is misconfigured. Common mistakes: + +1. Mirror host URL missing `/v2/` prefix +2. `skip_verify` field present (even `skip_verify: false` breaks HTTP mirrors) +3. Mirror not listed under the correct registry + +**Fix**: Verify `hosts.toml` for each registry: +```bash +cat /etc/containerd/certs.d/docker.io/hosts.toml +``` +Expected format (no `skip_verify` field at all): +```toml +server = "https://docker.io" +[host."http://100.67.152.212:8082/v2/ei-docker-virtual"] + capabilities = ["pull", "resolve"] + override_path = true +``` + +Apply for all registries: +```bash +for reg in docker.io ghcr.io registry.k8s.io quay.io public.ecr.aws; do + sudo mkdir -p /etc/containerd/certs.d/$reg + sudo tee /etc/containerd/certs.d/$reg/hosts.toml <<EOF +server = "https://$reg" +[host."http://100.67.152.212:8082/v2/ei-docker-virtual"] + capabilities = ["pull", "resolve"] + override_path = true +EOF +done +sudo systemctl restart containerd +``` + +If a previously failed pull left a corrupted image or blob behind, remove it before retrying: +```bash +sudo crictl rmi $IMAGE 2>/dev/null; true +sudo ctr -n k8s.io images rm $IMAGE 2>/dev/null; true +# Delete the bad blob file directly (find the sha256 from the error log) +sudo rm -f /var/lib/containerd/io.containerd.content.v1.content/blobs/sha256/<sha256-digest> +sudo systemctl restart containerd +``` + +--- + +### Docker Hub rate limit hit when pre-caching images on VM1 + +**Symptom**: `docker pull` returns `toomanyrequests: You have reached your pull rate limit`. + +**Fix**: Authenticate with Docker Hub before pulling: +```bash +docker login -u <dockerhub-username> -p <access-token> +``` +Rotate the PAT after use. Docker Hub free accounts allow 100 pulls/6h unauthenticated, 200/6h authenticated. + +--- + +## 3. Helm / Application Deployment Stage + +### helm repo add fails - upstream URL unreachable + +**Symptom**: `helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx` fails with connection error. + +**Root cause**: Upstream Helm repo URLs are blocked in airgap. + +**Fix**: Use the JFrog virtual Helm repo instead. All EI playbooks handle this automatically when `airgap_enabled=yes`. If running helm manually: +```bash +helm repo add ingress-nginx http://100.67.152.212:8082/artifactory/ei-helm-virtual \ + --username admin --password password --force-update +helm repo update +``` + +--- + +### `helm dependency update` contacts internet in airgap + +**Symptom**: `helm dependency update` fails or hangs trying to contact `charts.apiseven.com`, `registry-1.docker.io`, or other upstream URLs. + +**Root cause**: `Chart.yaml` dependency entries contain hardcoded upstream `repository` URLs. Helm resolves these directly, bypassing registered repos and containerd mirrors. + +**Fix**: Use airgap-specific dependency resolution: +1. Pre-pull subchart tarballs from the registered JFrog repo +2. Place them in the `charts/` directory +3. Patch `Chart.yaml` to replace upstream URLs with the JFrog URL +4. Run `helm dependency build` instead of `helm dependency update` + +This is handled automatically by EI playbooks (`deploy-keycloak-tls-cert.yml`, `deploy-genai-gateway.yml`) when `airgap_enabled=yes`. + +--- + +### Keycloak chart install fails - OCI pull contacts docker.io + +**Symptom**: `helm install` with `chart_ref: oci://registry-1.docker.io/bitnamicharts/keycloak` fails in airgap. + +**Root cause**: Helm uses its own HTTP client for OCI pulls, bypassing containerd mirrors entirely. 
+
+**Fix**: Use the JFrog Helm repo instead of OCI:
+```bash
+helm repo add ei-helm http://100.67.152.212:8082/artifactory/ei-helm-virtual \
+  --username admin --password password --force-update
+helm install keycloak ei-helm/keycloak --version 22.1.0
+```
+EI playbooks do this automatically when `airgap_enabled=yes`.
+
+---
+
+### Helm install fails - chart not found in index
+
+**Symptom**: `helm search repo ei-helm` returns no results or does not show the expected charts.
+
+**Root cause**: `index.yaml` was not uploaded or is outdated after adding new charts.
+
+**Fix**: Regenerate and re-upload `index.yaml`:
+```bash
+cd /tmp/helm-charts-dir
+helm repo index . --url http://100.67.152.212:8082/artifactory/ei-helm-local
+curl -u admin:password -T index.yaml \
+  "http://100.67.152.212:8082/artifactory/ei-helm-local/index.yaml"
+helm repo update
+helm search repo ei-helm
+```
+
+---
+
+## 4. vLLM / Model Deployment Stage
+
+### vLLM pod stuck at 0/1 - HuggingFace network timeout
+
+**Symptom**: Pod shows `0/1 Running`. Logs stop after OMP thread binding with no model file activity. No crash, just silence.
+
+**Root cause**: `HF_HUB_OFFLINE` is not set. The HuggingFace Hub library makes network calls to `huggingface.co` to validate cached model metadata. These calls hang silently in airgap.
+
+**Fix**: Patch the pod's ConfigMap to set offline mode:
+```bash
+kubectl patch configmap <model>-config --type=merge \
+  -p '{"data":{"HF_HUB_OFFLINE":"1","TRANSFORMERS_OFFLINE":"1"}}'
+kubectl rollout restart deployment <model>
+```
+
+Permanent fix: add to `core/helm-charts/vllm/xeon-values.yaml`:
+```yaml
+defaultModelConfigs:
+  configMapValues:
+    HF_HUB_OFFLINE: "1"
+    TRANSFORMERS_OFFLINE: "1"
+```
+
+---
+
+### Model download stalls at ~8.8MB on pod restart
+
+**Symptom**: The vLLM pod restarts and the model download appears to start but stops at 8.8MB (metadata only). No further progress.
+
+**Root cause**: Stale `.lock` files in `/data/hub/.locks/` from a previous pod run prevent the download from resuming.
+
+**Fix**:
+```bash
+kubectl exec <vllm-pod> -- find /data/hub/.locks -type f -delete
+kubectl rollout restart deployment <model>
+```
+
+---
+
+### Model files not found with `HF_HUB_OFFLINE=1`
+
+**Symptom**: vLLM crashes with `OSError: We have no connection to the internet and we cannot find the cached files`.
+
+**Root cause**: Model files were manually copied to the PV but not in the HuggingFace Hub cache directory format (`/data/hub/models--<org>--<model>/snapshots/<revision>/`). The Hub library cannot locate them.
+
+**Fix**: Pre-populate the PV using `huggingface_hub.snapshot_download()`, which creates the correct directory structure:
+```python
+from huggingface_hub import snapshot_download
+
+# Point cache_dir at the hub cache root; the library then creates the
+# models--<org>--<model>/snapshots/<revision>/ layout itself
+snapshot_download(
+    'meta-llama/Llama-3.1-8B-Instruct',
+    cache_dir='/data/hub',
+    local_files_only=False,  # set True if the files are already present
+)
+```
+
+---
+
+### PV deleted and model data lost after pod restart
+
+**Symptom**: After a pod is deleted and recreated, the model starts downloading from scratch.
+
+**Root cause**: `local-path` PVs use the `Delete` reclaim policy by default. When the PVC is deleted, the PV and all data on disk are permanently removed.
+
+**Fix**: Patch active PVs to `Retain` immediately after pod creation:
+```bash
+kubectl patch pv <pv-name> -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
+```
+To reuse a Released PV after PVC deletion:
+```bash
+# Clear the claimRef so the PV becomes Available
+kubectl patch pv <pv-name> --type=json \
+  -p='[{"op":"remove","path":"/spec/claimRef"}]'
+# Create a new PVC referencing the PV by name
+```
+
+---
+
+## 5. NRI Balloon Policy Issues
+
+### NRI auto-enabled despite `deploy_nri_balloon_policy=no`
+
+**Symptom**: NRI balloon policy deploys on all CPU deployments regardless of the config setting.
+
+**Root cause**: Three compounding bugs:
+1. `parse-user-prompts.sh` silently auto-sets `deploy_nri_balloon_policy=yes` for any CPU deployment when the variable is unset
+2. `ballon-policy.sh` had an `|| [ "$cpu_or_gpu" == "c" ]` bypass that triggered NRI for all CPU deployments regardless of the flag
+3. `deploy-inference-models.yml` unconditionally passed `--set cpu_balloon_annotation` to all 7 model tasks
+
+**Fix**: Both code bugs have been fixed. Always set `deploy_nri_balloon_policy=no` explicitly in `inference-config.cfg` to suppress NRI.
+
+---
+
+### vLLM pods have stale NRI resource requests after NRI uninstall
+
+**Symptom**: vLLM pods show `cpu: 336` resource requests even after NRI is uninstalled. `helm upgrade --set cpu=""` does not clear them.
+
+**Root cause**: `helm upgrade` uses a strategic merge patch, which omits fields rather than removing them. The existing `cpu: 336` value is not cleared.
+
+**Fix**: Use a JSON patch to explicitly replace the resources field:
+```bash
+# Delete the ingress first (helm upgrade conflicts with the modified ingress)
+kubectl delete ingress <model>-ingress -n auth-apisix
+
+# Clear stale NRI resource requests
+kubectl patch deployment <model> -n default --type=json \
+  -p='[{"op":"replace","path":"/spec/template/spec/containers/0/resources","value":{}}]'
+
+# Upgrade with clean values
+helm upgrade <release> ./helm-charts/vllm --reuse-values \
+  --set cpu_balloon_annotation="" --set cpu="" --set tensor_parallel_size=1
+```
+
+---
+
+### NRI with TP=2 crashes with PyTorch assertion error
+
+**Symptom**: The vLLM pod crashes with a `ptr->thread_num == thread_num` assertion in the PyTorch OMP layer.
+
+**Root cause**: On asymmetric NUMA nodes (e.g. 85 vs 84 cores), NRI with `tensor_parallel_size=2` splits the balloon unevenly across NUMA nodes. PyTorch asserts that OMP thread counts are symmetric.
+
+**Fix**: Set `tensor_parallel_size: "1"` in `core/helm-charts/vllm/xeon-values.yaml`.
+
+---
+
+## 6. JFrog Configuration Issues
+
+### JFrog returns 404 to curl but image is shown as cached in UI
+
+**Symptom**: `curl http://100.67.152.212:8082/v2/ei-docker-virtual/library/nginx/manifests/1.25.2-alpine` returns 404, but the image appears in JFrog storage.
+
+**Root cause**: The JFrog v2 Docker API requires specific `Accept` headers to serve manifests. Plain curl without headers returns 404.
+
+**Fix**: Always use the Docker Accept headers when verifying the image cache:
+```bash
+curl -s -u admin:password \
+  -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+  -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
+  -o /dev/null -w "%{http_code}" \
+  "http://100.67.152.212:8082/v2/ei-docker-virtual/library/nginx/manifests/1.25.2-alpine"
+```
+
+---
+
+### JFrog remote repo fetches from internet despite being set to Offline
+
+**Symptom**: JFrog still serves new images/packages that were not previously cached after being set to Offline.
+
+**Root cause**: The repo was not actually saved as Offline, or the virtual repo routing picks up a different remote that is still Online.
+
+**Fix**: In the JFrog UI, verify each remote repo is set to Offline:
+- Admin → Repositories → Edit each remote → Advanced → Online/Offline → set to Offline
+- Check all remotes: `ei-docker-dockerhub`, `ei-docker-ecr`, `ei-docker-ghcr`, `ei-docker-k8s`, `ei-docker-quay`
+
+---
+
+### How to find which image caused a 404 during deployment
+
+The image name comes from Kubespray's defaults. Look for this pattern in the deployment log:
+```
+trying next host - response was http.StatusNotFound" host="100.67.152.212:8082"
+trying next host" error="...dial tcp...: i/o timeout" host=<upstream-registry>
+```
+
+Check `core/kubespray/roles/kubespray-defaults/defaults/main/download.yml` for the image name and tag. Pre-cache it on VM1:
+```bash
+# Set the relevant remote to Online in the JFrog UI first
+docker pull 100.67.152.212:8082/ei-docker-virtual/<image>:<tag>
+# Then set it back to Offline
+```
+
+---
+
+## 7. Verification Commands
+
+### Check JFrog is reachable from VM2
+```bash
+curl -s --max-time 5 http://100.67.152.212:8082/artifactory/api/system/ping && echo "JFrog OK" || echo "JFrog unreachable"
+```
+
+### Confirm internet is blocked on VM2
+```bash
+curl -s --max-time 5 https://google.com && echo "FAIL - internet open" || echo "OK - internet blocked"
+```
+
+### Confirm an image was pulled from JFrog (not the internet)
+```bash
+# bytes > 0 means the JFrog mirror was used
+sudo journalctl -u containerd --no-pager | grep "stop pulling image" | grep -v "bytes read=0"
+
+# Or watch the JFrog request log on VM1 for VM2's IP
+tail -f /var/opt/jfrog/artifactory/log/request.log | grep 100.67.153.209
+```
+
+### List all images cached in JFrog
+```bash
+curl -s -u admin:password \
+  http://100.67.152.212:8082/artifactory/api/docker/ei-docker-virtual/v2/_catalog | jq .repositories[]
+```
+
+### Check tags for a specific image
+```bash
+curl -s -u admin:password \
+  "http://100.67.152.212:8082/artifactory/api/docker/ei-docker-virtual/v2/library/nginx/tags/list" | jq .
+```
+
+### List all files in a generic repo path
+```bash
+curl -s -u admin:password \
+  "http://100.67.152.212:8082/artifactory/api/storage/ei-generic-binaries/ansible-collections" | jq '.children[].uri'
+```
+
+### List all PyPI packages in JFrog
+```bash
+curl -s -u admin:password \
+  "http://100.67.152.212:8082/artifactory/api/storage/ei-pypi-local" | jq '.children[].uri'
+```
diff --git a/third_party/Dell/air-gap/EI/single-node/air-gap.md b/third_party/Dell/air-gap/EI/single-node/air-gap.md
new file mode 100644
index 00000000..7c708aaf
--- /dev/null
+++ b/third_party/Dell/air-gap/EI/single-node/air-gap.md
@@ -0,0 +1,310 @@
+# Airgapped Deployment Guide
+
+This document is a continuation of the [JFrog Setup README](../../jfrog-setup/README.md).
+
+It assumes JFrog Artifactory is already installed on VM1, all repositories are created, and all assets (Docker images, Helm charts, PyPI packages, binaries, and the LLM model) have been uploaded. If you have not done that yet, complete the JFrog setup first.
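+
+A quick way to confirm those prerequisites from VM2 before starting. Both commands reuse the verification calls from the troubleshooting guide and assume the JFrog admin credentials from the setup README:
+
+```bash
+curl -s --max-time 5 http://<VM1-IP>:8082/artifactory/api/system/ping && echo "JFrog OK"
+curl -s -u admin:<jfrog-password> \
+  "http://<VM1-IP>:8082/artifactory/api/docker/ei-docker-virtual/v2/_catalog" | jq .repositories[]
+```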
+
+## Architecture
+
+```
+VM1 (internet-connected)              VM2 (airgapped)
+┌─────────────────────┐            ┌─────────────────────┐
+│ JFrog Artifactory   │◄──LAN────►│ EI Deployment       │
+│  :8082              │            │ Kubernetes + vLLM   │
+│ - Docker images     │            │                     │
+│ - Helm charts       │            │ No internet access  │
+│ - PyPI packages     │            │ All pulls → JFrog   │
+│ - Binaries          │            └─────────────────────┘
+│ - LLM models        │
+└─────────────────────┘
+```
+
+---
+
+## Step 1 - Block Internet on VM2
+
+Before deploying, verify and then block internet access on VM2. All traffic must go through JFrog on VM1.
+
+### Check current internet access
+
+```bash
+curl -s --max-time 5 https://google.com && echo "HAS INTERNET" || echo "NO INTERNET"
+curl -s --max-time 5 https://huggingface.co && echo "HAS INTERNET" || echo "NO INTERNET"
+```
+
+### Block internet (allow only LAN and loopback)
+
+Before running the iptables rules, find your LAN subnet and SSH client subnet:
+
+```bash
+# Your VM2 IP - the first two octets give you the LAN subnet
+hostname -I
+# Example output: 100.67.177.224 --> LAN subnet is 100.67.0.0/16
+
+# Your SSH client IP - use the first three octets as the subnet
+echo $SSH_CLIENT
+# Example output: 100.64.29.169 40047 22 --> SSH client subnet is 100.64.29.0/24
+```
+
+Use those values in the rules below. The rules must be added one at a time, in order -- each step inserts at a specific position, so do not skip any.
+
+Replace `<LAN_SUBNET>` with the first two octets of VM2's IP followed by `.0.0` (for example, if VM2 is `100.67.177.224`, use `100.67.0.0`).
+
+Replace `<SSH_SUBNET>` with the first three octets of the SSH client IP followed by `.0` (for example, if the client IP is `100.64.29.169`, use `100.64.29.0`).
+
+**Step 1 -- Install iptables-persistent before blocking internet.**
+Once the DROP rule is active, apt-get cannot reach the Ubuntu mirror. Install the package first, while internet is still open.
+
+```bash
+sudo apt-get -o Acquire::ForceIPv4=true install -y iptables-persistent
+```
+
+Note: the `-o Acquire::ForceIPv4=true` flag is needed because Ubuntu mirrors advertise IPv6 addresses. Without it, apt may try IPv6 first and hang.
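+
+Optionally, make the IPv4 preference permanent instead of passing the flag on each apt call. A one-line sketch using apt's standard `Acquire::ForceIPv4` option (the file name is arbitrary):
+
+```bash
+echo 'Acquire::ForceIPv4 "true";' | sudo tee /etc/apt/apt.conf.d/99force-ipv4
+```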
+
+**Step 2 -- Apply the iptables rules.**
+
+```bash
+sudo iptables -F OUTPUT
+sudo iptables -I OUTPUT 1 -m state --state ESTABLISHED,RELATED -j ACCEPT
+sudo iptables -I OUTPUT 2 -o lo -j ACCEPT
+sudo iptables -I OUTPUT 3 -d 127.0.0.0/8 -j ACCEPT
+sudo iptables -I OUTPUT 4 -d 10.0.0.0/8 -j ACCEPT
+sudo iptables -I OUTPUT 5 -d <LAN_SUBNET>/16 -j ACCEPT
+sudo iptables -I OUTPUT 6 -d <SSH_SUBNET>/24 -j ACCEPT
+sudo iptables -I OUTPUT 7 -d 192.168.0.0/16 -j ACCEPT
+sudo iptables -A OUTPUT -j DROP
+```
+
+**Step 3 -- Save the rules so they survive a reboot.**
+
+```bash
+sudo netfilter-persistent save
+```
+
+If `iptables-persistent` is not available on your system (for example, the Ubuntu mirror is unreachable), save the rules manually instead:
+
+```bash
+sudo mkdir -p /etc/iptables
+sudo iptables-save | sudo tee /etc/iptables/rules.v4
+
+sudo tee /etc/network/if-pre-up.d/iptables-restore > /dev/null << 'EOF'
+#!/bin/sh
+iptables-restore < /etc/iptables/rules.v4
+EOF
+sudo chmod +x /etc/network/if-pre-up.d/iptables-restore
+```
+
+### Verify airgap
+
+```bash
+curl -s --max-time 5 https://google.com && echo "FAIL - internet still open" || echo "OK - internet blocked"
+curl -s --max-time 5 http://<VM1-IP>:8082/artifactory/api/system/ping && echo "OK - JFrog reachable" || echo "FAIL - JFrog unreachable"
+```
+
+---
+
+## Step 2 - Copy the Enterprise Inference Repo to VM2
+
+From a machine with access to both the repo and VM2, clone the repository and check out the airgap branch:
+
+```bash
+git clone https://github.com/cld2labs/Enterprise-Inference.git
+cd Enterprise-Inference
+git checkout ei/airgapped
+```
+
+Then copy it to VM2:
+
+```bash
+scp -r ~/Enterprise-Inference user@<VM2-IP>:~/
+```
+
+Or copy via USB or shared storage if the environment is fully disconnected.
+
+After copying, log in to VM2 and strip Windows CRLF line endings (required if the files were edited on a Windows machine):
+
+```bash
+find ~/Enterprise-Inference -name "*.sh" -o -name "*.yml" -o -name "*.yaml" -o -name "*.cfg" | \
+  xargs sed -i 's/\r//'
+```
+
+---
+
+## Step 3 - Configure `inference-config.cfg`
+
+```bash
+vi ~/Enterprise-Inference/core/inventory/inference-config.cfg
+```
+
+Set the following values. Replace each placeholder with your actual values:
+
+```
+cluster_url=api.example.com
+cert_file=~/certs/cert.pem
+key_file=~/certs/key.pem
+keycloak_client_id=my-client-id
+keycloak_admin_user=your-keycloak-admin-user
+keycloak_admin_password=changeme
+hugging_face_token=<hf-token>          # Replace with your HuggingFace token
+hugging_face_token_falcon3=<hf-token>  # Replace with your HuggingFace token
+models=
+cpu_or_gpu=cpu
+vault_pass_code=place-holder-123
+deploy_kubernetes_fresh=on
+deploy_ingress_controller=on
+deploy_keycloak_apisix=on
+deploy_genai_gateway=off
+deploy_observability=off
+deploy_llm_models=on
+deploy_ceph=off
+deploy_istio=off
+uninstall_ceph=off
+deploy_nri_balloon_policy=no
+
+# ---------------------------------------------------------------------------
+# Airgap Configuration
+# Set airgap_enabled=on to route all pulls through JFrog on VM1.
+# ---------------------------------------------------------------------------
+airgap_enabled=on
+jfrog_url=http://<VM1-IP>:8082/artifactory
+jfrog_username=admin
+jfrog_password=<jfrog-password>
+```
+
+Replace the following placeholders with your own values before running the deployment:
+
+| Placeholder | What to replace with |
+|---|---|
+| `<VM1-IP>` | IP address of VM1 (the JFrog machine) |
+| `<jfrog-password>` | JFrog admin password set during the UI wizard in Step 2 |
+| `<hf-token>` | Your HuggingFace token with read access to the gated Llama models |
+
+### Apply single-node inventory
+
+```bash
+cp ~/Enterprise-Inference/docs/examples/single-node/hosts.yaml \
+   ~/Enterprise-Inference/core/inventory/hosts.yaml
+```
+
+Then update `ansible_user` to match the deployment user:
+
+```bash
+sed -i -E "/^[[:space:]]*master1:/,/^[[:space:]]{2}children:/ \
+  s/^([[:space:]]*ansible_user:[[:space:]]*).*/\1$(whoami)/" \
+  ~/Enterprise-Inference/core/inventory/hosts.yaml
+```
+
+### Generate SSL certificates
+
+```bash
+mkdir -p ~/certs
+openssl req -x509 -newkey rsa:4096 \
+  -keyout ~/certs/key.pem \
+  -out ~/certs/cert.pem \
+  -days 365 -nodes \
+  -subj '/CN=api.example.com'
+```
+
+These paths are referenced in `inference-config.cfg` as `cert_file` and `key_file`.
+
+### Add VM2 hosts entry for `api.example.com`
+
+```bash
+echo "$(hostname -I | awk '{print $1}') api.example.com" | sudo tee -a /etc/hosts
+```
+
+---
+
+## Step 4 - Run the Deployment
+
+```bash
+cd ~/Enterprise-Inference/core
+chmod +x inference-stack-deploy.sh
+./inference-stack-deploy.sh
+```
+
+The deployment will:
+1. Install prerequisites (pip from JFrog PyPI, Ansible collections from JFrog)
+2. Download Kubespray from JFrog
+3. Deploy Kubernetes via Kubespray (all binaries and images from JFrog)
+4. Deploy ingress-nginx, Keycloak, APISIX
+5. Deploy vLLM model pods
+
+### Monitor deployment
+
+```bash
+# Watch pods come up
+kubectl get pods -w
+
+# Check vLLM pod logs (model loading)
+kubectl logs <vllm-pod> --tail=20 | grep -v "OMP tid"
+```
+
+Expected pod states when complete:
+
+```
+keycloak-0               1/1  Running
+keycloak-postgresql-0    1/1  Running
+vllm-llama-8b-cpu-*      1/1  Running
+```
+
+---
+
+## Step 5 - Test Inference
+
+### Generate Keycloak token
+
+```bash
+cd ~/Enterprise-Inference/core
+. scripts/generate-token.sh
+```
+
+### Verify models are available
+
+```bash
+curl -s http://api.example.com:32353/Llama-3.1-8B-Instruct-vllmcpu/v1/models \
+  -H "Authorization: Bearer $TOKEN" | jq .
+```
+
+### Test inference
+
+```bash
+curl -k https://${BASE_URL}/Llama-3.1-8B-Instruct-vllmcpu/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+
+---
+
+## Additional Information
+
+### Validated Models in Airgap Mode
+
+The following models have been tested and validated end-to-end in airgap mode. These are
+the **only models that are confirmed to work** in an airgapped deployment. Other models
+may work if their weights and tokenizer files are uploaded to JFrog manually, but they
+have not been tested and are not supported in this configuration.
+
+| Model | HuggingFace ID | Approximate size | JFrog setup step |
+|---|---|---|---|
+| Llama 3.2 3B Instruct | `meta-llama/Llama-3.2-3B-Instruct` | ~6.5 GB | Step 3i |
+| Qwen3 0.6B | `Qwen/Qwen3-0.6B` | ~1.2 GB | Step 3j |
+| Qwen3 4B | `Qwen/Qwen3-4B` | ~7.6 GB | Step 3k |
+| Qwen3 1.7B | `Qwen/Qwen3-1.7B` | ~3.4 GB | Step 3l |
+
+> [!IMPORTANT]
+> Only the models listed above have been validated in airgap mode. Deploying a model not
+> listed here requires manually downloading its weights and uploading them to
+> `ei-generic-models` in JFrog before deployment. Untested models may fail at model
+> load time even if all other components deploy successfully.
+
+---
+
+For troubleshooting common failures, see [air-gap-troubleshooting.md](air-gap-troubleshooting.md).
diff --git a/third_party/Dell/air-gap/jfrog-setup/README.md b/third_party/Dell/air-gap/jfrog-setup/README.md
new file mode 100644
index 00000000..d128f0c6
--- /dev/null
+++ b/third_party/Dell/air-gap/jfrog-setup/README.md
@@ -0,0 +1,406 @@
+# JFrog Setup for Enterprise Inference Airgapped Deployment
+
+This guide walks you through setting up JFrog Artifactory on VM1 as a local mirror for
+Enterprise Inference airgapped deployments. Once JFrog is set up, VM2 (the airgapped machine)
+pulls all Docker images, Helm charts, Python packages, and binaries from JFrog instead of
+the internet.
+
+```
+┌─────────────────────┐            ┌─────────────────────┐
+│ VM1 (internet)      │    LAN     │ VM2 (airgapped)     │
+│ JFrog Artifactory   │◄─────────►│ EI Deployment       │
+│  :8082              │            │ Kubernetes + vLLM   │
+│                     │            │                     │
+│ - Docker images     │            │ No internet access  │
+│ - Helm charts       │            │ All pulls -> JFrog  │
+│ - Python packages   │            └─────────────────────┘
+│ - Binaries          │
+│ - LLM models        │
+└─────────────────────┘
+```
+
+### Scripts in this folder
+
+- **`jfrog-installation.sh`**: Installs all required tools and JFrog Artifactory on VM1
+- **`jfrog-setup.sh`**: Creates repositories, enables access, and uploads all assets to JFrog
+
+---
+
+## Prerequisites
+
+### System Requirements
+
+This airgap solution requires two machines. Both machines must be on the same network and
+must be able to reach each other over LAN. VM2 pulls all content from VM1 during deployment,
+so connectivity between them is required throughout the entire process.
+
+| Requirement | VM1 (JFrog machine) | VM2 (airgapped machine) |
+|---|---|---|
+| Purpose | Hosts JFrog Artifactory, downloads and stores all assets | Runs the Enterprise Inference stack (Kubernetes + vLLM) |
+| Internet access | Required (to download Docker images, models, binaries) | Not required (blocked after initial setup) |
+| Disk space | At least 80 GB free. This has been validated for downloading the Llama-3.2-3B, Qwen3-0.6B, Qwen3-1.7B, and Qwen3-4B models. If you plan to download additional or larger models, you will need more disk space. | At least 80 GB free (for Kubernetes, container images, and model storage) |
+| RAM | At least 8 GB | At least 64 GB (vLLM requires significant memory for CPU inference) |
+| CPU | No special requirement (JFrog is a file server) | At least 16 cores recommended |
+| Network | Must be reachable from VM2 on port 8082 | Must be able to reach VM1 on port 8082. **Internet access must be fully disabled before running the EI deployment.** EI will exit with an error if `airgap_enabled=yes` and the machine can still reach the internet. |
+| OS | Ubuntu 22.04 LTS | Ubuntu 22.04 LTS |
+| Access | Root or sudo privileges | Root or sudo privileges |
+
+### Credentials required
+
+Before you start, collect the following. Have all of them ready before running any scripts.
+
+**JFrog License**
+
+A license key is required to activate JFrog. Without it, JFrog will not serve any content.
+If you already have a JFrog license, use that. If not, you can get a free 14-day trial at
+https://jfrog.com/start-free/
+
+1. Click 14-day free trial (not Platform Tour)
+2. Select Self-Hosted
+3. Fill in the registration form and click Confirm and Start
+4. Check your email. JFrog will send you your username, password, and license key within a few minutes.
+5. Copy the license key and keep it somewhere handy. You will need all three when completing the setup wizard in Step 2.
+
+**HuggingFace Token**
+
+Required to download the LLM models. The following models are supported:
+
+| Step | Model | Approximate size |
+|---|---|---|
+| 3i | meta-llama/Llama-3.2-3B-Instruct | ~6.5 GB |
+| 3j | Qwen/Qwen3-0.6B | ~1.2 GB |
+| 3k | Qwen/Qwen3-4B | ~7.6 GB |
+| 3l | Qwen/Qwen3-1.7B | ~3.4 GB |
+
+The Llama model is gated and requires you to accept the license agreement before downloading.
+The Qwen models are open and do not require acceptance.
+
+1. Accept the Llama 3.2 3B license at https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct
+2. Generate a token at https://huggingface.co/settings/tokens and select Read access.
+
+**Docker Hub Credentials**
+
+Required to pull one image (apache/apisix-ingress-controller) that cannot be fetched
+through the JFrog remote repos and must be pulled directly from Docker Hub.
+
+Create a free account at https://hub.docker.com and generate a Personal Access Token at
+https://hub.docker.com/settings/security. Use the token as your password when prompted.
+
+---
+
+## Step 1 - Install JFrog on VM1
+
+VM1 must have internet access. Run the following on VM1.
+
+First, install git:
+
+```bash
+sudo apt install -y git
+```
+
+Clone the repo and check out the airgap branch:
+
+```bash
+git clone https://github.com/cld2labs/Enterprise-Inference.git Enterprise-Inference
+cd Enterprise-Inference
+git checkout cld2labs/airgap
+```
+
+Then run the install script:
+
+```bash
+cd ~/Enterprise-Inference/third_party/Dell/air-gap/jfrog-setup
+chmod +x jfrog-installation.sh
+sudo ./jfrog-installation.sh
+```
+
+> During the install, the package manager may show a package configuration prompt. Press
+> Enter or click OK to accept the defaults and continue.
+
+The script installs these tools: curl, wget, git, jq, skopeo, helm, python3, pip3, ansible.
+
+When the script finishes, JFrog is running at `http://localhost:8082`.
+
+---
+
+## Step 2 - Open the JFrog UI and Complete Setup
+
+Open a browser on VM1 and go to `http://localhost:8082`.
+
+If VM1 does not have a browser, set up an SSH tunnel from your local machine. Open a new
+terminal window (not the one where you are already SSH'd into VM1) and run:
+
+```bash
+ssh -L 8082:localhost:8082 user@<VM1-IP> -N
+```
+
+> Leave that terminal open. Closing it will drop the tunnel and you will lose access to the
+> JFrog UI.
+
+Open `http://localhost:8082` in your local browser.
+
+### First login and setup
+
+When you open JFrog for the first time, it will walk you through a short setup wizard.
+
+**1. Reset the default password**
+
+Log in with the default credentials: admin / password
+
+JFrog will immediately ask you to set a new password. Choose a password and save it. You
+will need it when running `jfrog-setup.sh` in the next step.
+
+**2. Activate the license**
+
+JFrog will ask for a license key. Paste the trial license key from your email and click
+Activate.
+
+> JFrog will not serve any content until the license is activated. Do not skip this step.
+
+**3. Set the base URL**
+
+JFrog will ask for a base URL. Leave this blank and click Skip unless you have a specific
+base URL. This is optional and does not affect the setup.
+
+**4. Configure proxy**
+
+Click Skip. A proxy is only needed if VM1 reaches the internet through a corporate proxy
+server.
+
+**5. Create repositories**
+
+Click Skip. The `jfrog-setup.sh` script will create all required repositories automatically.
+
+Click Finish to complete the wizard.
+
+### Enable anonymous access (required manual step)
+
+> [!IMPORTANT]
+> This step must be done manually in the UI. The JFrog API cannot automate it reliably
+> because it requires a token with a specific audience (`jfac@...`) that is not obtainable
+> via the standard REST API. `jfrog-setup.sh` step 2 will warn about this and continue,
+> but anonymous access will not be active until you complete this step.
+>
+> Without this, VM2 cannot pull Docker images through the containerd mirror.
+
+1. In the JFrog UI, go to **Administration → Security → General**
+2. Turn on **Allow Anonymous Access**
+3. Click **Save**
+
+You can verify it is working by running this on VM1 after the toggle is on:
+
+```bash
+curl -s "http://localhost:8082/v2/token?scope=repository%3Alibrary%2Fnginx%3Apull&service=localhost:8082" \
+  | python3 -c "import sys,json; d=json.load(sys.stdin); print('OK' if d.get('token') else 'FAILED')"
+```
+
+If it prints `OK`, anonymous access is active and VM2 will be able to pull images.
+
+---
+
+## Step 3 - Create Repos, Enable Access, and Upload All Assets
+
+### Run the full setup
+
+Once the license is active, run the command below to start the setup. This will take a
+while, as it downloads and uploads all of the assets listed above.
+
+> [!CAUTION]
+> Run the script as a normal user, not with `sudo`. For example, run `./jfrog-setup.sh`,
+> not `sudo ./jfrog-setup.sh`. Running as root breaks the SSH tunnel and the script will
+> not be able to reach JFrog.
+
+> [!NOTE]
+> During step 3f, the script internally installs apt packages and will prompt for your
+> sudo password. This is expected: enter your system password when prompted to continue.
+
+```bash
+cd ~/Enterprise-Inference/third_party/Dell/air-gap/jfrog-setup
+chmod +x jfrog-setup.sh
+
+# Replace <VM1-IP> and the credential placeholders with your actual values
+./jfrog-setup.sh \
+  --jfrog-url http://<VM1-IP>:8082/artifactory \
+  --jfrog-user admin \
+  --jfrog-pass <jfrog-password> \
+  --dockerhub-user <dockerhub-username> \
+  --dockerhub-pass <dockerhub-PAT> \
+  --hf-token <hf-token>
+```
+
+### All available options
+
+| Flag | Required | Description |
+|---|---|---|
+| `--jfrog-url URL` | Yes | JFrog base URL. The script fails to connect if this is missing or incorrect |
+| `--jfrog-user USER` | Yes | JFrog admin username. The script fails to authenticate if missing |
+| `--jfrog-pass PASS` | Yes | JFrog admin password set during the UI wizard. The script fails to authenticate if missing |
+| `--hf-token TOKEN` | Only for steps 3i, 3j, 3k, 3l | HuggingFace token with read access. Required for Llama-3.2-3B (gated) and the Qwen models. If omitted, steps 3i, 3j, 3k, and 3l are skipped with a warning |
+| `--dockerhub-user USER` | Only for step 3a | Docker Hub username for `apache/apisix-ingress-controller`. If omitted, that image is skipped with a warning and all other images still upload |
+| `--dockerhub-pass PASS` | Only for step 3a | Docker Hub password or Personal Access Token. Required alongside `--dockerhub-user` |
+| `--step STEP` | No | Run only one specific step, e.g. `--step 3a`. Useful for re-running a failed step |
+| `--skip STEP` | No | Skip a specific step. Can be repeated, e.g. `--skip 3i --skip 3j` to skip model uploads |
+| `--workdir DIR` | No | Directory where files are downloaded before uploading to JFrog. Defaults to `/tmp/ei-airgap-upload` |
+| `--dry-run` | No | Prints all commands without executing them. Useful for verifying what the script will do before running |
+
+### Run one step at a time
+
+If you want to run or re-run a specific step instead of the full script, use any of
+these commands:
+
+| Command | What it does |
+|---|---|
+| `./jfrog-setup.sh --step 1` | Creates all JFrog repositories: Docker repos for each upstream registry (Docker Hub, ECR, GHCR, registry.k8s.io, Quay), Helm, PyPI, Debian, and generic repos for binaries and models |
+| `./jfrog-setup.sh --step 2` | Sets anonymous read permission targets on all Docker repos so VM2 can pull images without credentials. **Note**: the "Allow Anonymous Access" toggle in Administration → Security → General must be enabled manually in the UI before running this step — the JFrog API cannot automate that toggle |
+| `./jfrog-setup.sh --step 3a` | Copies ~40 Docker images from upstream registries into JFrog using skopeo. Most images are pulled anonymously. `apache/apisix-ingress-controller:1.8.0` requires `--dockerhub-user` and `--dockerhub-pass` |
+| `./jfrog-setup.sh --step 3b` | Downloads 10 Helm charts (ingress-nginx, langfuse, apisix, keycloak, postgresql, redis, clickhouse, minio, valkey, nri-resource-policy-balloons) and uploads them along with an `index.yaml` that JFrog does not generate automatically |
+| `./jfrog-setup.sh --step 3c` | Downloads ~30 Python packages used by the EI deployment playbooks and uploads them to JFrog so VM2 can install them without internet access |
+| `./jfrog-setup.sh --step 3d` | Uploads the pip installer wheel to JFrog. Required because Ubuntu disables pip by default and VM2 needs it to bootstrap the Python environment |
+| `./jfrog-setup.sh --step 3e` | Downloads 4 Ansible collections used by the EI playbooks and uploads them to JFrog |
+| `./jfrog-setup.sh --step 3f` | Downloads jq and its dependencies as .deb files and uploads them to JFrog. Also pre-caches all Kubespray apt packages (conntrack, socat, nfs-common, python3-pip, unzip, etc.) in JFrog by temporarily routing VM1's apt through JFrog. Uses `apt-get download` to force a network fetch for already-installed packages so JFrog caches them reliably. Prompts for the sudo password during install |
+| `./jfrog-setup.sh --step 3g` | Downloads all binaries Kubespray needs to set up the Kubernetes cluster (kubeadm, kubectl, kubelet, containerd, runc, etcd, calico, cni-plugins, crictl, helm, nerdctl, yq, kubectx, kubens) and uploads them to JFrog |
+| `./jfrog-setup.sh --step 3h` | Packages the Kubespray repository as a tarball and uploads it to JFrog. VM2 uses this instead of cloning from GitHub |
+| `./jfrog-setup.sh --step 3i --hf-token <token>` | Downloads **meta-llama/Llama-3.2-3B-Instruct** (~6.5 GB) from HuggingFace and uploads all files to JFrog. Requires a HuggingFace token with access to the model |
+| `./jfrog-setup.sh --step 3j --hf-token <token>` | Downloads **Qwen/Qwen3-0.6B** (~1.2 GB) from HuggingFace and uploads all files to JFrog |
+| `./jfrog-setup.sh --step 3k --hf-token <token>` | Downloads **Qwen/Qwen3-4B** (~7.6 GB) from HuggingFace and uploads all files to JFrog |
+| `./jfrog-setup.sh --step 3l --hf-token <token>` | Downloads **Qwen/Qwen3-1.7B** (~3.4 GB) from HuggingFace and uploads all files to JFrog |
+| `./jfrog-setup.sh --step 4` | Sets all remote repos to Offline so JFrog serves only cached content and does not fetch anything new from the internet. This enforces the true airgap |
+
+---
+
+## Summary
+
+Once `jfrog-setup.sh` completes successfully, JFrog on VM1 is fully configured and ready
+to serve as the sole package mirror for VM2. The following has been completed:
+
+- All JFrog repositories created (Docker, Helm, PyPI, Debian, generic)
+- Anonymous access enabled so VM2 can pull images without credentials
+- All Docker images, Helm charts, Python packages, binaries, and LLM models uploaded
+- All remote repos set to Offline: JFrog serves only cached content and will not fetch
+  anything new from the internet
+
+VM1 requires no further changes. Proceed to the [Enterprise Inference airgap deployment
+guide](../EI/single-node/air-gap.md) to configure VM2 and run the EI stack.
+
+---
+
+## Troubleshooting
+
+<details>
+<summary>Click to expand</summary>
+
+### Deployment exits with "airgap_enabled is set to yes but this machine has internet connectivity"
+
+This check runs at the start of every EI deployment when `airgap_enabled=yes`. It means
+VM2 can still reach the internet, which defeats the purpose of airgap mode — Docker images
+not cached in JFrog would silently fall back to internet registries.
+
+Disable internet access on VM2 before running the deployment. A common way to do this is
+to drop the default route or use iptables:
+
+```bash
+# Option 1: remove the default route (re-add it later if needed)
+sudo ip route del default
+
+# Option 2: block outbound internet with iptables (allow LAN traffic to VM1)
+sudo iptables -I OUTPUT -d <VM1-IP> -j ACCEPT
+sudo iptables -I OUTPUT -d 0.0.0.0/0 -j DROP
+```
+
+After disabling internet, re-run the EI deployment. The check will pass once no internet
+routes are reachable.
+
+---
+
+### Use skopeo to copy Docker images, not docker
+
+Docker 29.x forces HTTPS even when insecure-registries is configured in
+`/etc/docker/daemon.json`. Use skopeo instead, as it handles HTTP correctly:
+
+```bash
+skopeo copy \
+  --src-tls-verify=false \
+  --dest-tls-verify=false \
+  --dest-creds admin:<jfrog-password> \
+  docker://<source-registry>/<image>:<tag> \
+  docker://<VM1-IP>:8082/ei-docker-local/<image>:<tag>
+```
+
+> Always push to `ei-docker-local`, not `ei-docker-virtual`. Virtual repos reject pushes.
+> Images pushed to `ei-docker-local` are automatically served through `ei-docker-virtual`,
+> since local is a member of virtual.
+
+### Verifying an image is cached
+
+A plain curl request returns 404 even when an image is cached in JFrog. You need to
+include the Docker manifest Accept headers:
+
+```bash
+curl -s -u admin:<jfrog-password> \
+  -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+  -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
+  -o /dev/null -w "%{http_code}" \
+  "http://<VM1-IP>:8082/v2/ei-docker-virtual/library/nginx/manifests/1.25.2-alpine"
+```
+
+A response of 200 means the image is properly cached. Anything else means it is not.
+
+### Very old image tags not available via Docker Hub
+
+Docker Hub no longer serves very old tags (like busybox:1.28) through the v2 API, so
+JFrog cannot proxy them. The workaround is to pull a newer working tag and push it under
+the old tag name:
+
+```bash
+skopeo copy \
+  --dest-tls-verify=false \
+  --dest-creds admin:<jfrog-password> \
+  docker://<VM1-IP>:8082/ei-docker-virtual/library/busybox:latest \
+  docker://<VM1-IP>:8082/ei-docker-local/library/busybox:1.28
+```
+
+### Anonymous access toggle in the UI does not fully work
+
+The "Allow Anonymous Access" toggle in the JFrog UI only sets one of two required flags.
+If VM2 cannot pull images without credentials, patch the config manually:
+
+```bash
+curl -su "admin:<jfrog-password>" \
+  "http://localhost:8082/artifactory/api/system/configuration" > /tmp/jfrog-config.xml
+
+sed -i 's/<enabledForAnonymous>false<\/enabledForAnonymous>/<enabledForAnonymous>true<\/enabledForAnonymous>/' \
+  /tmp/jfrog-config.xml
+
+curl -su "admin:<jfrog-password>" -X POST \
+  "http://localhost:8082/artifactory/api/system/configuration" \
+  -H "Content-Type: application/xml" \
+  --data-binary @/tmp/jfrog-config.xml
+```
+
+> This is handled automatically by step 2 of `jfrog-setup.sh`. Only run this manually if
+> VM2 is unable to pull images without credentials after the full setup.
+
+### Virtual repos cannot be added to permission targets
+
+If you try to add `ei-docker-virtual` to a JFrog permission target, you will get an HTTP
+400 error. Add the individual local and remote repos instead. Step 2 of `jfrog-setup.sh`
+does this correctly.
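+
+To confirm the targets that step 2 created, the security API can list what a permission target covers. A quick check sketch (the `anonymous-docker` name comes from `jfrog-setup.sh`):
+
+```bash
+curl -s -u admin:<jfrog-password> \
+  "http://localhost:8082/artifactory/api/security/permissions/anonymous-docker" \
+  | jq '{repositories, principals}'
+```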
+
+### Helm index.yaml is not generated automatically
+
+JFrog HelmOCI repos do not auto-generate `index.yaml`. After uploading chart tarballs,
+generate and upload the index file manually:
+
+```bash
+mkdir ~/helm-index
+cp *.tgz ~/helm-index
+cd ~/helm-index
+helm repo index . --url http://localhost:8082/artifactory/ei-helm-local
+curl -u admin:<jfrog-password> -T index.yaml \
+  "http://localhost:8082/artifactory/ei-helm-local/index.yaml"
+```
+
+> Step 3b of `jfrog-setup.sh` does this automatically. Only run this manually if you are
+> uploading charts outside of the script.
+
+</details>
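+
+As a final sanity check of the whole Helm path, point a client at the virtual repo and search it. A sketch, where `ei-helm` is just a local alias (any name works):
+
+```bash
+helm repo add ei-helm http://localhost:8082/artifactory/ei-helm-virtual \
+  --username admin --password <jfrog-password> --force-update
+helm repo update
+helm search repo ei-helm
+```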
diff --git a/third_party/Dell/air-gap/jfrog-setup/jfrog-installation.sh b/third_party/Dell/air-gap/jfrog-setup/jfrog-installation.sh
new file mode 100644
index 00000000..61a3863a
--- /dev/null
+++ b/third_party/Dell/air-gap/jfrog-setup/jfrog-installation.sh
@@ -0,0 +1,264 @@
+#!/usr/bin/env bash
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# jfrog-installation.sh
+#
+# Step 1 of 2 for EI airgap VM1 setup.
+# Installs all required tools and JFrog Artifactory on VM1.
+#
+# After this script completes:
+#   1. Open http://<VM1-IP>:8082 in a browser
+#      (SSH tunnel: ssh -L 8082:localhost:8082 user@<VM1-IP> -N then open http://localhost:8082)
+#   2. Log in with admin / password
+#   3. Activate license: Admin → Artifactory License → paste trial key → Save
+#   4. Run jfrog-setup.sh to create repos and upload all EI assets
+#
+# Usage:
+#   sudo ./jfrog-installation.sh [OPTIONS]
+#
+# Options:
+#   --jfrog-port PORT   JFrog HTTP port (default: 8082)
+#   --skip-jfrog        Install tools only, skip JFrog installation
+#   -h, --help          Show this help message
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Defaults
+# ---------------------------------------------------------------------------
+JFROG_PORT="${JFROG_PORT:-8082}"
+SKIP_JFROG=false
+
+# ---------------------------------------------------------------------------
+# Colours
+# ---------------------------------------------------------------------------
+RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m'; CYAN='\033[0;36m'; NC='\033[0m'
+info()    { echo -e "${CYAN}[INFO]${NC} $*"; }
+success() { echo -e "${GREEN}[OK]${NC} $*"; }
+warn()    { echo -e "${YELLOW}[WARN]${NC} $*"; }
+error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }
+section() {
+  echo ""
+  echo -e "${CYAN}══════════════════════════════════════════${NC}"
+  echo -e "${CYAN}  $*${NC}"
+  echo -e "${CYAN}══════════════════════════════════════════${NC}"
+}
+
+# ---------------------------------------------------------------------------
+# Argument parsing
+# ---------------------------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --jfrog-port) JFROG_PORT="$2"; shift 2 ;;
+    --skip-jfrog) SKIP_JFROG=true; shift ;;
+    -h|--help)
+      sed -n '/^# Usage:/,/^[^#]/p' "$0" | grep '^#' | sed 's/^# \?//'
+      exit 0 ;;
+    *) error "Unknown option: $1"; exit 1 ;;
+  esac
+done
+
+if [[ "$EUID" -ne 0 ]]; then
+  error "This script must be run as root: sudo $0 $*"
+  exit 1
+fi
+
+echo ""
+echo "============================================================"
+echo "  VM1 Setup — Prerequisites + JFrog Artifactory"
+echo "  JFrog port: $JFROG_PORT"
+echo "  Skip JFrog: $SKIP_JFROG"
+echo "============================================================"
+
+# ---------------------------------------------------------------------------
+# Step 1 — Install prerequisites
+# ---------------------------------------------------------------------------
+section "Step 1 — Install Prerequisites"
+
+info "Updating package lists..."
+apt-get update -qq
+
+info "Installing required packages..."
+apt-get install -y \
+  curl \
+  wget \
+  git \
+  jq \
+  skopeo \
+  net-tools \
+  ca-certificates \
+  gnupg \
+  lsb-release \
+  unzip \
+  tar \
+  vim \
+  software-properties-common \
+  python3 \
+  python3-pip \
+  ansible
+
+info "Verifying installed tools..."
+for cmd in curl wget git jq skopeo python3 pip3 ansible ansible-galaxy; do
+  if command -v "$cmd" &>/dev/null; then
+    success "$cmd found: $(command -v $cmd)"
+  else
+    warn "$cmd not found after install"
+  fi
+done
+
+# ---------------------------------------------------------------------------
+# Step 2 — Install Helm
+# ---------------------------------------------------------------------------
+section "Step 2 — Install Helm"
+
+if command -v helm &>/dev/null; then
+  success "helm already installed: $(helm version --short 2>/dev/null)"
+else
+  info "Installing helm via official script..."
+  curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+  success "helm installed: $(helm version --short)"
+fi
+
+# ---------------------------------------------------------------------------
+# Step 3 — Fix system limits for JFrog
+# ---------------------------------------------------------------------------
+section "Step 3 — Fix System Limits"
+
+info "Setting fs.inotify.max_user_instances=512..."
+sysctl -w fs.inotify.max_user_instances=512
+
+if grep -q "fs.inotify.max_user_instances" /etc/sysctl.conf; then
+  info "Already in /etc/sysctl.conf"
+else
+  echo "fs.inotify.max_user_instances=512" >> /etc/sysctl.conf
+  info "Added to /etc/sysctl.conf (persists across reboots)"
+fi
+
+sysctl -p
+success "System limits configured"
+
+# ---------------------------------------------------------------------------
+# Step 4 — Download and Install JFrog Artifactory
+# ---------------------------------------------------------------------------
+if $SKIP_JFROG; then
+  warn "Skipping JFrog installation (--skip-jfrog)"
+else
+  section "Step 4 — Download and Install JFrog Artifactory"
+
+  if systemctl is-active --quiet artifactory.service 2>/dev/null; then
+    success "JFrog Artifactory is already running — skipping install"
+  else
+    # Check for a local installer tarball first
+    installer_tgz=$(ls jfrog-platform-trial-prox-*-deb.tar.gz 2>/dev/null | head -1 || true)
+
+    if [[ -n "$installer_tgz" ]]; then
+      info "Using local installer: $installer_tgz"
+    else
+      JFROG_VERSION="7.111.8"
+      info "Downloading JFrog Platform Trial installer v${JFROG_VERSION}..."
+      wget -O jfrog-deb-installer.tar.gz \
+        "https://releases.jfrog.io/artifactory/jfrog-prox/org/artifactory/pro/deb/jfrog-platform-trial-prox/${JFROG_VERSION}/jfrog-platform-trial-prox-${JFROG_VERSION}-deb.tar.gz"
+      installer_tgz="jfrog-deb-installer.tar.gz"
+    fi
+
+    info "Extracting installer..."
+    tar -xzf "$installer_tgz"
+
+    info "Running JFrog installer (this may take a few minutes)..."
+    install_dir=$(ls -d jfrog-platform-trial-pro*/ 2>/dev/null | head -1 || true)
+    if [[ -z "$install_dir" ]]; then
+      error "Could not find extracted JFrog installer directory"
+      exit 1
+    fi
+    bash "${install_dir}install.sh"
+
+    success "JFrog installed"
+  fi
+
+  # ---------------------------------------------------------------------------
+  # Step 5 — Start JFrog services
+  # ---------------------------------------------------------------------------
+  section "Step 5 — Start JFrog Services"
+
+  info "Starting artifactory.service..."
+  systemctl start artifactory.service
+  systemctl enable artifactory.service
+
+  info "Starting xray.service..."
+  systemctl start xray.service || warn "xray failed to start — not required for airgap asset upload"
+
+  # ---------------------------------------------------------------------------
+  # Step 6 — Wait for JFrog to respond
+  # ---------------------------------------------------------------------------
+  section "Step 6 — Wait for JFrog to be Ready"
+
+  jfrog_url="http://localhost:${JFROG_PORT}/artifactory"
+  info "Waiting for JFrog at $jfrog_url (up to 3 minutes)..."
+
+  max_wait=180
+  elapsed=0
+  until curl -sf "$jfrog_url/api/system/ping" 2>/dev/null | grep -q "OK"; do
+    if [[ $elapsed -ge $max_wait ]]; then
+      error "JFrog did not become ready within ${max_wait}s"
+      error "Check status: systemctl status artifactory.service"
+      error "Check logs:   journalctl -u artifactory.service -n 50"
+      exit 1
+    fi
+    echo -n "."
+    sleep 5
+    elapsed=$((elapsed+5))
+  done
+  echo ""
+  success "JFrog is ready at $jfrog_url"
+fi
+
+# ---------------------------------------------------------------------------
+# Summary
+# ---------------------------------------------------------------------------
+echo ""
+echo "============================================================"
+success "VM1 setup complete!"
+echo ""
+echo "Installed tools:"
+for cmd in curl wget git jq skopeo helm python3 pip3 ansible ansible-galaxy; do
+  if command -v "$cmd" &>/dev/null; then
+    echo "  ✓ $cmd"
+  else
+    echo "  ✗ $cmd (missing)"
+  fi
+done
+
+if ! $SKIP_JFROG; then
+  echo ""
+  echo "JFrog Artifactory is running at http://localhost:${JFROG_PORT}"
+  echo "Default credentials: admin / password"
+  echo ""
+  echo "============================================================"
+  echo "  NEXT: Activate the JFrog license before running script 2"
+  echo "============================================================"
+  echo ""
+  echo "  1. Access the JFrog UI:"
+  echo "     - From VM1 directly: http://localhost:${JFROG_PORT}"
+  echo "     - From your local machine via SSH tunnel:"
+  echo "         ssh -L ${JFROG_PORT}:localhost:${JFROG_PORT} user@<VM1-IP> -N"
+  echo "       then open: http://localhost:${JFROG_PORT}"
+  echo ""
+  echo "  2. Log in with: admin / password"
+  echo "     (change password when prompted)"
+  echo ""
+  echo "  3. Activate license:"
+  echo "     Admin → Artifactory License → paste trial key → Save"
+  echo "     (Get a free trial key at https://jfrog.com/start-free/)"
+  echo ""
fi
+
+echo "  Once the license is active, run script 2:"
+echo ""
+echo "    ./jfrog-setup.sh \\"
+echo "      --jfrog-url http://localhost:${JFROG_PORT}/artifactory \\"
+echo "      --jfrog-user admin \\"
+echo "      --jfrog-pass <jfrog-password> \\"
+echo "      --dockerhub-user <dockerhub-username> \\"
+echo "      --dockerhub-pass <dockerhub-PAT>"
+echo "============================================================"
diff --git a/third_party/Dell/air-gap/jfrog-setup/jfrog-setup.sh b/third_party/Dell/air-gap/jfrog-setup/jfrog-setup.sh
new file mode 100644
index 00000000..056e3304
--- /dev/null
+++ b/third_party/Dell/air-gap/jfrog-setup/jfrog-setup.sh
@@ -0,0 +1,1207 @@
+#!/usr/bin/env bash
+# Copyright (C) 2025-2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# jfrog-setup.sh
+#
+# One-shot script that sets up JFrog Artifactory for EI airgapped deployment:
+#   Step 1  - Create all required repositories
+#   Step 2  - Enable anonymous access configuration + set permission targets
+#             Note: Docker API doesn't support true anonymous pulls; VM2 uses credentials
+#   Step 3a - Docker images (via skopeo)
+#   Step 3b - Helm charts
+#   Step 3c - PyPI packages
+#   Step 3d - pip bootstrap wheel
+#   Step 3e - Ansible collections
+#   Step 3f - apt .deb files for jq + pre-cache Kubespray/inference-tools apt packages
+#             (conntrack socat ipset ebtables nfs-common ipvsadm unzip python3-pip)
+#   Step 3g - Kubernetes / Kubespray binaries
+#   Step 3h - Kubespray tarball
+#   Step 3i - Meta-Llama-3.2-3B-Instruct model (optional, requires HuggingFace token)
+#   Step 3j - Qwen/Qwen3-0.6B model (optional, requires HuggingFace token)
+#   Step 3k - Qwen/Qwen3-4B model (optional, requires HuggingFace token)
+#   Step 3l - Qwen/Qwen3-1.7B model (optional, requires HuggingFace token)
+#
+# Run this script on VM1 (internet-connected machine with JFrog installed).
+#
+# Usage:
+#   ./jfrog-setup.sh [OPTIONS]
+#
+# Options:
+#   --jfrog-url URL        JFrog base URL (default: http://localhost:8082/artifactory)
+#   --jfrog-user USER      JFrog username (default: admin)
+#   --jfrog-pass PASS      JFrog password (default: password)
+#   --hf-token TOKEN       HuggingFace token (required for steps 3i, 3j, 3k and 3l)
+#   --dockerhub-user USER  Docker Hub username (required for apisix-ingress-controller)
+#   --dockerhub-pass PASS  Docker Hub password / PAT
+#   --step STEP            Run only a specific step (e.g. --step 1, --step 3a)
+#   --skip STEP            Skip a specific step (repeatable)
+#   --workdir DIR          Working directory for downloads (default: /tmp/ei-airgap-upload)
+#   --dry-run              Print commands without executing them
+#   -h, --help             Show this help message
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Defaults
+# ---------------------------------------------------------------------------
+JFROG_URL="${JFROG_URL:-http://localhost:8082/artifactory}"
+JFROG_USER="${JFROG_USER:-admin}"
+JFROG_PASS="${JFROG_PASS:-password}"
+HF_TOKEN="${HF_TOKEN:-}"
+DOCKERHUB_USER="${DOCKERHUB_USER:-}"
+DOCKERHUB_PASS="${DOCKERHUB_PASS:-}"
+ONLY_STEP=""
+SKIP_STEPS=()
+WORKDIR="/tmp/ei-airgap-upload"
+DRY_RUN=false
+
+# ---------------------------------------------------------------------------
+# Colours
+# ---------------------------------------------------------------------------
+RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m'; CYAN='\033[0;36m'; NC='\033[0m'
+info()     { echo -e "${CYAN}[INFO]${NC} $*"; }
+success()  { echo -e "${GREEN}[OK]${NC} $*"; }
+warn()     { echo -e "${YELLOW}[WARN]${NC} $*"; }
+error()    { echo -e "${RED}[ERROR]${NC} $*" >&2; }
+step_hdr() { echo -e "\n${CYAN}========== $* ==========${NC}"; }
+
+# ---------------------------------------------------------------------------
+# Argument parsing
+# ---------------------------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --jfrog-url)      JFROG_URL="$2"; shift 2 ;;
+    --jfrog-user)     JFROG_USER="$2"; shift 2 ;;
+    --jfrog-pass)     JFROG_PASS="$2"; shift 2 ;;
+    --hf-token)       HF_TOKEN="$2"; shift 2 ;;
+    --dockerhub-user) DOCKERHUB_USER="$2"; shift 2 ;;
+    --dockerhub-pass) DOCKERHUB_PASS="$2"; shift 2 ;;
+    --step)           ONLY_STEP="$2"; shift 2 ;;
+    --skip)           SKIP_STEPS+=("$2"); shift 2 ;;
+    --workdir)        WORKDIR="$2"; shift 2 ;;
+    --dry-run)        DRY_RUN=true; shift ;;
+    -h|--help)
+      sed -n '/^# Usage:/,/^[^#]/p' "$0" | grep '^#' | sed 's/^# \?//'
+      exit 0 ;;
+    *) error "Unknown option: $1"; exit 1 ;;
+  esac
+done
+
+# Derived
+JFROG_CREDS="${JFROG_USER}:${JFROG_PASS}"
+JFROG_HOST="${JFROG_URL#http://}"; JFROG_HOST="${JFROG_HOST#https://}"; JFROG_HOST="${JFROG_HOST%%/*}"
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+run() {
+  if $DRY_RUN; then echo "[DRY-RUN] $*"; else "$@"; fi
+}
+
+should_run() {
+  local s="$1"
+  [[ -z "$ONLY_STEP" || "$ONLY_STEP" == "$s" ]] || return 1
+  for skip in "${SKIP_STEPS[@]}"; do [[ "$skip" == "$s" ]] && return 1; done
+  return 0
+}
+
+create_repo() {
+  local name="$1" payload="$2"
+  info "Creating repo: $name"
+  local http_code resp
+  http_code=$(curl -su "$JFROG_CREDS" -X PUT "$JFROG_URL/api/repositories/$name" \
+    -H "Content-Type: application/json" -d "$payload" \
+    -o /tmp/jfrog_repo_resp.txt -w "%{http_code}")
+  resp=$(cat /tmp/jfrog_repo_resp.txt)
+  if [[ "$http_code" == "200" || "$http_code" == "201" ]]; then
+    success "$name created (HTTP $http_code)"
+  elif echo "$resp" | grep -qi "already exists"; then
+    success "$name already exists — skipping"
+  else
+    error "$name failed (HTTP $http_code): $resp"
+  fi
+}
+
+jfrog_upload() {
+  local file="$1" dest="$2"
+  info "Uploading $(basename "$file") -> $dest"
+  run curl -fsSL -u "$JFROG_CREDS" -T "$file" "$JFROG_URL/$dest"
+}
+
+# Pull an image through a JFrog remote repo (temporarily set Online).
+# This caches:
+#   1. The manifest list with its ORIGINAL digest — required for images that containerd
+#      pulls by digest (e.g. kube-webhook-certgen pre-install hook). skopeo --override-arch
+#      produces a single-arch manifest with a different digest, causing 404.
+#   2. All amd64 blobs (layers + config) — fetched via skopeo copy to a temp dir.
+#      Manifest-only fetches cache the manifest metadata but NOT the blobs; containerd
+#      then fails when it tries to download the config blob (sha256:fcb7...) and gets 404.
+# $1 = JFrog remote repo name (e.g. ei-docker-k8s)
+# $2 = image path without registry prefix (e.g. ingress-nginx/kube-webhook-certgen)
+# $3 = tag (e.g. v1.5.3)
+precache_via_remote() {
+  local remote_repo="$1" image_path="$2" tag="$3"
+  info "Pre-caching $image_path:$tag via $remote_repo remote..."
+
+  # Temporarily set remote Online
+  curl -su "$JFROG_CREDS" -X POST "$JFROG_URL/api/repositories/$remote_repo" \
+    -H "Content-Type: application/json" -d '{"offline":false}' > /dev/null 2>&1
+
+  # Step 1: Fetch manifest list by tag — caches manifest list in JFrog with original digest.
+  # This must happen BEFORE the skopeo copy so the multi-arch manifest list digest is preserved
+  # (skopeo --override-arch only stores a single-arch manifest, not the list).
+  local http_code
+  http_code=$(curl -s -u "$JFROG_CREDS" \
+    -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.v2+json" \
+    -o /dev/null -w "%{http_code}" \
+    "${JFROG_URL%/artifactory}/v2/$remote_repo/$image_path/manifests/$tag")
+
+  # Step 2: Pull amd64 image blobs through JFrog remote — forces JFrog to fetch and cache
+  # all blob content (config + layers). Manifest-only fetches do NOT cache blobs.
+  # skopeo pulls from the JFrog remote (which proxies to upstream) and caches blobs in JFrog.
+  local tmpdir
+  tmpdir=$(mktemp -d)
+  skopeo copy \
+    --src-tls-verify=false \
+    --src-creds "$JFROG_CREDS" \
+    --override-arch amd64 --override-os linux \
+    "docker://${JFROG_HOST}/${remote_repo}/${image_path}:${tag}" \
+    "dir:${tmpdir}" 2>&1 | sed 's/^/    /' || warn "skopeo blob pull returned non-zero for $image_path:$tag — blobs may be partially cached"
+  rm -rf "$tmpdir"
+
+  # Set back to Offline
+  curl -su "$JFROG_CREDS" -X POST "$JFROG_URL/api/repositories/$remote_repo" \
+    -H "Content-Type: application/json" -d '{"offline":true}' > /dev/null 2>&1
+
+  if [[ "$http_code" == "200" ]]; then
+    success "$image_path:$tag cached (manifest list + amd64 blobs)"
+  else
+    warn "$image_path:$tag — manifest list HTTP $http_code from $remote_repo"
+  fi
+}
+
+check_prereqs() {
+  local missing=()
+  for cmd in curl skopeo helm pip3 ansible-galaxy git python3; do
+    command -v "$cmd" &>/dev/null || missing+=("$cmd")
+  done
+
+  if [[ ${#missing[@]} -eq 0 ]]; then
+    success "All prerequisites installed"
+    return 0
+  fi
+
+  error "Missing required tools: ${missing[*]}"
+  error "Run jfrog-installation.sh first to install all prerequisites:"
+  error "  sudo ./jfrog-installation.sh"
+  exit 1
+}
+
+# ---------------------------------------------------------------------------
+# Step 1 — Create Repositories
+# ---------------------------------------------------------------------------
+step_1() {
+  step_hdr "Step 1 - Create JFrog Repositories"
+
+  info "Checking JFrog connectivity..."
+  if ! curl -su "$JFROG_CREDS" "$JFROG_URL/api/system/ping" | grep -q "OK"; then
+    error "Cannot reach JFrog at $JFROG_URL — check URL, credentials and that Artifactory is running"
+    exit 1
+  fi
+  success "JFrog reachable"
+
+  echo "── Docker Repositories ──────────────────────────────────────"
+  create_repo "ei-docker-local" \
+    '{"rclass":"local","packageType":"docker"}'
+  create_repo "ei-docker-dockerhub" \
+    '{"rclass":"remote","packageType":"docker","url":"https://registry-1.docker.io"}'
+  create_repo "ei-docker-ecr" \
+    '{"rclass":"remote","packageType":"docker","url":"https://public.ecr.aws"}'
+  create_repo "ei-docker-ghcr" \
+    '{"rclass":"remote","packageType":"docker","url":"https://ghcr.io"}'
+  create_repo "ei-docker-k8s" \
+    '{"rclass":"remote","packageType":"docker","url":"https://registry.k8s.io"}'
+  create_repo "ei-docker-quay" \
+    '{"rclass":"remote","packageType":"docker","url":"https://quay.io"}'
+  create_repo "ei-docker-virtual" \
+    '{"rclass":"virtual","packageType":"docker","repositories":["ei-docker-local","ei-docker-dockerhub","ei-docker-ecr","ei-docker-ghcr","ei-docker-k8s","ei-docker-quay"]}'
+
+  echo "── Helm Repositories ────────────────────────────────────────"
+  create_repo "ei-helm-local" \
+    '{"rclass":"local","packageType":"helmoci"}'
+  create_repo "ei-helm-ingress-nginx" \
+    '{"rclass":"remote","packageType":"helmoci","url":"https://kubernetes.github.io/ingress-nginx"}'
+  create_repo "ei-helm-langfuse" \
+    '{"rclass":"remote","packageType":"helmoci","url":"https://langfuse.github.io/langfuse-k8s"}'
+  create_repo "ei-helm-virtual" \
+    '{"rclass":"virtual","packageType":"helmoci","repositories":["ei-helm-local","ei-helm-ingress-nginx","ei-helm-langfuse"]}'
+
+  echo "── PyPI Repositories ────────────────────────────────────────"
+  create_repo "ei-pypi-local" \
+    '{"rclass":"local","packageType":"pypi"}'
+  create_repo "ei-pypi-remote" \
+    '{"rclass":"remote","packageType":"pypi","url":"https://pypi.org"}'
+  create_repo "ei-pypi-virtual" \
+    '{"rclass":"virtual","packageType":"pypi","repositories":["ei-pypi-local","ei-pypi-remote"]}'
+
+  echo "── Debian Repositories ──────────────────────────────────────"
+  create_repo "ei-debian-ubuntu" \
+    '{"rclass":"remote","packageType":"debian","url":"http://archive.ubuntu.com/ubuntu"}'
+  create_repo "ei-debian-virtual" \
+    '{"rclass":"virtual","packageType":"debian","repositories":["ei-debian-ubuntu"]}'
+
+  echo "── HuggingFace Repositories ─────────────────────────────────"
+  create_repo "ei-hf-remote" \
+    '{"rclass":"remote","packageType":"huggingfaceml","url":"https://huggingface.co"}'
+
+  echo "── Generic Repositories ─────────────────────────────────────"
+  create_repo "ei-generic-binaries" \
+    '{"rclass":"local","packageType":"generic"}'
+  create_repo "ei-generic-models" \
+    '{"rclass":"local","packageType":"generic"}'
+
+  success "Step 1 complete — all repositories created"
+}
+
+# ---------------------------------------------------------------------------
+# Step 2 — Enable Anonymous Access + Permissions
+# ---------------------------------------------------------------------------
+step_2() {
+  step_hdr "Step 2 - Enable Anonymous Access"
+
+  # JFrog 7.x (7.38+): anonymous access is stored in the Access microservice DB.
+  # The legacy XML config field (enabledForAnonymous) and access.config.yml are both
+  # ignored once the Access service is initialised. The only reliable way is the
+  # Access REST API, which requires a Bearer token (not Basic auth).
+  info "Getting admin Bearer token (scope=member-of-groups:*) ..."
+    local bearer_token access_http
+    bearer_token=$(curl -su "$JFROG_CREDS" -X POST \
+        "$JFROG_URL/api/security/token" \
+        -d "username=${JFROG_USER}&scope=member-of-groups:*&expires_in=3600" \
+        | python3 -c "import sys,json; print(json.load(sys.stdin).get('access_token',''))" 2>/dev/null || true)
+
+    if [[ -n "$bearer_token" ]]; then
+        info "Enabling anonymous access via Access API (/access/api/v1/config) ..."
+        access_http=$(curl -s -X PATCH \
+            "http://${JFROG_HOST}/access/api/v1/config" \
+            -H "Authorization: Bearer $bearer_token" \
+            -H "Content-Type: application/json" \
+            -d '{"security":{"allow-anonymous-access":true}}' \
+            -o /tmp/jfrog-access-resp.txt -w "%{http_code}")
+        if [[ "$access_http" == "200" || "$access_http" == "201" || "$access_http" == "204" ]]; then
+            success "Anonymous access enabled via Access API (HTTP $access_http)"
+        else
+            # The JFrog 7.x Access API requires a token with audience jfac@... (not jfrt@...);
+            # member-of-groups:* tokens are scoped to the Artifactory service and are rejected.
+            # When that happens, fall back to the JFrog UI or the jf CLI.
+            warn "Access API returned HTTP $access_http (token audience mismatch — expected jfac@...)"
+            warn "Enable anonymous access manually:"
+            warn "  Browser: http://${JFROG_HOST}/ui → Admin → Security → Settings → Allow Anonymous Access → ON"
+            warn "  OR: jf config add --url http://${JFROG_HOST} --user ${JFROG_USER} --password ${JFROG_PASS} --interactive=false"
+            warn "      jf rt curl -X PATCH /access/api/v1/config -H 'Content-Type: application/json' -d '{\"security\":{\"allow-anonymous-access\":true}}'"
+        fi
+    else
+        warn "Could not obtain Bearer token."
+        warn "Enable anonymous access manually:"
+        warn "  Browser: http://${JFROG_HOST}/ui → Admin → Security → Settings → Allow Anonymous Access → ON"
+    fi
+
+    # Verify the Artifactory API is reachable anonymously (baseline check)
+    info "Verifying Artifactory-level anonymous access ..."
+    local api_code
+    api_code=$(curl -s -o /dev/null -w "%{http_code}" "$JFROG_URL/api/storage/ei-docker-local")
+    if [[ "$api_code" == "200" ]]; then
+        success "Artifactory API anonymous access OK"
+    else
+        warn "Artifactory API returned HTTP $api_code for anonymous request"
+        warn "Re-run: ./jfrog-setup.sh --step 2"
+    fi
+
+    # Set anonymous read permissions on all Docker repos.
+    # Note: virtual repos cannot be added to permission targets (JFrog returns 400),
+    # so the member repos are listed individually in the payload below. Two targets
+    # are created:
+    #   anonymous-docker — grants anonymous read on all docker repos (image pulls)
+    #   anonymous-user   — required for /v2/token to return 200 for anonymous Bearer
+    #                      token requests; without this containerd gets 401 on token
+    #                      fetch even when enabledForAnonymous=true
+    local perm_name perm_http perm_resp
+    for perm_name in anonymous-docker anonymous-user; do
+        info "Setting permission target: $perm_name ..."
+ python3 -c " +import json +perm = { + 'name': '${perm_name}', + 'includesPattern': '**', + 'excludesPattern': '', + 'repositories': ['ei-docker-local','ei-docker-dockerhub','ei-docker-ecr','ei-docker-ghcr','ei-docker-k8s','ei-docker-quay','ANY REMOTE'], + 'principals': {'users': {'anonymous': ['r']}} +} +print(json.dumps(perm)) +" > /tmp/jfrog-perm.json + perm_http=$(curl -su "$JFROG_CREDS" -X PUT \ + "$JFROG_URL/api/security/permissions/$perm_name" \ + -H "Content-Type: application/json" \ + -d @/tmp/jfrog-perm.json \ + -o /tmp/jfrog-perm-resp.txt -w "%{http_code}") + perm_resp=$(cat /tmp/jfrog-perm-resp.txt) + if [[ "$perm_http" == "200" || "$perm_http" == "201" ]]; then + success "$perm_name permissions set (HTTP $perm_http)" + else + error "$perm_name permission PUT returned HTTP $perm_http: $perm_resp" + error "Anonymous Docker pulls will NOT work until this is fixed." + return 1 + fi + done + + # Verify 1: token endpoint responds 200 for anonymous requests + # Verify the full two-step Docker V2 auth flow (mirrors exactly what containerd does): + # Step 1: GET /v2/token anonymously → should return 200 with a token + # Step 2: GET manifest with Bearer token → should return 200 + # The bare manifest request returning 401 is the normal auth challenge, not an error. + info "Verifying anonymous token endpoint (/v2/token) ..." + local token_resp token_code anon_token + token_resp=$(curl -s \ + "http://${JFROG_HOST}/v2/token?scope=repository%3Alibrary%2Fnginx%3Apull&service=${JFROG_HOST}" \ + -w "\n%{http_code}") + token_code=$(echo "$token_resp" | tail -1) + anon_token=$(echo "$token_resp" | head -1 | python3 -c "import sys,json; print(json.load(sys.stdin).get('token',''))" 2>/dev/null || true) + + if [[ "$token_code" == "200" ]]; then + success "Anonymous token endpoint OK (HTTP 200)" + else + warn "Anonymous token endpoint returned HTTP $token_code" + warn "Enable anonymous access in JFrog UI: http://${JFROG_HOST}/ui" + warn " Admin → Security → Settings → Allow Anonymous Access → ON" + fi + + # Step 2: use the anonymous token to fetch the manifest + if [[ -n "$anon_token" ]]; then + info "Verifying end-to-end anonymous pull flow (token → manifest) ..." + local flow_code + flow_code=$(curl -s -o /dev/null -w "%{http_code}" \ + -H "Authorization: Bearer $anon_token" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + "http://${JFROG_HOST}/v2/ei-docker-virtual/library/nginx/manifests/1.25.2-alpine") + if [[ "$flow_code" == "200" ]]; then + success "End-to-end anonymous pull flow OK — containerd mirror pulls will work" + else + warn "Manifest with anonymous token returned HTTP $flow_code" + warn "Permission targets may not be applied yet — check anonymous-docker and anonymous-user targets in JFrog UI" + fi + fi + + success "Step 2 complete — anonymous access enabled" +} + +# --------------------------------------------------------------------------- +# Step 3a — Docker Images (via skopeo) +# --------------------------------------------------------------------------- +step_3a() { + step_hdr "3a - Docker Images" + local dest_repo="ei-docker-local" + local -a skopeo_dest_flags=(--dest-tls-verify=false --dest-creds "$JFROG_CREDS") + # Copy only linux/amd64 manifest — skips attestation/in-toto layers that + # older skopeo versions cannot handle when using --all. 
+ local -a skopeo_base=(--src-tls-verify=false --override-arch amd64 --override-os linux) + + # Format: "source_image|dest_path_in_ei-docker-local" + # busybox:1.28 is no longer available via Docker Hub v2 API — copy latest and push as 1.28 + # pause:3.9 is correct for k8s v1.30.4 (3.10 is for k8s 1.31+) + local images=( + # ── ECR ────────────────────────────────────────────────────────────────── + "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.10.2|q9t5s3a7/vllm-cpu-release-repo:v0.10.2" + "public.ecr.aws/bitnami/minio:2024.11.7-debian-12-r0|bitnami/minio:2024.11.7-debian-12-r0" + # minio-client (mc) is bundled inside the minio server image — no separate image needed + + # ── GHCR ───────────────────────────────────────────────────────────────── + "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu|huggingface/text-generation-inference:2.4.0-intel-cpu" + "ghcr.io/huggingface/text-embeddings-inference:cpu-1.7|huggingface/text-embeddings-inference:cpu-1.7" + "ghcr.io/berriai/litellm-non_root:main-v1.75.8-stable|berriai/litellm-non_root:main-v1.75.8-stable" + "ghcr.io/containers/nri-plugins/nri-resource-policy-balloons:v0.12.2|containers/nri-plugins/nri-resource-policy-balloons:v0.12.2" + "ghcr.io/containers/nri-plugins/nri-config-manager:v0.12.2|containers/nri-plugins/nri-config-manager:v0.12.2" + + # ── Docker Hub ──────────────────────────────────────────────────────────── + "docker.io/langfuse/langfuse:3.106.1|langfuse/langfuse:3.106.1" + "docker.io/langfuse/langfuse-worker:3.106.1|langfuse/langfuse-worker:3.106.1" + "docker.io/bitnamilegacy/keycloak:25.0.2-debian-12-r2|bitnamilegacy/keycloak:25.0.2-debian-12-r2" + "docker.io/bitnamilegacy/postgresql:16.3.0-debian-12-r23|bitnamilegacy/postgresql:16.3.0-debian-12-r23" + "docker.io/bitnamilegacy/postgresql:17.5.0-debian-12-r0|bitnamilegacy/postgresql:17.5.0-debian-12-r0" + "docker.io/bitnamilegacy/redis:8.0.1-debian-12-r0|bitnamilegacy/redis:8.0.1-debian-12-r0" + "docker.io/bitnamilegacy/clickhouse:25.2.1-debian-12-r0|bitnamilegacy/clickhouse:25.2.1-debian-12-r0" + "docker.io/bitnamilegacy/valkey:8.0.2-debian-12-r2|bitnamilegacy/valkey:8.0.2-debian-12-r2" + "docker.io/bitnamilegacy/zookeeper:3.9.3-debian-12-r8|bitnamilegacy/zookeeper:3.9.3-debian-12-r8" + "docker.io/bitnamilegacy/os-shell:12-debian-12-r48|bitnamilegacy/os-shell:12-debian-12-r48" + "docker.io/bitnamilegacy/etcd:3.5.10-debian-11-r2|bitnamilegacy/etcd:3.5.10-debian-11-r2" + "docker.io/apache/apisix:3.9.1-debian|apache/apisix:3.9.1-debian" + "docker.io/kubernetesui/dashboard:v2.7.0|kubernetesui/dashboard:v2.7.0" + "docker.io/kubernetesui/metrics-scraper:v1.0.8|kubernetesui/metrics-scraper:v1.0.8" + "docker.io/library/nginx:1.25.2-alpine|library/nginx:1.25.2-alpine" + "docker.io/library/ubuntu:22.04|library/ubuntu:22.04" + "docker.io/library/registry:2.8.1|library/registry:2.8.1" + "docker.io/openvino/model_server:2025.4|openvino/model_server:2025.4" + "docker.io/rancher/local-path-provisioner:v0.0.24|rancher/local-path-provisioner:v0.0.24" + "docker.io/library/busybox:latest|library/busybox:1.28" # 1.28 manifest no longer in Hub v2 API — copy latest, push as 1.28 + "docker.io/library/busybox:latest|library/busybox:latest" # local-path provisioner helper pod uses busybox:latest + "docker.io/library/busybox:latest|library/busybox:1.36" # genai-gateway init container uses busybox:1.36 + "docker.io/curlimages/curl:latest|curlimages/curl:latest" # model registration job + + # ── registry.k8s.io ─────────────────────────────────────────────────────── + # Dest path 
must NOT include registry.k8s.io/ prefix. + # containerd mirror with override_path=true strips the registry hostname and + # appends only the image path, so the request arrives as: + # /v2/ei-docker-virtual/coredns/coredns/manifests/v1.11.3 (no prefix) + # JFrog remote repos (ei-docker-k8s) also store images without the registry prefix. + "registry.k8s.io/ingress-nginx/controller:v1.12.2|ingress-nginx/controller:v1.12.2" + # kube-webhook-certgen is handled via precache_via_remote below (skopeo --all fails on in-toto attestation layers) + # "registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.3|ingress-nginx/kube-webhook-certgen:v1.5.3" + "registry.k8s.io/pause:3.9|pause:3.9" + "registry.k8s.io/pause:3.10|pause:3.10" + "registry.k8s.io/etcd:3.5.12-0|etcd:3.5.12-0" + "registry.k8s.io/kube-apiserver:v1.30.4|kube-apiserver:v1.30.4" + "registry.k8s.io/kube-controller-manager:v1.30.4|kube-controller-manager:v1.30.4" + "registry.k8s.io/kube-scheduler:v1.30.4|kube-scheduler:v1.30.4" + "registry.k8s.io/kube-proxy:v1.30.4|kube-proxy:v1.30.4" + "registry.k8s.io/coredns/coredns:v1.11.1|coredns/coredns:v1.11.1" + "registry.k8s.io/coredns/coredns:v1.11.3|coredns/coredns:v1.11.3" + "registry.k8s.io/dns/k8s-dns-node-cache:1.22.28|dns/k8s-dns-node-cache:1.22.28" + "registry.k8s.io/cpa/cluster-proportional-autoscaler:v1.8.8|cpa/cluster-proportional-autoscaler:v1.8.8" + + # ── quay.io ─────────────────────────────────────────────────────────────── + "quay.io/calico/node:v3.28.1|calico/node:v3.28.1" + "quay.io/calico/cni:v3.28.1|calico/cni:v3.28.1" + "quay.io/calico/kube-controllers:v3.28.1|calico/kube-controllers:v3.28.1" + "quay.io/calico/pod2daemon-flexvol:v3.28.1|calico/pod2daemon-flexvol:v3.28.1" + "quay.io/calico/node:v3.29.1|calico/node:v3.29.1" + "quay.io/calico/cni:v3.29.1|calico/cni:v3.29.1" + "quay.io/calico/kube-controllers:v3.29.1|calico/kube-controllers:v3.29.1" + "quay.io/calico/pod2daemon-flexvol:v3.29.1|calico/pod2daemon-flexvol:v3.29.1" + ) + + local copied=0 failed=0 fail_list=() + for entry in "${images[@]}"; do + local src="${entry%%|*}" + local dest_path="${entry##*|}" + info "Copying $src -> $dest_repo/$dest_path" + + # Skip if manifest already exists in JFrog — avoids Docker Hub rate limits on re-runs. + # Extract image name and tag from dest_path (e.g. 
"library/nginx:1.25.2-alpine") + local dest_image="${dest_path%:*}" dest_tag="${dest_path##*:}" + local existing_code + existing_code=$(curl -s -u "$JFROG_CREDS" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ + -o /dev/null -w "%{http_code}" \ + "http://${JFROG_HOST}/v2/${dest_repo}/${dest_image}/manifests/${dest_tag}") + if [[ "$existing_code" == "200" ]]; then + info "Already in JFrog — skipping: $dest_repo/$dest_path" + copied=$((copied+1)) + continue + fi + + local -a src_cred_flags=() + if [[ "$src" == docker.io/* ]] && [[ -n "$DOCKERHUB_USER" && -n "$DOCKERHUB_PASS" ]]; then + src_cred_flags+=(--src-creds "$DOCKERHUB_USER:$DOCKERHUB_PASS") + fi + + if run skopeo copy "${skopeo_base[@]}" "${src_cred_flags[@]}" "${skopeo_dest_flags[@]}" \ + "docker://$src" "docker://$JFROG_HOST/$dest_repo/$dest_path"; then + copied=$((copied+1)) + else + warn "Failed: $src" + failed=$((failed+1)) + fail_list+=("$src") + fi + done + + # apisix-ingress-controller requires Docker Hub credentials (rate-limited / auth required) + if [[ -n "$DOCKERHUB_USER" && -n "$DOCKERHUB_PASS" ]]; then + local apisix_ic_code + apisix_ic_code=$(curl -s -u "$JFROG_CREDS" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + -o /dev/null -w "%{http_code}" \ + "http://${JFROG_HOST}/v2/${dest_repo}/apache/apisix-ingress-controller/manifests/1.8.0") + if [[ "$apisix_ic_code" == "200" ]]; then + info "Already in JFrog — skipping: $dest_repo/apache/apisix-ingress-controller:1.8.0" + copied=$((copied+1)) + else + info "Copying apache/apisix-ingress-controller:1.8.0 from Docker Hub..." + if run skopeo copy "${skopeo_base[@]}" \ + --src-creds "$DOCKERHUB_USER:$DOCKERHUB_PASS" \ + "${skopeo_dest_flags[@]}" \ + "docker://docker.io/apache/apisix-ingress-controller:1.8.0" \ + "docker://$JFROG_HOST/$dest_repo/apache/apisix-ingress-controller:1.8.0"; then + copied=$((copied+1)) + else + warn "Failed: apisix-ingress-controller:1.8.0" + failed=$((failed+1)) + fi + fi + else + warn "Skipping apisix-ingress-controller:1.8.0 — pass --dockerhub-user and --dockerhub-pass" + fi + + success "3a complete: copied=$copied failed=$failed" + if [[ $failed -gt 0 ]]; then + warn "Failed images:"; for img in "${fail_list[@]}"; do warn " $img"; done + fi + + # kube-webhook-certgen must be cached via the remote repo (not skopeo) because the + # ingress-nginx chart pulls it by manifest-list digest (sha256:2cf4...). skopeo with + # --override-arch produces a single-arch manifest with a different digest, causing 404. + # precache_via_remote fetches by tag through JFrog's remote, which caches the original + # multi-arch manifest list with its original digest intact. + precache_via_remote "ei-docker-k8s" "ingress-nginx/kube-webhook-certgen" "v1.5.3" + + # Verify nginx is properly cached — must use Docker Accept headers; plain curl returns 404 even if cached + info "Verifying nginx:1.25.2-alpine manifest is accessible in JFrog..." 
+    local http_code
+    http_code=$(curl -s -u "$JFROG_CREDS" \
+        -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+        -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \
+        -o /dev/null -w "%{http_code}" \
+        "${JFROG_URL%/artifactory}/v2/ei-docker-virtual/library/nginx/manifests/1.25.2-alpine")
+    if [[ "$http_code" == "200" ]]; then
+        success "nginx:1.25.2-alpine verified in JFrog (HTTP $http_code)"
+    else
+        warn "nginx manifest check returned HTTP $http_code — expected 200; image may not be cached correctly"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# Step 3b — Helm Charts
+# ---------------------------------------------------------------------------
+step_3b() {
+    step_hdr "3b - Helm Charts"
+    local helmdir="$WORKDIR/helm-charts"
+    mkdir -p "$helmdir"
+    cd "$helmdir"
+
+    run helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
+    run helm repo add langfuse https://langfuse.github.io/langfuse-k8s
+    run helm repo add apisix https://charts.apiseven.com
+    run helm repo add nri-plugins https://containers.github.io/nri-plugins
+    run helm repo update
+
+    run helm pull ingress-nginx/ingress-nginx --version 4.12.2 --destination .
+    run helm pull langfuse/langfuse --version 1.5.1 --destination .
+    run helm pull apisix/apisix --version 2.8.1 --destination .
+    run helm pull nri-plugins/nri-resource-policy-balloons --version v0.12.2 --destination .
+
+    run helm pull oci://registry-1.docker.io/bitnamicharts/keycloak --version 22.1.0 --destination .
+    run helm pull oci://registry-1.docker.io/bitnamicharts/postgresql --version 16.7.4 --destination .
+    run helm pull oci://registry-1.docker.io/bitnamicharts/redis --version 21.1.3 --destination .
+    run helm pull oci://registry-1.docker.io/bitnamicharts/clickhouse --version 8.0.5 --destination .
+    run helm pull oci://registry-1.docker.io/bitnamicharts/minio --version 14.10.5 --destination .
+    run helm pull oci://registry-1.docker.io/bitnamicharts/valkey --version 2.2.4 --destination .
+
+    for chart in *.tgz; do
+        [[ -f "$chart" ]] || continue
+        jfrog_upload "$chart" "ei-helm-local/$chart"
+    done
+
+    # Generate and upload index.yaml — JFrog HelmOCI repos do not auto-generate it.
+    # IMPORTANT: index.yaml URLs must use the externally-accessible IP (not localhost),
+    # because VM2 downloads charts using these URLs. If JFROG_URL contains localhost/127.0.0.1,
+    # helm on VM2 will fail with "connection refused" when trying to download chart tarballs.
+    # Always run this script with --jfrog-url http://<VM1_IP>:8082/artifactory.
+    if echo "$JFROG_URL" | grep -qE "localhost|127\.0\.0\.1"; then
+        error "JFROG_URL is '$JFROG_URL' — index.yaml would have localhost URLs that VM2 cannot reach."
+        error "Re-run with: --jfrog-url http://<VM1_IP>:8082/artifactory"
+        return 1
+    fi
+    run helm repo index . --url "$JFROG_URL/ei-helm-local"
--url "$JFROG_URL/ei-helm-local" + jfrog_upload "index.yaml" "ei-helm-local/index.yaml" + + success "3b complete" + cd - >/dev/null +} + +# --------------------------------------------------------------------------- +# Step 3c — PyPI Packages +# --------------------------------------------------------------------------- +step_3c() { + step_hdr "3c - PyPI Packages" + local wheelsdir="$WORKDIR/wheels" + mkdir -p "$wheelsdir" + + run pip3 download \ + ansible==9.13.0 ansible-core==2.16.18 \ + jinja2 jmespath==1.0.1 jsonschema==4.23.0 jsonschema-specifications \ + netaddr==1.3.0 kubernetes==35.0.0 pyyaml==6.0.3 \ + cryptography==44.0.0 requests oauthlib requests-oauthlib urllib3 \ + certifi charset-normalizer idna packaging typing-extensions \ + six python-dateutil attrs rpds-py referencing resolvelib \ + durationpy websocket-client cffi pycparser markupsafe \ + -d "$wheelsdir" + + # Download cryptography 46.x separately — cannot mix with 44.x in one pip download call + run pip3 download cryptography==46.0.7 -d "$wheelsdir" + + for pkg in "$wheelsdir"/*.whl "$wheelsdir"/*.tar.gz; do + [[ -f "$pkg" ]] || continue + jfrog_upload "$pkg" "ei-pypi-local/$(basename "$pkg")" + done + + success "3c complete" +} + +# --------------------------------------------------------------------------- +# Step 3d — pip Bootstrap Wheel +# --------------------------------------------------------------------------- +step_3d() { + step_hdr "3d - pip Bootstrap Wheel" + local pipdir="$WORKDIR/pip-dl" + mkdir -p "$pipdir" + + run pip3 download pip --no-deps -d "$pipdir" + + local whl + whl=$(ls "$pipdir"/pip-*.whl 2>/dev/null | head -1) + if [[ -z "$whl" ]]; then + error "pip wheel not found in $pipdir" + return 1 + fi + + # Uploaded as generic 'pip.whl' — deployment script reads version from WHEEL metadata inside the zip + jfrog_upload "$whl" "ei-generic-binaries/pip.whl" + success "3d complete" +} + +# --------------------------------------------------------------------------- +# Step 3e — Ansible Collections +# --------------------------------------------------------------------------- +step_3e() { + step_hdr "3e - Ansible Collections" + local colldir="$WORKDIR/ansible-collections" + mkdir -p "$colldir" + + run ansible-galaxy collection download \ + kubernetes.core:6.3.0 \ + community.general:12.5.0 \ + ansible.posix \ + -p "$colldir" + + # setup-env.sh looks for --latest.tar.gz + local kube_core_tgz community_general_tgz ansible_posix_tgz + kube_core_tgz=$(ls "$colldir"/kubernetes-core-*.tar.gz 2>/dev/null | head -1) + community_general_tgz=$(ls "$colldir"/community-general-*.tar.gz 2>/dev/null | head -1) + ansible_posix_tgz=$(ls "$colldir"/ansible-posix-*.tar.gz 2>/dev/null | head -1) + + if [[ -n "$kube_core_tgz" ]]; then + # Upload both versioned name (matches JFrog listing) and -latest (what setup-env.sh looks for) + jfrog_upload "$kube_core_tgz" "ei-generic-binaries/ansible-collections/kubernetes-core-6.3.0.tar.gz" + jfrog_upload "$kube_core_tgz" "ei-generic-binaries/ansible-collections/kubernetes-core-latest.tar.gz" + else + warn "kubernetes.core tarball not found — skipping" + fi + + if [[ -n "$community_general_tgz" ]]; then + jfrog_upload "$community_general_tgz" "ei-generic-binaries/ansible-collections/community-general-12.5.0.tar.gz" + jfrog_upload "$community_general_tgz" "ei-generic-binaries/ansible-collections/community-general-latest.tar.gz" + else + warn "community.general tarball not found — skipping" + fi + + if [[ -n "$ansible_posix_tgz" ]]; then + jfrog_upload "$ansible_posix_tgz" 
"ei-generic-binaries/ansible-collections/ansible-posix-latest.tar.gz" + else + warn "ansible.posix tarball not found — skipping" + fi + + # community.kubernetes is the legacy name — upload same tarball as community-kubernetes-2.0.1 + run ansible-galaxy collection download community.kubernetes:2.0.1 -p "$colldir" || true + local community_kubernetes_tgz + community_kubernetes_tgz=$(ls "$colldir"/community-kubernetes-*.tar.gz 2>/dev/null | head -1) + if [[ -n "$community_kubernetes_tgz" ]]; then + jfrog_upload "$community_kubernetes_tgz" "ei-generic-binaries/ansible-collections/community-kubernetes-2.0.1.tar.gz" + else + warn "community.kubernetes tarball not found — skipping" + fi + + success "3e complete" +} + +# --------------------------------------------------------------------------- +# Step 3f — apt .deb Files +# Part 1: Download jq .deb files and upload to ei-generic-binaries/apt-debs/ +# (installed via dpkg on VM2 by the inference-tools role) +# Part 2: Pre-cache all required apt packages in JFrog by routing VM1 apt +# through ei-debian-virtual so JFrog fetches and caches each package +# and its dependencies before going Offline. +# Includes: conntrack socat ipset ebtables nfs-common ipvsadm unzip +# python3-pip (required by inference-tools role on VM2) +# --------------------------------------------------------------------------- +step_3f() { + step_hdr "3f - apt .deb Files" + local debdir="$WORKDIR/apt-debs" + mkdir -p "$debdir" + + # ── Part 1: jq via dpkg path ───────────────────────────────────────────── + # apt-get download fails if the exact installed version is no longer in the + # configured apt sources (e.g. sources.list was modified by a previous run + # or the version was removed from the mirror). Run apt-get update first, + # then download; if it still fails, warn and skip — the debs can be + # uploaded manually to ei-generic-binaries/apt-debs/ later. + info "Downloading jq, libjq1, libonig5..." + cd "$debdir" + sudo apt-get update -qq 2>/dev/null || true + if ! run sudo apt-get download jq libjq1 libonig5; then + warn "apt-get download for jq/libjq1/libonig5 failed — debs not uploaded to JFrog" + warn "Upload them manually: sudo apt-get download jq libjq1 libonig5 && curl -u admin:password -T http://:8082/artifactory/ei-generic-binaries/apt-debs/" + fi + for deb in *.deb; do + [[ -f "$deb" ]] || continue + jfrog_upload "$deb" "ei-generic-binaries/apt-debs/$deb" + done + cd - >/dev/null + + # ── Part 2: Kubespray apt packages via JFrog Debian remote ─────────────── + # Kubespray kubernetes/preinstall requires these packages on VM2. + # VM2 apt is pointed at JFrog in airgap mode, so JFrog must have them + # cached before going Offline. Route VM1 apt through JFrog here to + # trigger fetching and caching of each package and its dependencies. + info "Pre-caching Kubespray apt packages in JFrog..." 
+ + local http_code + http_code=$(curl -su "$JFROG_CREDS" -X POST \ + "$JFROG_URL/api/repositories/ei-debian-ubuntu" \ + -H "Content-Type: application/json" \ + -d '{"offline":false}' \ + -o /dev/null -w "%{http_code}") + if [[ "$http_code" != "200" ]]; then + warn "Could not set ei-debian-ubuntu Online (HTTP $http_code) — skipping Kubespray apt pre-cache" + success "3f complete (jq packages only)" + return 0 + fi + info "ei-debian-ubuntu set to Online" + + local jfrog_src="http://${JFROG_CREDS}@${JFROG_HOST}/artifactory/ei-debian-virtual" + local jfrog_list="/etc/apt/sources.list.d/jfrog-precache.list" + + # Write sources file using two separate tee calls to avoid shell line-wrap issues + echo "deb [trusted=yes] $jfrog_src jammy main restricted universe multiverse" \ + | sudo tee "$jfrog_list" > /dev/null + echo "deb [trusted=yes] $jfrog_src jammy-updates main restricted universe multiverse" \ + | sudo tee -a "$jfrog_list" > /dev/null + + # Disable default Ubuntu sources so apt only talks to JFrog + sudo mv /etc/apt/sources.list /etc/apt/sources.list.bak + + local precache_ok=true + if run sudo apt-get update; then + # Clear any locally cached .deb files so apt must fetch from JFrog + # (if the package is already in /var/cache/apt/archives, apt skips the + # download entirely and JFrog never sees the request) + sudo rm -f /var/cache/apt/archives/conntrack*.deb \ + /var/cache/apt/archives/socat*.deb \ + /var/cache/apt/archives/ipset*.deb \ + /var/cache/apt/archives/ebtables*.deb \ + /var/cache/apt/archives/nfs-common*.deb \ + /var/cache/apt/archives/apt-transport-https*.deb \ + /var/cache/apt/archives/ipvsadm*.deb + + run sudo apt-get install --download-only -y \ + conntrack socat ipset ebtables nfs-common apt-transport-https ipvsadm \ + || { warn "Some packages may not have been cached"; precache_ok=false; } + + # apt-get download always fetches from the configured sources regardless of + # whether the package is already installed — unlike `install --reinstall` + # which can use apt's in-memory state and skip the network fetch entirely. 
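+        # (Illustrative; the cache-path layout can vary by Artifactory version) a spot
+        # check that a package really landed in the remote repo's cache:
+        #   curl -su "$JFROG_CREDS" \
+        #     "$JFROG_URL/api/storage/ei-debian-ubuntu-cache/pool/main/s/socat"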
+ local pip_tmpdir + pip_tmpdir=$(mktemp -d) + cd "$pip_tmpdir" + run apt-get download python3-pip \ + || { warn "python3-pip may not have been cached"; precache_ok=false; } + run apt-get download unzip \ + || { warn "unzip may not have been cached"; precache_ok=false; } + cd - >/dev/null + rm -rf "$pip_tmpdir" + else + warn "apt-get update through JFrog failed — Kubespray packages may not be cached" + precache_ok=false + fi + + sudo mv /etc/apt/sources.list.bak /etc/apt/sources.list + sudo rm -f "$jfrog_list" + + # Set ei-debian-ubuntu back to Offline + curl -su "$JFROG_CREDS" -X POST \ + "$JFROG_URL/api/repositories/ei-debian-ubuntu" \ + -H "Content-Type: application/json" \ + -d '{"offline":true}' > /dev/null 2>&1 + info "ei-debian-ubuntu set back to Offline" + + if $precache_ok; then + success "3f complete — jq debs uploaded, Kubespray apt packages cached in JFrog" + else + warn "3f finished with warnings — some apt packages may be missing from JFrog cache" + fi +} + +# --------------------------------------------------------------------------- +# Step 3g — Kubernetes / Kubespray Binaries +# --------------------------------------------------------------------------- +step_3g() { + step_hdr "3g - Kubernetes Binaries" + local bindir="$WORKDIR/k8s-binaries" + mkdir -p "$bindir" + cd "$bindir" + + for bin in kubeadm kubectl kubelet; do + run curl -fsSLO "https://dl.k8s.io/release/v1.30.4/bin/linux/amd64/$bin" + jfrog_upload "$bin" "ei-generic-binaries/dl.k8s.io/release/v1.30.4/bin/linux/amd64/$bin" + done + + run curl -fsSLO "https://github.com/containernetworking/plugins/releases/download/v1.4.0/cni-plugins-linux-amd64-v1.4.0.tgz" + jfrog_upload "cni-plugins-linux-amd64-v1.4.0.tgz" \ + "ei-generic-binaries/github.com/containernetworking/plugins/releases/download/v1.4.0/cni-plugins-linux-amd64-v1.4.0.tgz" + + run curl -fsSLO "https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.30.1/crictl-v1.30.1-linux-amd64.tar.gz" + jfrog_upload "crictl-v1.30.1-linux-amd64.tar.gz" \ + "ei-generic-binaries/github.com/kubernetes-sigs/cri-tools/releases/download/v1.30.1/crictl-v1.30.1-linux-amd64.tar.gz" + + run curl -fsSLO "https://github.com/etcd-io/etcd/releases/download/v3.5.16/etcd-v3.5.16-linux-amd64.tar.gz" + jfrog_upload "etcd-v3.5.16-linux-amd64.tar.gz" \ + "ei-generic-binaries/github.com/etcd-io/etcd/releases/download/v3.5.16/etcd-v3.5.16-linux-amd64.tar.gz" + + # v3.28.1 — used by kubespray v2.27.0 / k8s v1.30.4 + run curl -fsSL -o "calicoctl-linux-amd64-v3.28.1" \ + "https://github.com/projectcalico/calico/releases/download/v3.28.1/calicoctl-linux-amd64" + jfrog_upload "calicoctl-linux-amd64-v3.28.1" \ + "ei-generic-binaries/github.com/projectcalico/calico/releases/download/v3.28.1/calicoctl-linux-amd64" + + run curl -fsSL -o "calico-v3.28.1.tar.gz" "https://github.com/projectcalico/calico/archive/v3.28.1.tar.gz" + jfrog_upload "calico-v3.28.1.tar.gz" \ + "ei-generic-binaries/github.com/projectcalico/calico/archive/v3.28.1.tar.gz" + + # v3.29.1 — newer version (pre-cache for future use) + run curl -fsSLO "https://github.com/projectcalico/calico/releases/download/v3.29.1/calicoctl-linux-amd64" + jfrog_upload "calicoctl-linux-amd64" \ + "ei-generic-binaries/github.com/projectcalico/calico/releases/download/v3.29.1/calicoctl-linux-amd64" + + run curl -fsSL -o "calico-v3.29.1.tar.gz" "https://github.com/projectcalico/calico/archive/v3.29.1.tar.gz" + jfrog_upload "calico-v3.29.1.tar.gz" \ + "ei-generic-binaries/github.com/projectcalico/calico/archive/v3.29.1.tar.gz" + + run curl -fsSLO 
"https://github.com/containerd/containerd/releases/download/v1.7.24/containerd-1.7.24-linux-amd64.tar.gz" + jfrog_upload "containerd-1.7.24-linux-amd64.tar.gz" \ + "ei-generic-binaries/github.com/containerd/containerd/releases/download/v1.7.24/containerd-1.7.24-linux-amd64.tar.gz" + + run curl -fsSLO "https://github.com/containerd/nerdctl/releases/download/v1.7.7/nerdctl-1.7.7-linux-amd64.tar.gz" + jfrog_upload "nerdctl-1.7.7-linux-amd64.tar.gz" \ + "ei-generic-binaries/github.com/containerd/nerdctl/releases/download/v1.7.7/nerdctl-1.7.7-linux-amd64.tar.gz" + + run curl -fsSLO "https://github.com/opencontainers/runc/releases/download/v1.2.3/runc.amd64" + jfrog_upload "runc.amd64" \ + "ei-generic-binaries/github.com/opencontainers/runc/releases/download/v1.2.3/runc.amd64" + + run curl -fsSLO "https://get.helm.sh/helm-v3.15.4-linux-amd64.tar.gz" + jfrog_upload "helm-v3.15.4-linux-amd64.tar.gz" \ + "ei-generic-binaries/get.helm.sh/helm-v3.15.4-linux-amd64.tar.gz" + # Also upload the bare helm binary (extracted from tarball) + run tar -xzf "helm-v3.15.4-linux-amd64.tar.gz" "linux-amd64/helm" + run mv "linux-amd64/helm" "helm" + jfrog_upload "helm" "ei-generic-binaries/helm" + + run curl -fsSL -o "get-pip.py" "https://bootstrap.pypa.io/get-pip.py" + jfrog_upload "get-pip.py" "ei-generic-binaries/get-pip.py" + + # kubectl is already uploaded under dl.k8s.io path — also upload as bare binary at root + jfrog_upload "kubectl" "ei-generic-binaries/kubectl" + + # yq + run curl -fsSL -o "yq" \ + "https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64" + run chmod +x yq + jfrog_upload "yq" "ei-generic-binaries/yq" + + # kubectx + kubens + run curl -fsSL -o "kubectx" \ + "https://github.com/ahmetb/kubectx/releases/download/v0.9.5/kubectx" + run chmod +x kubectx + jfrog_upload "kubectx" "ei-generic-binaries/kubectx" + + run curl -fsSL -o "kubens" \ + "https://github.com/ahmetb/kubectx/releases/download/v0.9.5/kubens" + run chmod +x kubens + jfrog_upload "kubens" "ei-generic-binaries/kubens" + + success "3g complete" + cd - >/dev/null +} + +# --------------------------------------------------------------------------- +# Helper — upload a HuggingFace model to JFrog one file at a time +# $1 = HuggingFace repo ID (e.g. meta-llama/Llama-3.1-8B-Instruct) +# $2 = JFrog destination folder name under ei-generic-models/ +# $3 = local working directory +# --------------------------------------------------------------------------- +upload_hf_model() { + local hf_repo="$1" + local jfrog_folder="$2" + local modeldir="$3" + + mkdir -p "$modeldir" + run pip3 install -q huggingface_hub + + # Get the list of all files in the model repo without downloading anything + info "Fetching file list for $hf_repo..." 
+    local file_list
+    file_list=$(python3 - <<EOF
+from huggingface_hub import HfApi
+
+# List every file in the model repo via the HF Hub API (metadata only, no download)
+api = HfApi(token="${HF_TOKEN}")
+for f in api.list_repo_files("${hf_repo}"):
+    print(f)
+EOF
+    )
+
+    # Download → upload → delete one file at a time, so multi-GB models never need
+    # more local disk than a single shard.
+    local hf_file
+    for hf_file in $file_list; do
+        info "  $hf_file"
+        run python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='${hf_repo}', filename='${hf_file}', local_dir='${modeldir}', token='${HF_TOKEN}')"
+        jfrog_upload "$modeldir/$hf_file" "ei-generic-models/$jfrog_folder/$hf_file"
+        rm -f "$modeldir/$hf_file"
+    done
+
+    success "$hf_repo uploaded to ei-generic-models/$jfrog_folder"
+}
+
+# ---------------------------------------------------------------------------
+# Helper: raise the Artifactory file upload size limit so multi-GB model
+# files are accepted (fileUploadMaxSizeMb=0 disables the limit)
+# ---------------------------------------------------------------------------
+set_jfrog_upload_limit_unlimited() {
+    info "Setting JFrog file upload limit to unlimited..."
+    local cfg_tmp
+    cfg_tmp=$(mktemp)
+    curl -su "$JFROG_CREDS" "$JFROG_URL/api/system/configuration" > "$cfg_tmp"
+    if grep -q "fileUploadMaxSizeMb" "$cfg_tmp"; then
+        sed -i 's|<fileUploadMaxSizeMb>[0-9]*</fileUploadMaxSizeMb>|<fileUploadMaxSizeMb>0</fileUploadMaxSizeMb>|' "$cfg_tmp"
+        local http_code
+        http_code=$(curl -su "$JFROG_CREDS" -X POST \
+            "$JFROG_URL/api/system/configuration" \
+            -H "Content-Type: application/xml" \
+            --data-binary @"$cfg_tmp" \
+            -o /dev/null -w "%{http_code}")
+        if [[ "$http_code" == "200" ]]; then
+            success "File upload limit set to unlimited"
+        else
+            warn "Could not update file upload limit (HTTP $http_code) -- large files may fail"
+        fi
+    else
+        warn "fileUploadMaxSizeMb not found in config -- skipping limit patch"
+    fi
+    rm -f "$cfg_tmp"
+}
+
+# ---------------------------------------------------------------------------
+# Step 3k — Qwen3-4B (optional)
+# ---------------------------------------------------------------------------
+step_3k() {
+    step_hdr "3k - LLM Model: Qwen/Qwen3-4B"
+
+    if [[ -z "$HF_TOKEN" ]]; then
+        warn "Skipping 3k: --hf-token not provided"
+        warn "Re-run with: --step 3k --hf-token hf_..."
+        return 0
+    fi
+
+    set_jfrog_upload_limit_unlimited
+    upload_hf_model \
+        "Qwen/Qwen3-4B" \
+        "Qwen3-4B" \
+        "$WORKDIR/Qwen3-4B"
+
+    success "3k complete"
+}
+
+# ---------------------------------------------------------------------------
+# Step 3l — Qwen3-1.7B (optional)
+# ---------------------------------------------------------------------------
+step_3l() {
+    step_hdr "3l - LLM Model: Qwen/Qwen3-1.7B"
+
+    if [[ -z "$HF_TOKEN" ]]; then
+        warn "Skipping 3l: --hf-token not provided"
+        warn "Re-run with: --step 3l --hf-token hf_..."
+        return 0
+    fi
+
+    set_jfrog_upload_limit_unlimited
+    upload_hf_model \
+        "Qwen/Qwen3-1.7B" \
+        "Qwen3-1.7B" \
+        "$WORKDIR/Qwen3-1.7B"
+
+    success "3l complete"
+}
+
+# ---------------------------------------------------------------------------
+# Step 3j — Qwen3-0.6B (optional)
+# ---------------------------------------------------------------------------
+step_3j() {
+    step_hdr "3j - LLM Model: Qwen/Qwen3-0.6B"
+
+    if [[ -z "$HF_TOKEN" ]]; then
+        warn "Skipping 3j: --hf-token not provided"
+        warn "Re-run with: --step 3j --hf-token hf_..."
+        return 0
+    fi
+
+    set_jfrog_upload_limit_unlimited
+    upload_hf_model \
+        "Qwen/Qwen3-0.6B" \
+        "Qwen3-0.6B" \
+        "$WORKDIR/Qwen3-0.6B"
+
+    success "3j complete"
+}
+
+# ---------------------------------------------------------------------------
+# Step 3i — Meta-Llama-3.2-3B-Instruct (optional)
+# ---------------------------------------------------------------------------
+step_3i() {
+    step_hdr "3i - LLM Model: Meta-Llama-3.2-3B-Instruct"
+
+    if [[ -z "$HF_TOKEN" ]]; then
+        warn "Skipping 3i: --hf-token not provided"
+        warn "Re-run with: --step 3i --hf-token hf_..."
+        return 0
+    fi
+
+    set_jfrog_upload_limit_unlimited
+    upload_hf_model \
+        "meta-llama/Llama-3.2-3B-Instruct" \
+        "Meta-Llama-3.2-3B-Instruct" \
+        "$WORKDIR/Llama-3.2-3B-Instruct"
+
+    success "3i complete"
+}
+
+# ---------------------------------------------------------------------------
+# Step 3h — Kubespray Tarball
+# ---------------------------------------------------------------------------
+step_3h() {
+    step_hdr "3h - Kubespray Tarball"
+    local kubedir="$WORKDIR/kubespray-build"
+    mkdir -p "$kubedir"
+    cd "$kubedir"
+
+    if [[ ! 
-d "kubespray" ]]; then + run git clone https://github.com/kubernetes-sigs/kubespray + fi + run git -C kubespray fetch --tags + run git -C kubespray checkout v2.27.0 + run tar -czf kubespray.tar.gz kubespray/ + jfrog_upload "kubespray.tar.gz" "ei-generic-binaries/kubespray.tar.gz" + + success "3h complete" + cd - >/dev/null +} + +# --------------------------------------------------------------------------- +# Step 4 — Set Remote Repos to Offline +# --------------------------------------------------------------------------- +step_4() { + step_hdr "4 - Set Remote Repos to Offline" + + local remote_repos=( + ei-docker-dockerhub + ei-docker-ecr + ei-docker-ghcr + ei-docker-k8s + ei-docker-quay + ei-pypi-remote + ei-debian-ubuntu + ei-helm-ingress-nginx + ei-helm-langfuse + ei-hf-remote + ) + + for repo in "${remote_repos[@]}"; do + info "Setting $repo to Offline..." + local http_code + http_code=$(curl -su "$JFROG_CREDS" -X POST \ + "$JFROG_URL/api/repositories/$repo" \ + -H "Content-Type: application/json" \ + -d '{"offline":true}' \ + -o /dev/null -w "%{http_code}") + if [[ "$http_code" == "200" ]]; then + success "$repo set to Offline" + else + warn "$repo — unexpected HTTP $http_code (may already be Offline or not exist)" + fi + done + + success "Step 4 complete — all remote repos set to Offline" +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +echo "" +echo "============================================================" +echo " EI Airgap — JFrog Full Setup" +echo " JFrog: $JFROG_URL" +echo " Workdir: $WORKDIR" +echo " Dry-run: $DRY_RUN" +echo " Only step: ${ONLY_STEP:-all}" +echo " Skip steps: ${SKIP_STEPS[*]:-none}" +echo "============================================================" +echo "" + +if ! $DRY_RUN; then + check_prereqs + mkdir -p "$WORKDIR" +fi + +should_run "1" && step_1 +should_run "2" && step_2 +should_run "3a" && step_3a +should_run "3b" && step_3b +should_run "3c" && step_3c +should_run "3d" && step_3d +should_run "3e" && step_3e +should_run "3f" && step_3f +should_run "3g" && step_3g +should_run "3h" && step_3h +should_run "3i" && step_3i +should_run "3j" && step_3j +should_run "3k" && step_3k +should_run "3l" && step_3l + +should_run "4" && step_4 + +echo "" +success "JFrog setup is complete. Proceed with EI deployment on VM2." diff --git a/third_party/Dell/ubuntu-22.04/iac/terraform.tfvars b/third_party/Dell/ubuntu-22.04/iac/terraform.tfvars index 730cb8cf..cd72341b 100644 --- a/third_party/Dell/ubuntu-22.04/iac/terraform.tfvars +++ b/third_party/Dell/ubuntu-22.04/iac/terraform.tfvars @@ -1,6 +1,6 @@ -idrac_endpoint = "replace endpoint" -idrac_user = "replace idrac username" -idrac_password = "replace idrac password" +idrac_endpoint = "https://100.67.153.13" +idrac_user = "root" +idrac_password = "calvin" idrac_ssl_insecure = true -ubuntu_username = "provide username for ubunutu machine" -ubuntu_password = "provide password for ubuntu machine" \ No newline at end of file +ubuntu_username = "user" +ubuntu_password = "Linux123!" \ No newline at end of file