Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/helm-charts/genai-gateway/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
spec:
initContainers:
- name: wait-for-postgres-redis
image: busybox:1.36
image: docker.io/library/busybox:1.28
command:
- /bin/sh
- -c
Expand Down
2 changes: 1 addition & 1 deletion core/helm-charts/genai-gateway/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ replicaCount: 1
image:
repository: ghcr.io/berriai/litellm-non_root
tag: main-v1.75.8-stable
pullPolicy: Always
pullPolicy: IfNotPresent
imagePullSecrets: []
service:
type: LoadBalancer
Expand Down
5 changes: 5 additions & 0 deletions core/helm-charts/vllm/xeon-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ accelDevice: ""
# CPU Balloon configuration for NRI resource policy
cpu_balloon_annotation: ""

# Override tensor parallelism to 1 for Xeon — NRI balloon CPU allocation creates
# asymmetric NUMA splits (85 vs 84 physical cores) when TP=2, causing PyTorch
# shm assertion failure: ptr->thread_num == thread_num
tensor_parallel_size: "1"


resources:
requests:
Expand Down
69 changes: 69 additions & 0 deletions core/inventory/metadata/offline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Offline (airgap) overrides: binaries and container images are served from a
# JFrog Artifactory instance instead of the public internet.
#
# JFROG_HOST is a literal placeholder in this file; see the mirrors section
# below for how it is replaced.
# NOTE(review): files_repo also contains JFROG_HOST — confirm the same
# substitution step covers it.
files_repo: "http://JFROG_HOST:8082/artifactory/ei-generic-binaries"

# Pinned component versions. Quoted so they always load as strings.
kube_version: "v1.30.4"
crictl_version: "v1.30.1"
etcd_version: "v3.5.16"
runc_version: "v1.2.3"
# No leading "v" here: containerd_download_url adds it where needed.
containerd_version: "1.7.24"

# Download URLs, all rooted at files_repo and mirroring the upstream path layout.
# image_arch, ansible_system, cni_version, calico_ctl_version, helm_version and
# nerdctl_version are referenced here but not defined in this file
# (presumably supplied by Kubespray defaults/facts — confirm).
kubeadm_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubeadm"
kubectl_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubectl"
kubelet_download_url: "{{ files_repo }}/dl.k8s.io/release/{{ kube_version }}/bin/linux/{{ image_arch }}/kubelet"
cni_download_url: "{{ files_repo }}/github.com/containernetworking/plugins/releases/download/{{ cni_version }}/cni-plugins-linux-{{ image_arch }}-{{ cni_version }}.tgz"
crictl_download_url: "{{ files_repo }}/github.com/kubernetes-sigs/cri-tools/releases/download/{{ crictl_version }}/crictl-{{ crictl_version }}-{{ ansible_system | lower }}-{{ image_arch }}.tar.gz"
etcd_download_url: "{{ files_repo }}/github.com/etcd-io/etcd/releases/download/{{ etcd_version }}/etcd-{{ etcd_version }}-linux-{{ image_arch }}.tar.gz"
calicoctl_download_url: "{{ files_repo }}/github.com/projectcalico/calico/releases/download/{{ calico_ctl_version }}/calicoctl-linux-{{ image_arch }}"
calico_crds_download_url: "{{ files_repo }}/github.com/projectcalico/calico/archive/{{ calico_version }}.tar.gz"
helm_download_url: "{{ files_repo }}/get.helm.sh/helm-{{ helm_version }}-linux-{{ image_arch }}.tar.gz"
containerd_download_url: "{{ files_repo }}/github.com/containerd/containerd/releases/download/v{{ containerd_version }}/containerd-{{ containerd_version }}-linux-{{ image_arch }}.tar.gz"
runc_download_url: "{{ files_repo }}/github.com/opencontainers/runc/releases/download/{{ runc_version }}/runc.{{ image_arch }}"
nerdctl_download_url: "{{ files_repo }}/github.com/containerd/nerdctl/releases/download/v{{ nerdctl_version }}/nerdctl-{{ nerdctl_version }}-linux-{{ image_arch }}.tar.gz"

## Pin Calico to version validated in JFrog airgap cache
calico_version: "v3.28.1"

## Pin CoreDNS to version validated in JFrog airgap cache
coredns_version: "v1.11.1"

# JFrog registry mirrors — Kubespray writes these into /etc/containerd/certs.d
# on every cluster node during cluster.yml. JFROG_HOST is substituted with the
# actual JFrog IP by setup-env.sh before Kubespray runs.
#
# Every upstream registry is redirected to the same JFrog repository
# (ei-docker-virtual). override_path is true because the mirror host already
# carries the full /v2/<repo> path.
# NOTE(review): no Authorization header is configured for these mirrors —
# confirm the JFrog repository allows anonymous pulls or that credentials are
# injected elsewhere.
containerd_registries_mirrors:
  - registry: "docker.io"
    prefix: "docker.io"
    mirrors:
      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
        capabilities:
          - pull
          - resolve
        override_path: true
  - registry: "ghcr.io"
    prefix: "ghcr.io"
    mirrors:
      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
        capabilities:
          - pull
          - resolve
        override_path: true
  - registry: "registry.k8s.io"
    prefix: "registry.k8s.io"
    mirrors:
      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
        capabilities:
          - pull
          - resolve
        override_path: true
  - registry: "quay.io"
    prefix: "quay.io"
    mirrors:
      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
        capabilities:
          - pull
          - resolve
        override_path: true
  - registry: "public.ecr.aws"
    prefix: "public.ecr.aws"
    mirrors:
      - host: "http://JFROG_HOST:8082/v2/ei-docker-virtual"
        capabilities:
          - pull
          - resolve
        override_path: true
15 changes: 14 additions & 1 deletion core/inventory/metadata/vars/inference_common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,17 @@ helm_charts_base: "{{ lookup('env', 'PWD') }}/helm-charts"
# Script and chart locations, resolved relative to the directory the playbook
# is launched from (PWD).
remote_home_dir: "{{ lookup('env', 'PWD') }}/scripts"
remote_helm_charts_base: "/tmp/helm-charts"
# Honour ANSIBLE_PYTHON_INTERPRETER when set, otherwise use the system python3.
ansible_python_interpreter: "{{ lookup('env', 'ANSIBLE_PYTHON_INTERPRETER') or '/usr/bin/python3' }}"
remote_home_scripts_dir: "{{ lookup('env', 'PWD') }}/scripts"

# ---------------------------------------------------------------------------
# Airgap — Helm repository URLs
# airgap_enabled, jfrog_url, jfrog_username, jfrog_password are sourced from
# inference-config.cfg and passed in via --extra-vars by the shell layer.
#   When airgap_enabled=true  → routes to JFrog ei-helm-virtual on VM1.
#   When airgap_enabled=false → uses original upstream URLs (internet).
# Every repo applies default(false) so a playbook run without the airgap
# extra-vars falls back to the upstream URL instead of failing on an
# undefined variable. jfrog_url is only evaluated on the airgap branch.
# ---------------------------------------------------------------------------
helm_repo_ingress_nginx: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://kubernetes.github.io/ingress-nginx' }}"
helm_repo_langfuse: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://langfuse.github.io/langfuse-k8s' }}"
helm_repo_apisix: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://charts.apiseven.com' }}"
helm_repo_nri_plugins: "{{ jfrog_url + '/ei-helm-virtual' if airgap_enabled | default(false) | bool else 'https://containers.github.io/nri-plugins' }}"
# jfrog_url with the scheme and any path removed (host[:port] only).
helm_oci_jfrog_host: "{{ jfrog_url | regex_replace('^https?://', '') | regex_replace('/.*$', '') }}"
2 changes: 1 addition & 1 deletion core/lib/cluster/config/cluster-config-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ deploy_cluster_config_playbook() {
tags=""
fi

ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-cluster-config.yml --become --become-user=root --extra-vars "brownfield_deployment=${brownfield_deployment} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file}" --tags "$tags"
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-cluster-config.yml --become --become-user=root --extra-vars "brownfield_deployment=${brownfield_deployment} secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" --tags "$tags"
}
2 changes: 1 addition & 1 deletion core/lib/cluster/config/label-nodes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@

# Label the Kubernetes nodes by running playbooks/label-nodes.yml once.
#
# Globals read: INVENTORY_PATH, airgap_enabled, jfrog_url, jfrog_username,
#               jfrog_password
# Returns:      exit status of ansible-playbook
run_label_nodes_playbook() {
    echo "Running the label-nodes.yml playbook to label Kubernetes nodes..."
    # Single invocation: the airgap/JFrog settings are always forwarded so the
    # playbook can pick the right chart/image sources.
    ansible-playbook -i "${INVENTORY_PATH}" playbooks/label-nodes.yml \
        --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
}
2 changes: 1 addition & 1 deletion core/lib/cluster/deployment/cluster-purge.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ run_reset_playbook() {
uninstall_ceph_cluster
fi

ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml --extra-vars "delete_pv_on_purge=${delete_pv_on_purge}"
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml --extra-vars "delete_pv_on_purge=${delete_pv_on_purge} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root reset.yml -e "confirm_reset=yes reset_nodes=false"
# Check the exit status of the Ansible playbook command
if [ $? -eq 0 ]; then
Expand Down
45 changes: 43 additions & 2 deletions core/lib/cluster/deployment/fresh-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,40 @@ fresh_installation() {

if [[ "$deploy_kubernetes_fresh" == "yes" ]]; then
echo "Starting fresh installation of Intel AI for Enterprise Inference Cluster..."
if [[ "$airgap_enabled" == "yes" ]]; then
echo "Airgap mode: fixing containerd mirrors and purging any stale image blobs before Kubernetes install..."
local _b64 _jfrog_host
_jfrog_host=$(echo "$jfrog_url" | sed 's|https\?://||' | sed 's|/.*||')
_b64=$(echo -n "${jfrog_username}:${jfrog_password}" | base64 -w 0)
for _reg in docker.io ghcr.io registry.k8s.io quay.io public.ecr.aws; do
sudo mkdir -p /etc/containerd/certs.d/$_reg
sudo tee /etc/containerd/certs.d/$_reg/hosts.toml > /dev/null <<EOF
server = "https://$_reg"
[host."http://${_jfrog_host}/v2/ei-docker-virtual"]
capabilities = ["pull", "resolve"]
override_path = true
[host."http://${_jfrog_host}/v2/ei-docker-virtual".header]
Authorization = ["Basic $_b64"]
EOF
done
# Purge any HTML blobs cached from failed prior pulls (containerd corruption loop)
for _img in docker.io/library/nginx:1.25.2-alpine; do
sudo crictl rmi "$_img" 2>/dev/null; true
sudo ctr -n k8s.io images rm "$_img" 2>/dev/null; true
done
sudo find /var/lib/containerd/io.containerd.content.v1.content/blobs/sha256 \
-size +100k -newer /etc/containerd/config.toml \
-exec sh -c 'file "$1" | grep -q "HTML" && sudo rm -f "$1"' _ {} \; 2>/dev/null; true
sudo systemctl restart containerd
echo "Containerd mirrors configured and restarted."
fi
install_kubernetes "$@"
if [[ "$airgap_enabled" == "yes" ]]; then
echo "Patching local-path-config to use busybox:1.28 (airgap mode)..."
kubectl patch configmap local-path-config -n local-path-storage --type merge -p \
'{"data":{"helperPod.yaml":"apiVersion: v1\nkind: Pod\nmetadata:\n name: helper-pod\nspec:\n containers:\n - name: helper-pod\n image: \"docker.io/library/busybox:1.28\"\n imagePullPolicy: IfNotPresent"}}' \
2>/dev/null || true
fi
else
echo "Skipping Kubernetes installation..."
fi
Expand Down Expand Up @@ -137,7 +170,11 @@ fresh_installation() {
--extra-vars "cluster_url=${cluster_url} \
cert_file=${cert_file} \
key_file=${key_file} \
kubernetes_platform=${kubernetes_platform}" \
kubernetes_platform=${kubernetes_platform} \
airgap_enabled=${airgap_enabled} \
jfrog_url=${jfrog_url} \
jfrog_username=${jfrog_username} \
jfrog_password=${jfrog_password}" \
--vault-password-file "$vault_pass_file"
if [ $? -eq 0 ]; then
echo "Agentic AI Plugin deployed successfully."
Expand Down Expand Up @@ -230,5 +267,9 @@ fresh_installation() {

# Run cluster.yml to set up the Kubernetes cluster.
#
# In airgap mode (airgap_enabled == "yes") the JFrog credentials are forwarded
# as extra vars; otherwise the playbook runs with no extra vars.
#
# Globals read: INVENTORY_PATH, airgap_enabled, jfrog_username, jfrog_password
# Returns:      exit status of ansible-playbook
run_fresh_install_playbook() {
    echo "Running the cluster.yml playbook to set up the Kubernetes cluster..."

    # Build the argument list as an array rather than a string passed to
    # `eval`: eval re-parses its input, so a password or inventory path
    # containing spaces, quotes, `$`, backticks or `;` would be re-split or
    # executed by the shell.
    local -a playbook_args=(-i "${INVENTORY_PATH}" --become --become-user=root cluster.yml)

    if [[ "$airgap_enabled" == "yes" ]]; then
        playbook_args+=(--extra-vars "airgap_enabled=true jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}")
    fi

    ansible-playbook "${playbook_args[@]}"
}
4 changes: 2 additions & 2 deletions core/lib/cluster/drv-fw-update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ update_drivers() {
invoke_prereq_workflows
echo "${YELLOW}Updating drivers...${NC}"
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-gaudi-firmware-driver.yml \
--extra-vars "update_type=drivers"
--extra-vars "update_type=drivers airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
echo "${GREEN}Drivers updated successfully!${NC}"
}

Expand All @@ -54,7 +54,7 @@ update_firmware() {
invoke_prereq_workflows
echo "${YELLOW}Updating firmware...${NC}"
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-gaudi-firmware-driver.yml \
--extra-vars "update_type=firmware"
--extra-vars "update_type=firmware airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
echo "${GREEN}Firmware updated successfully!${NC}"
}

Expand Down
3 changes: 2 additions & 1 deletion core/lib/cluster/nodes/add-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ add_inference_nodes_playbook() {

invoke_prereq_workflows "$@"

ansible-playbook -i "${INVENTORY_PATH}" playbooks/cluster.yml --become --become-user=root
ansible-playbook -i "${INVENTORY_PATH}" playbooks/cluster.yml --become --become-user=root \
--extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"

}

Expand Down
2 changes: 1 addition & 1 deletion core/lib/components/genai-gateway-controller.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
# Deploy the GenAI Gateway service by running playbooks/deploy-genai-gateway.yml
# once.
#
# Globals read: INVENTORY_PATH, cluster_url, cert_file, key_file,
#               deploy_genai_gateway, model_name_list,
#               genai_gateway_trace_chart_version, kubernetes_platform,
#               airgap_enabled, jfrog_url, jfrog_username, jfrog_password,
#               vault_pass_file
# Returns:      exit status of ansible-playbook
run_genai_gateway_playbook() {
    echo "Deploying GenAI Gateway Service..."
    echo "************************************"

    # "${extra_vars[*]}" joins on the first character of IFS; pin it to a
    # space so the key=value pairs are always space-separated.
    local IFS=' '
    local -a extra_vars=(
        "secret_name=${cluster_url}"
        "cert_file=${cert_file}"
        "key_file=${key_file}"
        "deploy_genai_gateway=${deploy_genai_gateway}"
        # Spaces in the model list become commas so it stays one value.
        "model_name_list='${model_name_list//\ /,}'"
        "genai_gateway_trace_chart_version=${genai_gateway_trace_chart_version}"
        "kubernetes_platform=${kubernetes_platform}"
        "airgap_enabled=${airgap_enabled}"
        "jfrog_url=${jfrog_url}"
        "jfrog_username=${jfrog_username}"
        "jfrog_password=${jfrog_password}"
    )

    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-genai-gateway.yml \
        --extra-vars "${extra_vars[*]}" \
        --vault-password-file "$vault_pass_file"
}
2 changes: 1 addition & 1 deletion core/lib/components/ingress-controller.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@

# Deploy the Ingress NGINX controller by running
# playbooks/deploy-ingress-controller.yml once.
#
# Globals read: INVENTORY_PATH, cluster_url, cert_file, key_file,
#               ingress_controller, airgap_enabled, jfrog_url, jfrog_username,
#               jfrog_password
# Returns:      exit status of ansible-playbook
run_ingress_nginx_playbook() {
    echo "Deploying the Ingress NGINX Controller..."
    # TLS settings and the airgap/JFrog settings travel in one extra-vars string.
    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-ingress-controller.yml \
        --extra-vars "secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} ingress_controller=${ingress_controller} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
}
6 changes: 4 additions & 2 deletions core/lib/components/intel-base-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@

run_deploy_habana_ai_operator_playbook() {
echo "Running the deploy-habana-ai-operator.yml playbook to deploy the habana-ai-operator..."
ansible-galaxy collection install community.kubernetes
if [[ "$airgap_enabled" != "yes" ]]; then
ansible-galaxy collection install kubernetes.core
fi
if [[ "$gaudi_platform" == "gaudi2" ]]; then
gaudi_operator="$gaudi2_operator"
elif [[ "$gaudi_platform" == "gaudi3" ]]; then
gaudi_operator="$gaudi3_operator"
else
gaudi_operator=""
fi
ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root playbooks/deploy-habana-ai-operator.yml --extra-vars "gaudi_operator=${gaudi_operator}"
ansible-playbook -i "${INVENTORY_PATH}" --become --become-user=root playbooks/deploy-habana-ai-operator.yml --extra-vars "gaudi_operator=${gaudi_operator} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
if [ $? -eq 0 ]; then
echo "The deploy-habana-ai-operator.yml playbook ran successfully."
else
Expand Down
6 changes: 3 additions & 3 deletions core/lib/components/keycloak-controller.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@

# Deploy Keycloak by running playbooks/deploy-keycloak-controller.yml once.
#
# Calls install_ansible_collection (defined outside this function) first.
#
# Globals read: INVENTORY_PATH, airgap_enabled, jfrog_url, jfrog_username,
#               jfrog_password
# Returns:      exit status of ansible-playbook
run_keycloak_playbook() {
    echo "Deploying Keycloak using Ansible playbook..."
    install_ansible_collection
    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-controller.yml \
        --extra-vars "airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"
}

# Run playbooks/deploy-keycloak-tls-cert.yml once, with a single --extra-vars
# argument.
#
# Globals read: INVENTORY_PATH, kubernetes_platform, cluster_url, cert_file,
#               key_file, keycloak_admin_user, keycloak_admin_password,
#               keycloak_client_id, hugging_face_token, model_name_list,
#               deploy_keycloak, deploy_apisix, keycloak_chart_version,
#               airgap_enabled, jfrog_url, jfrog_username, jfrog_password
# Returns:      exit status of ansible-playbook
create_keycloak_tls_secret_playbook() {
    echo "Deploying Keycloak TLS secret playbook..."
    echo "************************************"

    # "${extra_vars[*]}" joins on the first character of IFS; pin it to a
    # space so the key=value pairs are always space-separated.
    local IFS=' '
    local -a extra_vars=(
        "kubernetes_platform=${kubernetes_platform}"
        "secret_name=${cluster_url}"
        "cert_file=${cert_file}"
        "key_file=${key_file}"
        "keycloak_admin_user=${keycloak_admin_user}"
        "keycloak_admin_password=${keycloak_admin_password}"
        "keycloak_client_id=${keycloak_client_id}"
        "hugging_face_token=${hugging_face_token}"
        # Spaces in the model list become commas so it stays one value.
        "model_name_list='${model_name_list//\ /,}'"
        "deploy_keycloak=${deploy_keycloak}"
        "deploy_apisix=${deploy_apisix}"
        "keycloak_chart_version=${keycloak_chart_version}"
        "airgap_enabled=${airgap_enabled}"
        "jfrog_url=${jfrog_url}"
        "jfrog_username=${jfrog_username}"
        "jfrog_password=${jfrog_password}"
    )

    ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-keycloak-tls-cert.yml \
        --extra-vars "${extra_vars[*]}"
}


2 changes: 1 addition & 1 deletion core/lib/components/observability-controller.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ deploy_observability_playbook() {
playbook_path="playbooks/deploy-observability-openshift.yml"
fi

local extra_vars="secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_observability=${deploy_observability} deploy_logging=${deploy_logging} observability_stack_chart_version=${observability_stack_chart_version} kubernetes_platform=${kubernetes_platform}"
local extra_vars="secret_name=${cluster_url} cert_file=${cert_file} key_file=${key_file} deploy_observability=${deploy_observability} deploy_logging=${deploy_logging} observability_stack_chart_version=${observability_stack_chart_version} kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}"

ansible-playbook -i "${INVENTORY_PATH}" "$playbook_path" --become --become-user=root --extra-vars "$extra_vars" --tags "$tags" --vault-password-file "$vault_pass_file"
}
4 changes: 2 additions & 2 deletions core/lib/components/service-mesh/install-istio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ deploy_istio_playbook() {
# Expect kubernetes_platform to be set globally (brownfield or fresh install path)
if [ "$(echo "${kubernetes_platform:-vanilla}" | tr '[:upper:]' '[:lower:]')" = "openshift" ]; then
echo "Detected OpenShift platform. Using OpenShift Service Mesh playbook."
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio-openshift.yml --extra-vars "kubernetes_platform=${kubernetes_platform}" || return 1
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio-openshift.yml --extra-vars "kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" || return 1
else
echo "Using vanilla/helm-based Istio playbook for platform: ${kubernetes_platform}"
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio.yml --extra-vars "kubernetes_platform=${kubernetes_platform}" || return 1
ansible-playbook -i "${INVENTORY_PATH}" playbooks/deploy-istio.yml --extra-vars "kubernetes_platform=${kubernetes_platform} airgap_enabled=${airgap_enabled} jfrog_url=${jfrog_url} jfrog_username=${jfrog_username} jfrog_password=${jfrog_password}" || return 1
fi
}
Loading