Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion config/multi_node/shampoo_opt_multi_node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ fsdp:
forward_prefetch: true
sync_module_states: true
param_init_device: meta
# RCCL warmup settings to avoid race conditions during FSDP init
rccl_warmup_iterations: 5
skip_rccl_warmup: false
# Training warmup: forward/backward/optimizer steps before main loop
training_warmup_steps: 1
skip_training_warmup: false

distributed:
backend: nccl
Expand Down Expand Up @@ -94,7 +100,7 @@ dataloader:
pin_memory: true

profiling:
enabled: true
enabled: false
wait: 2
warmup: 2
active: 6
Expand Down
57 changes: 37 additions & 20 deletions scripts/multi_node/local_launch.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/bin/bash
# Multi-node local launch script for GEMM training
# Runs on each node with single channel/thread configuration
#
# NCCL/RCCL environment variables are sourced from set_env_variables.sh
# Edit that file to change NCCL configuration - no need to modify this script.

if [[ $# -lt 11 ]]; then
echo "Usage: $0 <NODE_RANK> <NODE_IP> <MASTER_IP> <MASTER_PORT> <NNODES> <WORLD_SIZE> <EXPERIMENT_DIR> <CONFIG_FILE> <NPROC_PER_NODE> <CHANNELS> <THREADS> [ENABLE_ROCPROF] [ROCPROF_STATS] [ROCPROF_INPUT] [DOCKER_CONTAINER]"
Expand All @@ -23,6 +26,20 @@ ROCPROF_STATS="${13:-false}"
ROCPROF_INPUT="${14:-}"
DOCKER_CONTAINER="${15:-training-overlap-bugs-rocm70_9-1}"

# Load the shared NCCL/RCCL settings that live next to this script.
# They should already have been sourced by config_node.sh; sourcing them again
# here ensures they are loaded in this shell as well.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/set_env_variables.sh" ]]; then
    # shellcheck source=/dev/null
    source "$SCRIPT_DIR/set_env_variables.sh"
else
    # Do not abort (the settings may already be present in the environment),
    # but make the situation visible: this script later calls
    # build_docker_env_flags, which is defined in set_env_variables.sh.
    echo "WARNING: $SCRIPT_DIR/set_env_variables.sh not found; NCCL/RCCL settings were not (re)loaded" >&2
fi

# Override channel/thread settings from command line arguments.
# These assignments come after the source above on purpose, so the CLI values
# win over the defaults exported by set_env_variables.sh.
export NCCL_MAX_NCHANNELS="${CHANNELS}"
export RCCL_THREADS_PER_BLOCK="${THREADS}"

# Set AMD_LOG_LEVEL_FILE to experiment directory (will be converted to Docker path later)
# This ensures AMD logs go to the experiment folder instead of current directory
export AMD_LOG_LEVEL_FILE="${EXPERIMENT_DIR}/${THREADS}thread_${CHANNELS}channels/trace_amd_node${NODE_RANK}.log"

echo "=========================================="
echo "Local Launch Configuration"
echo "=========================================="
Expand All @@ -37,7 +54,6 @@ echo "Experiment Dir: $EXPERIMENT_DIR"
echo "Config File: $CONFIG_FILE"
echo "Channels: $CHANNELS"
echo "Threads: $THREADS"
echo "Docker Container: $DOCKER_CONTAINER"
echo "rocprof enabled: $ENABLE_ROCPROF"
echo "=========================================="
echo ""
Expand All @@ -60,25 +76,25 @@ else
CONFIG_FILE_DOCKER="$CONFIG_FILE"
fi

# Log file
LOG_FILE="${OUTPUT_DIR}/node_${NODE_RANK}_output.log"
# Convert AMD_LOG_LEVEL_FILE to Docker path
export AMD_LOG_LEVEL_FILE=$(echo "$AMD_LOG_LEVEL_FILE" | sed "s|^${AORTA_ROOT_FROM_EXP}|/workspace/aorta|")

# Function to log with timestamp
log() {
local message="$1"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
echo "[${timestamp}] [Node ${NODE_RANK}] ${message}" | tee -a "${LOG_FILE}"
echo "[${timestamp}] [Node ${NODE_RANK}] ${message}"
}

# Cleanup function
cleanup() {
echo ""
echo "=== Caught interrupt signal ===" | tee -a "${LOG_FILE}"
echo "=== Caught interrupt signal ==="
log "Cleaning up training processes on node ${NODE_RANK}..."

# Try to kill processes inside Docker container
docker exec "$DOCKER_CONTAINER" pkill -9 -f "train.py" 2>/dev/null || true
docker exec "$DOCKER_CONTAINER" pkill -9 -f "torchrun" 2>/dev/null || true
docker exec "${DOCKER_CONTAINER}" pkill -9 -f "train.py" 2>/dev/null || true
docker exec "${DOCKER_CONTAINER}" pkill -9 -f "torchrun" 2>/dev/null || true

# Also try on host (in case anything leaked)
sudo pkill -9 -f "train.py" 2>/dev/null || true
Expand Down Expand Up @@ -109,14 +125,18 @@ BASE_CMD="torchrun --nnodes ${NNODES} --node_rank ${NODE_RANK} --nproc_per_node
BASE_OVERRIDES="--override profiling.tensorboard=false"

# Build docker exec prefix with environment variables
DOCKER_EXEC="docker exec \
-e RCCL_THREADS_PER_BLOCK=${THREADS} \
-e NCCL_MAX_NCHANNELS=${CHANNELS} \
-e HSA_ENABLE_SDMA=0 \
-e PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1 \
${DOCKER_CONTAINER}"
# All NCCL/RCCL variables are defined in set_env_variables.sh
# build_docker_env_flags (also defined in set_env_variables.sh) echoes a single
# string holding one " -e NAME=value" pair per entry of DOCKER_ENV_VARS.
DOCKER_ENV_FLAGS=$(build_docker_env_flags)
# DOCKER_EXEC is a flat command string. It is expanded unquoted where the
# training command is launched (${DOCKER_EXEC} bash -c "..."), so the shell
# word-splits it there; a variable value containing whitespace would be split
# into several arguments.
DOCKER_EXEC="docker exec ${DOCKER_ENV_FLAGS} ${DOCKER_CONTAINER}"

# Log which env vars are being passed
log "Docker environment variables:"
for var in "${DOCKER_ENV_VARS[@]}"; do
# ${!var} is indirect expansion: the value of the variable whose name is in $var
log " ${var}=${!var}"
done

# Run with or without rocprofv3
# Note: Output is already captured by master_launch.sh's redirection, no need for tee
if [ "${ENABLE_ROCPROF}" = "true" ]; then
ROCPROF_DIR="${OUTPUT_DIR}/rocprof_traces/node_${NODE_RANK}"
mkdir -p "${ROCPROF_DIR}"
Expand All @@ -125,8 +145,7 @@ if [ "${ENABLE_ROCPROF}" = "true" ]; then
log "Using rocprofv3 input file: ${ROCPROF_INPUT}"
${DOCKER_EXEC} bash -c "rocprofv3 -i ${ROCPROF_INPUT} -d ${ROCPROF_DIR} -- \
${BASE_CMD} ${BASE_OVERRIDES} \
--override training.output_dir=${OUTPUT_DIR_DOCKER}" \
2>&1 | tee -a "${LOG_FILE}"
--override training.output_dir=${OUTPUT_DIR_DOCKER}" 2>&1
else
ROCPROF_ARGS="--kernel-trace"
if [ "${ROCPROF_STATS}" = "true" ]; then
Expand All @@ -136,18 +155,16 @@ if [ "${ENABLE_ROCPROF}" = "true" ]; then
log "Running with rocprofv3 kernel tracing inside Docker"
${DOCKER_EXEC} bash -c "rocprofv3 ${ROCPROF_ARGS} -d ${ROCPROF_DIR} -- \
${BASE_CMD} ${BASE_OVERRIDES} \
--override training.output_dir=${OUTPUT_DIR_DOCKER}" \
2>&1 | tee -a "${LOG_FILE}"
--override training.output_dir=${OUTPUT_DIR_DOCKER}" 2>&1
fi
else
log "Running inside Docker container"
log "Command: ${BASE_CMD} ${BASE_OVERRIDES} --override training.output_dir=${OUTPUT_DIR_DOCKER}"
${DOCKER_EXEC} bash -c "${BASE_CMD} ${BASE_OVERRIDES} \
--override training.output_dir=${OUTPUT_DIR_DOCKER}" \
2>&1 | tee -a "${LOG_FILE}"
--override training.output_dir=${OUTPUT_DIR_DOCKER}" 2>&1
Comment on lines 146 to +164

Copilot AI Feb 4, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The expansion of DOCKER_EXEC here is vulnerable to shell command injection because it is built from untrusted inputs (the DOCKER_CONTAINER CLI argument and environment-derived values in DOCKER_ENV_FLAGS) and then expanded unquoted in a command context. An attacker who can influence DOCKER_CONTAINER or one of the environment variables (e.g., by including characters like ;, && or $(...)) can break out of the intended docker exec invocation and execute arbitrary additional commands on the host. To mitigate this, avoid constructing a full command string in DOCKER_EXEC and instead pass docker exec and its arguments as separate, properly quoted words (or an array), and ensure DOCKER_CONTAINER is strictly validated/whitelisted to be a simple container name.

Copilot uses AI. Check for mistakes.
fi

EXIT_CODE=${PIPESTATUS[0]}
EXIT_CODE=$?
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

Expand Down
137 changes: 114 additions & 23 deletions scripts/multi_node/set_env_variables.sh
Original file line number Diff line number Diff line change
@@ -1,42 +1,133 @@
#!/bin/bash
# =============================================================================
# Global NCCL/RCCL environment variables for multi-node training
# Based on DLRM_set_env_variables.sh
# Configured for MI350X cluster
#
# This file is the SINGLE SOURCE OF TRUTH for all NCCL/RCCL configuration.
# Edit variables here - local_launch.sh will automatically pick them up.
#
# NOTE: When adding a new environment variable, you MUST also add its name
# to the DOCKER_ENV_VARS array below, otherwise it won't be passed
# to the Docker container.
# =============================================================================

# NCCL Debug Settings (use INFO for debugging network issues)
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=INIT,NET
# Try disabling IB if InfiniBand is not properly configured
export NCCL_IB_DISABLE=1
# -----------------------------------------------------------------------------
# NCCL Debug Settings
# -----------------------------------------------------------------------------
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS= # Options: COLL,INIT,NET (empty = none)

Copilot AI Feb 4, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting NCCL_DEBUG_SUBSYS to an empty string may cause issues depending on how NCCL interprets empty environment variables. Some versions of NCCL may treat an empty value differently than an unset variable. Consider either unsetting the variable (unset NCCL_DEBUG_SUBSYS) when not needed, or omitting the export statement entirely when the value is empty, rather than exporting an empty string.

Suggested change
export NCCL_DEBUG_SUBSYS= # Options: COLL,INIT,NET (empty = none)
unset NCCL_DEBUG_SUBSYS # Options: COLL,INIT,NET (empty/unset = none)

Copilot uses AI. Check for mistakes.

# IB/RNIC Configuration (commented out when IB is disabled)
# export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
# export NCCL_IB_GID_INDEX=3
# -----------------------------------------------------------------------------
# RCCL-Specific Settings (ROCm)
# -----------------------------------------------------------------------------
export RCCL_DIRECT_ALLGATHER_DISABLE=1 # Disable direct allgather
export RCCL_MSCCL_ENABLE=0 # Disable MSCCL
export RCCL_THREADS_PER_BLOCK=256 # Threads per block (override via --threads)

# -----------------------------------------------------------------------------
# IB/RNIC Configuration for MI350X
# -----------------------------------------------------------------------------
export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
export NCCL_IB_GID_INDEX=3
export NCCL_NCHANNELS_PER_NET_PEER=8

# -----------------------------------------------------------------------------
# HSA Settings for ROCm
# -----------------------------------------------------------------------------
export HSA_ENABLE_IPC_MODE_LEGACY=1
export HSA_ENABLE_SDMA=0 # Disable SDMA for stability

# NCCL Protocol
# -----------------------------------------------------------------------------
# NCCL Protocol and Channels
# -----------------------------------------------------------------------------
export NCCL_PROTO=Simple
#export NCCL_MIN_NCHANNELS=40
export NCCL_MAX_NCHANNELS=56 # Override via --channels

# Channel Configuration (can be overridden by sweep parameters)
export NCCL_MIN_NCHANNELS=40
export NCCL_MAX_NCHANNELS=40

# Network Interface
# Change this to match your network interface: eth0, ib0, enp49s0f0np0, etc.
# Temporarily commented out for auto-detection:
# export NCCL_SOCKET_IFNAME=enp193s0f0
# -----------------------------------------------------------------------------
# Network Interface for MI350X cluster
# -----------------------------------------------------------------------------
export NCCL_SOCKET_IFNAME=enp49s0f0np0,fenic0

# -----------------------------------------------------------------------------
# Timeout and Error Handling
# -----------------------------------------------------------------------------
export NCCL_TIMEOUT_MS=12000 # 12 second timeout (legacy, not used by PyTorch)
export NCCL_TIMEOUT=100 # 100 second timeout - first backward can be slow due to JIT/init
export TORCH_DIST_INIT_TIMEOUT=150 # NOTE(review): 150 does not match NCCL_TIMEOUT=100 above (units presumably seconds) - confirm which variable the training code actually reads

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeouts NCCL_TIMEOUT and TORCH_DIST_INIT_TIMEOUT are set to different values. Is there a reason we have different values and different environment variables for the same thing? Which one is actually effective in our code?

export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export TORCH_NCCL_TRACE_BUFFER_SIZE=10000
export TORCH_NCCL_DUMP_ON_TIMEOUT=1 # Critical for hang debugging
#export AMD_LOG_LEVEL=5
# AMD_LOG_LEVEL_FILE is set dynamically in local_launch.sh to point to experiment directory
# Default fallback (will be overridden):
#export AMD_LOG_LEVEL_FILE=trace_amd.log
# -----------------------------------------------------------------------------
# PyTorch ROCm Profiler
# -----------------------------------------------------------------------------
export PYTORCH_ROCM_PROFILER_ENABLE_TRACING=1

# Optional: Force non-overlap for debugging
# -----------------------------------------------------------------------------
# List of environment variables to pass to Docker container
# Add/remove variables here to control what gets passed through
# -----------------------------------------------------------------------------
DOCKER_ENV_VARS=(
# NCCL Debug
NCCL_DEBUG
NCCL_DEBUG_SUBSYS
# RCCL
RCCL_DIRECT_ALLGATHER_DISABLE
RCCL_MSCCL_ENABLE
RCCL_THREADS_PER_BLOCK
# IB/RNIC
NCCL_IB_HCA
NCCL_IB_GID_INDEX
NCCL_NCHANNELS_PER_NET_PEER
# HSA
HSA_ENABLE_IPC_MODE_LEGACY
HSA_ENABLE_SDMA
# Protocol/Channels
NCCL_PROTO
NCCL_MIN_NCHANNELS

Copilot AI Feb 4, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DOCKER_ENV_VARS array includes NCCL_MIN_NCHANNELS (line 91), but the actual export statement for NCCL_MIN_NCHANNELS is commented out (line 44). This means the build_docker_env_flags function will pass an undefined or empty value for NCCL_MIN_NCHANNELS to the Docker container. Either uncomment the export on line 44 or remove NCCL_MIN_NCHANNELS from the DOCKER_ENV_VARS array.

Suggested change
NCCL_MIN_NCHANNELS

Copilot uses AI. Check for mistakes.
NCCL_MAX_NCHANNELS
# Network
NCCL_SOCKET_IFNAME
# Timeout/Error Handling
NCCL_TIMEOUT_MS
NCCL_TIMEOUT
TORCH_DIST_INIT_TIMEOUT
TORCH_NCCL_ASYNC_ERROR_HANDLING
TORCH_NCCL_TRACE_BUFFER_SIZE
TORCH_NCCL_DUMP_ON_TIMEOUT
# AMD Logging
AMD_LOG_LEVEL
AMD_LOG_LEVEL_FILE
# Profiler
PYTORCH_ROCM_PROFILER_ENABLE_TRACING
)
export DOCKER_ENV_VARS

# -----------------------------------------------------------------------------
# Helper function: Build docker -e flags from DOCKER_ENV_VARS
# Usage: DOCKER_ENV_FLAGS=$(build_docker_env_flags)
# -----------------------------------------------------------------------------
build_docker_env_flags() {
local flags=""
for var in "${DOCKER_ENV_VARS[@]}"; do
local value="${!var}"
flags+=" -e ${var}=${value}"

Copilot AI Feb 4, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The build_docker_env_flags function does not quote the environment variable values when building the flags string. If any environment variable contains spaces or special characters (e.g., NCCL_SOCKET_IFNAME=enp49s0f0np0,fenic0), this could cause issues with shell word splitting when the flags are used in the docker exec command. Consider quoting the values: flags+=" -e ${var}="${value}""

Suggested change
flags+=" -e ${var}=${value}"
flags+=" -e ${var}=$(printf '%q' "$value")"

Copilot uses AI. Check for mistakes.
done
echo "$flags"
}
export -f build_docker_env_flags

# =============================================================================
# Optional settings (uncomment to enable)
# =============================================================================

# Force non-overlap for debugging (single HW queue)
# export GPU_MAX_HW_QUEUES=1
# unset TORCH_NCCL_HIGH_PRIORITY

# Optional: Disable SDMA for testing
# export HSA_ENABLE_SDMA=0

# Optional: Disable IB for Ethernet-only testing
# Disable IB for Ethernet-only testing
# export NCCL_IB_DISABLE=1
Loading