Merged
6 changes: 6 additions & 0 deletions .env.example
@@ -54,6 +54,12 @@ TARGET_SPEEDUP=2.0
# Target device
XPU_DEVICE=xpu

# Path to SYCL TLA (XeTLA) installation directory
SYCL_TLA_DIR=

# Path to MKL include directory
MKL_INCLUDE=/swtools/intel/mkl/latest/include


# ============================================================================
# Logging Configuration
18 changes: 18 additions & 0 deletions .gitignore
@@ -0,0 +1,18 @@
__pycache__/
*.pyc
*.egg-info/
.env
.env*.local
uv.lock

# Build artifacts
sycl_build/
/build/
/dist/

# SLURM output
*.out

# IDE
.vscode/
.idea/
13 changes: 8 additions & 5 deletions README.md
@@ -34,10 +34,10 @@ FlashAttention benchmark optimized across diverse shapes including skinny, non-s
### Prerequisites

- Python 3.11+
- Intel XPU with drivers and runtime installed
- PyTorch 2.9.1+ with XPU support
- Triton 3.0+ with Intel XPU backend
- Access to an OpenAI-compatible LLM API
- One of the following backends:
- **Intel XPU**: Intel GPU with drivers and runtime installed
- **NVIDIA CUDA**: NVIDIA GPU with CUDA toolkit installed

### Install with uv (recommended)

@@ -46,8 +46,11 @@ FlashAttention benchmark optimized across diverse shapes including skinny, non-s
git clone https://github.com/IntelLabs/Xe-Forge
cd Xe-Forge

# Install with uv
uv sync
# Install for Intel XPU
uv sync --extra intel

# Install for NVIDIA CUDA
uv sync --extra nvidia
```
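After installation, a quick sanity check confirms which backend PyTorch will target. This is a minimal sketch, not part of the Xe-Forge API: it assumes a PyTorch build with XPU or CUDA support and falls back to CPU otherwise.

```python
# Backend sanity check: picks the first available device and runs a
# trivial tensor op on it. Falls back to CPU so it runs anywhere.
import torch

if hasattr(torch, "xpu") and torch.xpu.is_available():
    device = "xpu"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Selected device: {device}")
x = torch.ones(4, 4, device=device)
print(x.sum().item())  # 16.0 on any backend
```

If this prints `cpu` on a machine with an Intel GPU, the XPU drivers or the `--extra intel` dependencies are likely missing.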

### Environment Setup
File renamed without changes.
101 changes: 101 additions & 0 deletions knowledge_base/gluon/xpu/gluon_xpu_patterns.yaml
@@ -0,0 +1,101 @@
constraints:
  - id: gluon_tile_size_alignment
    name: Tile sizes must be aligned to hardware subgroup size
    severity: critical
    description: |
      Gluon kernels on Intel XPU require tile dimensions aligned to the
      subgroup (SIMD) width. For Xe-cores this is typically 16 or 32.
      Misaligned tiles cause silent performance degradation.

  - id: gluon_dtype_accumulator
    name: Use float32 accumulator for bf16/fp16 matmuls
    severity: critical
    description: |
      When performing matrix multiplications with bfloat16 or float16 inputs
      on Intel XPU, always accumulate in float32 to avoid numerical overflow.
      The DPAS units natively support mixed-precision accumulation.

  - id: gluon_avoid_host_sync
    name: Avoid unnecessary host-device synchronization
    severity: warning
    description: |
      Calling .cpu() or .item() inside kernel loops forces synchronization
      between host and device, destroying pipeline parallelism. Keep all
      intermediate results on XPU.

patterns:
  - id: gluon_block_tiling
    name: Apply block tiling for GEMM
    stage: device_specific
    description: Use hardware-aligned block tiling for matrix operations
    rationale: |
      Intel XPU Xe-cores achieve peak throughput with 256x256 workgroup tiles
      and a 32-deep K dimension to match DPAS systolic array depth.
    pattern_before: |
      # Naive matmul without tiling
      C = torch.matmul(A, B)
    pattern_after: |
      # Gluon tiled matmul with XPU-optimal tile shape
      # TileShape: M=256, N=256, K=32 for Xe-core DPAS
      C = gluon.matmul(A, B, tile_m=256, tile_n=256, tile_k=32)
    expected_speedup: "2-5x over naive torch.matmul for large matrices"

  - id: gluon_prefetch_pipeline
    name: Enable prefetching for memory-bound kernels
    stage: memory_access
    description: Use software prefetching to hide memory latency
    rationale: |
      Intel XPU benefits heavily from prefetching future K-blocks of the A and
      B matrices. This overlaps compute with memory transfers across pipeline
      stages.
    pattern_before: |
      # Single-stage load without prefetch
      for k in range(K // BLOCK_K):
          a_tile = load(A, k)
          b_tile = load(B, k)
          acc += dpas(a_tile, b_tile)
    pattern_after: |
      # Two-stage pipelined load with prefetch
      num_blocks = K // BLOCK_K
      prefetch(A, k=0)
      prefetch(B, k=0)
      for k in range(num_blocks):
          a_tile = load(A, k)
          b_tile = load(B, k)
          if k + 1 < num_blocks:  # do not prefetch past the last block
              prefetch(A, k + 1)
              prefetch(B, k + 1)
          acc += dpas(a_tile, b_tile)
    expected_speedup: "1.3-2x for memory-bound kernels"

  - id: gluon_fusion_elementwise
    name: Fuse elementwise operations into GEMM epilogue
    stage: fusion
    description: Fold activation functions and scaling into the GEMM epilogue
    rationale: |
      Separate elementwise kernels after GEMM force an extra global memory
      round-trip. Fusing them into the epilogue eliminates this overhead.
    pattern_before: |
      C = gluon.matmul(A, B)
      D = torch.relu(C)
      E = D * scale
    pattern_after: |
      # Fused GEMM + ReLU + scale in single kernel
      E = gluon.matmul(A, B, epilogue=lambda c: torch.relu(c) * scale)
    expected_speedup: "1.2-1.5x for fusion-eligible patterns"

  - id: gluon_dtype_optimization
    name: Use bfloat16 inputs with float32 accumulation
    stage: dtype_fix
    description: Leverage mixed-precision DPAS for higher throughput
    rationale: |
      Intel XPU DPAS units deliver 2x throughput with bf16 inputs vs float32.
      Using bf16 inputs with fp32 accumulation maximizes compute throughput
      while maintaining numerical stability.
    pattern_before: |
      A = A.float()
      B = B.float()
      C = gluon.matmul(A, B)
    pattern_after: |
      A = A.bfloat16()
      B = B.bfloat16()
      C = gluon.matmul(A, B, acc_dtype=torch.float32)
    expected_speedup: "~2x throughput improvement"
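The dtype pattern above can be checked numerically with plain PyTorch. The `gluon.matmul(..., acc_dtype=torch.float32)` call in the YAML is specific to the knowledge base; this hedged CPU sketch only demonstrates the equivalent numerics of bf16 inputs with fp32 accumulation (upcasting bf16 operands and multiplying in fp32), not the DPAS throughput gain.

```python
# Reference numerics for the bf16-input / fp32-accumulation pattern.
# Runs on CPU for portability; on XPU the same math maps to DPAS.
import torch

A = torch.randn(128, 64).bfloat16()
B = torch.randn(64, 128).bfloat16()

# Quantize inputs to bf16 first, then accumulate the product in float32.
# This is what acc_dtype=torch.float32 expresses in the pattern above.
C = torch.matmul(A.float(), B.float())

assert C.dtype == torch.float32
print(C.shape)  # torch.Size([128, 128])
```

Keeping the accumulator in fp32 is what preserves the numerical stability the `gluon_dtype_accumulator` constraint requires; only the inputs are narrowed to bf16.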