diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp index 0ef828e6770b..c04cb76103c8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinNVPTX.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" #include "clang/AST/GlobalDecl.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/TargetBuiltins.h" @@ -551,21 +552,67 @@ mlir::Value CIRGenFunction::emitNVPTXBuiltinExpr(unsigned builtinId, case NVPTX::BI__nvvm_getctarank_shared_cluster: llvm_unreachable("getctarank_shared_cluster NYI"); case NVPTX::BI__nvvm_barrier_cluster_arrive: - llvm_unreachable("barrier_cluster_arrive NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cluster.arrive"), + builder.getVoidTy()) + .getResult(); case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed: - llvm_unreachable("barrier_cluster_arrive_relaxed NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cluster.arrive.relaxed"), + builder.getVoidTy()) + .getResult(); case NVPTX::BI__nvvm_barrier_cluster_wait: - llvm_unreachable("barrier_cluster_wait NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cluster.wait"), + builder.getVoidTy()) + .getResult(); case NVPTX::BI__nvvm_fence_sc_cluster: - llvm_unreachable("fence_sc_cluster NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.fence.sc.cluster"), builder.getVoidTy(), + mlir::ValueRange{}) + .getResult(); case NVPTX::BI__nvvm_bar_sync: - llvm_unreachable("bar_sync NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cta.sync.aligned.all"), + builder.getVoidTy(), + mlir::ValueRange{emitScalarExpr(expr->getArg(0))}) + .getResult(); case NVPTX::BI__syncthreads: - llvm_unreachable("syncthreads NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cta.sync.aligned.all"), + builder.getVoidTy(), + mlir::ValueRange{ + builder.getConstInt(getLoc(expr->getExprLoc()), SInt32Ty, 0)}) + .getResult(); case NVPTX::BI__nvvm_barrier_sync: - llvm_unreachable("barrier_sync NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cta.sync.all"), + builder.getVoidTy(), + mlir::ValueRange{emitScalarExpr(expr->getArg(0))}) + .getResult(); case NVPTX::BI__nvvm_barrier_sync_cnt: - llvm_unreachable("barrier_sync_cnt NYI"); + return builder + .create( + getLoc(expr->getExprLoc()), + builder.getStringAttr("nvvm.barrier.cta.sync.count"), + builder.getVoidTy(), + mlir::ValueRange{emitScalarExpr(expr->getArg(0)), + emitScalarExpr(expr->getArg(1))}) + .getResult(); default: return nullptr; } diff --git a/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu b/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu index 0f952043bcc6..d6e5ed5a2d42 100644 --- a/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu +++ b/clang/test/CIR/CodeGen/CUDA/builtin-functions.cu @@ -10,6 +10,19 @@ // RUN: %s -o %t.ll // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCHECK --input-file=%t.ll %s + +__device__ void sync() { + + // CIR: cir.llvm.intrinsic "nvvm.barrier.cta.sync.aligned.all" {{.*}} : (!s32i) + // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0) + // OGCHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0) + __nvvm_bar_sync(0); +} + __device__ void builtins() { float f1, f2; double d1, d2; @@ -59,7 +72,8 @@ __device__ void builtins() { // LLVM: call void @llvm.nvvm.membar.sys() __nvvm_membar_sys(); - // TODO-CIR: cir.llvm.intrinsic "nvvm.barrier0" - // TODO-LLVM: call void @llvm.nvvm.barrier0() - // __syncthreads(); + // CIR: cir.llvm.intrinsic "nvvm.barrier.cta.sync.aligned.all" + // LLVM: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0) + // OGCHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0) + __syncthreads(); } diff --git a/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu b/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu index 2f195d547804..0bfb65623c09 100644 --- a/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu +++ b/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu @@ -1,41 +1,41 @@ // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_70 \ // RUN: -fcuda-is-device -target-feature +ptx60 \ -// RUN: -emit-cir -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CIR %s +// RUN: -emit-cir -o %t.cir -x cuda %s +// RUN: FileCheck -check-prefix=CIR --input-file=%t.cir %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx65 \ -// RUN: -emit-cir -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CIR %s +// RUN: -emit-cir -o %t.cir -x cuda %s +// RUN: FileCheck -check-prefix=CIR --input-file=%t.cir %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx70 \ -// RUN: -emit-cir -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CIR %s +// RUN: -emit-cir -o %t.cir -x cuda %s +// RUN: FileCheck -check-prefix=CIR --input-file=%t.cir %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_70 \ // RUN: -fcuda-is-device -target-feature +ptx60 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=LLVM %s +// RUN: -emit-llvm -o %t.ll -x cuda %s +// RUN: FileCheck -check-prefix=LLVM --input-file=%t.ll %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx65 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=LLVM %s +// RUN: -emit-llvm -o %t.ll -x cuda %s +// RUN: FileCheck -check-prefix=LLVM --input-file=%t.ll %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx70 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=LLVM %s +// RUN: -emit-llvm -o %t.ll -x cuda %s +// RUN: FileCheck -check-prefix=LLVM --input-file=%t.ll %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \ // RUN: -fcuda-is-device -target-feature +ptx60 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=OGCHECK %s +// RUN: -emit-llvm -o %t_og.ll -x cuda %s +// RUN: FileCheck -check-prefix=OGCHECK --input-file=%t_og.ll %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx65 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=OGCHECK %s +// RUN: -emit-llvm -o %t_og.ll -x cuda %s +// RUN: FileCheck -check-prefix=OGCHECK --input-file=%t_og.ll %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 \ // RUN: -fcuda-is-device -target-feature +ptx70 \ -// RUN: -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=OGCHECK %s +// RUN: -emit-llvm -o %t_og.ll -x cuda %s +// RUN: FileCheck -check-prefix=OGCHECK --input-file=%t_og.ll %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -52,4 +52,14 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b, // OGCHECK: call void @llvm.nvvm.bar.warp.sync(i32 __nvvm_bar_warp_sync(mask); + // CIR: cir.llvm.intrinsic "nvvm.barrier.cta.sync.all" {{.*}} : (!u32i) + // LLVM: call void @llvm.nvvm.barrier.cta.sync.all(i32 + // OGCHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32 + __nvvm_barrier_sync(mask); + + // CIR: cir.llvm.intrinsic "nvvm.barrier.cta.sync.count" {{.*}} : (!u32i, !u32i) + // LLVM: call void @llvm.nvvm.barrier.cta.sync.count(i32 + // OGCHECK: call void @llvm.nvvm.barrier.cta.sync.count(i32 + __nvvm_barrier_sync_cnt(mask, i); + } diff --git a/clang/test/CIR/CodeGen/CUDA/builtins-sm90.cu b/clang/test/CIR/CodeGen/CUDA/builtins-sm90.cu new file mode 100644 index 000000000000..5df9569144b4 --- /dev/null +++ b/clang/test/CIR/CodeGen/CUDA/builtins-sm90.cu @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-feature +ptx80 \ +// RUN: -target-cpu sm_90 -fclangir -emit-cir -fcuda-is-device -target-sdk-version=12.3 \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-feature +ptx80 \ +// RUN: -target-cpu sm_90 -fclangir -emit-llvm -fcuda-is-device -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-feature +ptx80 \ +// RUN: -target-cpu sm_90 -fclangir -emit-llvm -fcuda-is-device -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCHECK --input-file=%t.ll %s + +// CIR-LABEL: _Z6kernelPlPvj( +// LLVM: define{{.*}} void @_Z6kernelPlPvj( +// OGCHECK: define{{.*}} void @_Z6kernelPlPvj( +__attribute__((global)) void kernel(long *out, void *ptr, unsigned u) { + // CIR: cir.llvm.intrinsic "nvvm.barrier.cluster.arrive" + // LLVM: call void @llvm.nvvm.barrier.cluster.arrive() + // OGCHECK: call void @llvm.nvvm.barrier.cluster.arrive() + __nvvm_barrier_cluster_arrive(); + + // CIR: cir.llvm.intrinsic "nvvm.barrier.cluster.arrive.relaxed" + // LLVM: call void @llvm.nvvm.barrier.cluster.arrive.relaxed() + // OGCHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed() + + __nvvm_barrier_cluster_arrive_relaxed(); + // CIR: cir.llvm.intrinsic "nvvm.barrier.cluster.wait" + // LLVM: call void @llvm.nvvm.barrier.cluster.wait() + // OGCHECK: call void @llvm.nvvm.barrier.cluster.wait() + __nvvm_barrier_cluster_wait(); + + // CIR: cir.llvm.intrinsic "nvvm.fence.sc.cluster" + // LLVM: call void @llvm.nvvm.fence.sc.cluster() + // OGCHECK: call void @llvm.nvvm.fence.sc.cluster() + __nvvm_fence_sc_cluster(); + + // CIR: cir.return + // LLVM: ret void + // OGCHECK: ret void +}